Update app.py
app.py CHANGED
@@ -33,14 +33,11 @@ huggingface_hub.login(token=LLama)
 MODEL_ID = "meta-llama/Llama-2-7b-hf"
 tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
 
-#
-use_flash_attention = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
-attn_implementation = "flash_attention_2" if use_flash_attention else "eager"  # Default to eager if no compatible GPU
+# Load model with default attention mechanism (no Flash Attention)
 model = LlamaForCausalLM.from_pretrained(
     MODEL_ID,
     torch_dtype=torch.bfloat16,
     device_map="auto",
-    attn_implementation=attn_implementation,
     load_in_8bit=True
 )
 
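For reference, a minimal self-contained sketch of the loading path after this change, assuming transformers, accelerate, and bitsandbytes are installed and that huggingface_hub.login has already been called with a token that has Llama 2 access, as in the surrounding app.py:

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

MODEL_ID = "meta-llama/Llama-2-7b-hf"

tokenizer = LlamaTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Default attention implementation (no Flash Attention); load_in_8bit=True
# quantizes the weights via bitsandbytes to reduce GPU memory usage.
model = LlamaForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    load_in_8bit=True,
)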