Update app.py
app.py
CHANGED
@@ -1,6 +1,7 @@
 import torch
 import spaces
 import gradio as gr
+import flash_attn
 from threading import Thread
 from transformers import (
     AutoModelForCausalLM,
@@ -11,7 +12,7 @@ from transformers import (
     StoppingCriteriaList
 )
 
-MODEL_ID = "
+MODEL_ID = "NousResearch/DeepHermes-3-Llama-3-8B-Preview"
 
 DEFAULT_SYSTEM_PROMPT = """
 
@@ -61,7 +62,8 @@ def initialize_model():
         device_map="cuda",
         # quantization_config=quantization_config,
         torch_dtype=torch.bfloat16,
-        trust_remote_code=True
+        trust_remote_code=True,
+        attn_implementation="flash_attention_2"
     )
     model.to("cuda")
     model.eval()  # set evaluation mode to disable gradients and speed up inference
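For context, a minimal sketch of what the updated initialize_model() plausibly looks like after this commit. The from_pretrained keyword arguments mirror the diff exactly; the tokenizer handling, the return values, and the function body outside the shown hunk are assumptions, since they are not part of the diff.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "NousResearch/DeepHermes-3-Llama-3-8B-Preview"

def initialize_model():
    # Tokenizer loading is assumed; it is not shown in the diff.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="cuda",
        # quantization_config=quantization_config,  # left disabled, as in the diff
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        attn_implementation="flash_attention_2",  # requires the flash-attn package
    )
    model.to("cuda")
    model.eval()  # disable gradient tracking for faster inference
    return model, tokenizer

Note that attn_implementation="flash_attention_2" is what actually switches transformers to the FlashAttention-2 kernels; the top-level import flash_attn presumably just makes the Space fail fast at startup if the flash-attn wheel is not installed, since the module is never referenced directly in the shown code.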