Daemontatox committed on
Commit
e4f0261
·
verified ·
1 Parent(s): 7bcad0d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -2
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import torch
2
  import spaces
3
  import gradio as gr
 
4
  from threading import Thread
5
  from transformers import (
6
  AutoModelForCausalLM,
@@ -11,7 +12,7 @@ from transformers import (
11
  StoppingCriteriaList
12
  )
13
 
14
- MODEL_ID = "cognitivecomputations/Dolphin3.0-R1-Mistral-24B"
15
 
16
  DEFAULT_SYSTEM_PROMPT = """
17
 
@@ -61,7 +62,8 @@ def initialize_model():
61
  device_map="cuda",
62
  # quantization_config=quantization_config,
63
  torch_dtype=torch.bfloat16,
64
- trust_remote_code=True
 
65
  )
66
  model.to("cuda")
67
  model.eval() # set evaluation mode to disable gradients and speed up inference
 
1
  import torch
2
  import spaces
3
  import gradio as gr
4
+ import flash_attn
5
  from threading import Thread
6
  from transformers import (
7
  AutoModelForCausalLM,
 
12
  StoppingCriteriaList
13
  )
14
 
15
+ MODEL_ID = "NousResearch/DeepHermes-3-Llama-3-8B-Preview"
16
 
17
  DEFAULT_SYSTEM_PROMPT = """
18
 
 
62
  device_map="cuda",
63
  # quantization_config=quantization_config,
64
  torch_dtype=torch.bfloat16,
65
+ trust_remote_code=True,
66
+ attn_implementation="flash_attention_2"
67
  )
68
  model.to("cuda")
69
  model.eval() # set evaluation mode to disable gradients and speed up inference