dwb2023 committed on
Commit
0ff0123
·
verified ·
1 Parent(s): e778d3e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -2
app.py CHANGED
@@ -11,6 +11,13 @@ from transformers import (
11
  TextIteratorStreamer,
12
  LlamaTokenizer,
13
  )
 
 
 
 
 
 
 
14
 
15
  MAX_MAX_NEW_TOKENS = 1024
16
  DEFAULT_MAX_NEW_TOKENS = 50
@@ -28,8 +35,7 @@ model = AutoModelForCausalLM.from_pretrained(
28
  "microsoft/Phi-3-mini-4k-instruct",
29
  device_map="cuda",
30
  torch_dtype="auto",
31
- trust_remote_code=True,
32
- attn_implementation='eager',
33
  )
34
  tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
35
 
 
11
  TextIteratorStreamer,
12
  LlamaTokenizer,
13
  )
14
+ import subprocess
15
+
16
+ subprocess.run(
17
+ "pip install flash-attn --no-build-isolation",
18
+ env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
19
+ shell=True,
20
+ )
21
 
22
  MAX_MAX_NEW_TOKENS = 1024
23
  DEFAULT_MAX_NEW_TOKENS = 50
 
35
  "microsoft/Phi-3-mini-4k-instruct",
36
  device_map="cuda",
37
  torch_dtype="auto",
38
+ trust_remote_code=True
 
39
  )
40
  tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
41