Update app.py
app.py CHANGED
@@ -19,6 +19,9 @@ from typing import Tuple
 #from transformers import AutoTokenizer, AutoModelForCausalLM
 import paramiko
 
+from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
+
+
 #os.system("chmod +x ./cusparselt.sh")
 #os.system("./cusparselt.sh")
 #os.system("chmod +x ./cudnn.sh")
@@ -117,6 +120,10 @@ def load_and_prepare_model(model_id):
         add_watermarker=False,
         use_safetensors=True,
     ).to(torch.bfloat16).to('cuda')
+    pipe.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
+    # Workaround: the VAE does not accept the attention shape used by Flash Attention
+    pipe.vae.enable_xformers_memory_efficient_attention(attention_op=None)
+
     pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
     if ENABLE_CPU_OFFLOAD:
         pipe.enable_model_cpu_offload()
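Taken together, the two hunks import xformers' Flash Attention op, route the pipeline's attention through it, and give the VAE a separate fallback kernel. A minimal, self-contained sketch of the same setup follows; the pipeline class and model id are assumptions for illustration (app.py may load a different model), and the ENABLE_CPU_OFFLOAD branch is omitted:

import torch
from diffusers import StableDiffusionXLPipeline, EulerAncestralDiscreteScheduler
from xformers.ops import MemoryEfficientAttentionFlashAttentionOp

# Assumed pipeline class and model id, for illustration only.
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    add_watermarker=False,
    use_safetensors=True,
).to(torch.bfloat16).to('cuda')

# Run the UNet's attention through xformers' Flash Attention kernel.
pipe.enable_xformers_memory_efficient_attention(
    attention_op=MemoryEfficientAttentionFlashAttentionOp
)
# Per the diffusers docs, the Flash Attention op cannot handle the VAE's
# attention shape, so enable the memory-efficient path on the VAE with
# attention_op=None and let xformers pick a compatible kernel.
pipe.vae.enable_xformers_memory_efficient_attention(attention_op=None)

pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

This is why the diff calls enable_xformers_memory_efficient_attention twice: once on the pipeline with the Flash Attention op, and once on pipe.vae without it, rather than enabling a single op for the whole model.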