Commit cd11c8a
1 Parent(s): 0207d09
Update app.py
app.py CHANGED

@@ -154,7 +154,7 @@ from llama_cpp import Llama
 # set GPU_LAYERS to 15 if you have a 8GB GPU so both models can fit in
 # else 35 full layers + XTTS works fine on T4 16GB
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
-GPU_LAYERS=int(os.environ.get("GPU_LAYERS",
+GPU_LAYERS=int(os.environ.get("GPU_LAYERS",35))
 
 LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>"]
 
@@ -165,10 +165,10 @@ llm_mistral = InferenceClient("mistralai/Mistral-7B-Instruct-v0.1")
 
 
 print("Running LLM Zephyr")
-llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=round(GPU_LAYERS/2),max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 
 print("Running Yi LLM")
-llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
+llm_yi = Llama(model_path=yi_model_path,n_gpu_layers=round(GPU_LAYERS/2),max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
 
 
 # Mistral formatter
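For context, a minimal runnable sketch of the pattern this commit introduces: splitting one GPU-layer budget across two llama.cpp models so both fit on a single card alongside XTTS. It assumes llama-cpp-python; the GGUF paths are placeholders (zephyr_model_path, yi_model_path, and LLAMA_VERBOSE are defined elsewhere in app.py, outside this diff), and only the memory-relevant constructor arguments are kept.

import os
from llama_cpp import Llama

# Total GPU layer budget, overridable via the GPU_LAYERS env var.
# The default of 35 targets a T4 16GB; the file's comments suggest 15 on 8GB.
GPU_LAYERS = int(os.environ.get("GPU_LAYERS", 35))

# The commit's core change: each LLM offloads only half the budget,
# so two ~5GB models plus the ~4GB XTTS can share one GPU.
half_layers = round(GPU_LAYERS / 2)

# Placeholder model paths, not the app's real files.
llm_zephyr = Llama(model_path="zephyr.gguf", n_gpu_layers=half_layers,
                   n_ctx=4096, n_batch=128, verbose=False)
llm_yi = Llama(model_path="yi.gguf", n_gpu_layers=half_layers,
               n_ctx=4096, n_batch=128, verbose=False)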