Commit 43356c3 · Parent(s): cd11c8a
Update app.py
app.py CHANGED

@@ -156,7 +156,7 @@ from llama_cpp import Llama
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
 GPU_LAYERS=int(os.environ.get("GPU_LAYERS",35))
 
-LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>"]
+LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>","[/INST]"]
 
 LLAMA_VERBOSE=False
 print("Running LLM Mistral as InferenceClient")
@@ -283,7 +283,7 @@ def generate_local(
     output = ""
     for response in stream:
         character = response.token.text
-        if
+        if character in LLM_STOP_WORDS:
             # end of context
             return
 
@@ -304,7 +304,7 @@ def generate_local(
     for response in stream:
         character= response["choices"][0]["text"]
 
-        if
+        if character in LLM_STOP_WORDS:
             # end of context
             return
 
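For context: the commit adds "[/INST]" to LLM_STOP_WORDS and makes both streaming loops return as soon as a streamed chunk equals one of those markers, so Mistral-style stop markers are not passed downstream. Below is a minimal sketch of that check in a llama-cpp-python streaming loop; the GGUF path, sampling settings, and simplified function signature are illustrative assumptions, not the Space's actual code.

```python
# Minimal sketch of the stop-word check added in this commit (assumed setup:
# a local GGUF model; the path and parameters below are placeholders).
from llama_cpp import Llama

LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>", "<EOT>", "[/INST]"]

llm = Llama(model_path="mistral-7b-instruct.Q5_K_M.gguf", n_gpu_layers=35)

def generate_local(prompt, max_tokens=256):
    output = ""
    # llama-cpp-python yields OpenAI-style chunks when stream=True.
    for response in llm(prompt, max_tokens=max_tokens, stream=True):
        character = response["choices"][0]["text"]
        if character in LLM_STOP_WORDS:
            # The model emitted a stop marker such as "[/INST]":
            # treat it as end of context and stop yielding.
            return
        output += character
        yield output
```

llama-cpp-python also accepts a stop= argument that truncates generation at such sequences; the explicit in-loop check in app.py applies the same rule to both the hosted InferenceClient stream and the local stream.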