Commit 43356c3 · Parent(s): cd11c8a
Update app.py
app.py CHANGED

@@ -156,7 +156,7 @@ from llama_cpp import Llama
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
 GPU_LAYERS=int(os.environ.get("GPU_LAYERS",35))
 
-LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>"]
+LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>","[/INST]"]
 
 LLAMA_VERBOSE=False
 print("Running LLM Mistral as InferenceClient")
@@ -283,7 +283,7 @@ def generate_local(
     output = ""
     for response in stream:
         character = response.token.text
-        if
+        if character in LLM_STOP_WORDS:
             # end of context
             return
 
@@ -304,7 +304,7 @@ def generate_local(
     for response in stream:
         character= response["choices"][0]["text"]
 
-        if
+        if character in LLM_STOP_WORDS:
             # end of context
             return
 
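For context: the commit adds "[/INST]" to LLM_STOP_WORDS and makes both streaming loops return as soon as a streamed chunk equals one of those markers, so Mistral-style stop markers are not passed downstream. Below is a minimal sketch of that check in a llama-cpp-python streaming loop; the GGUF path, sampling settings, and simplified function signature are illustrative assumptions, not the Space's actual code.

```python
# Minimal sketch of the stop-word check added in this commit (assumed setup:
# a local GGUF model; the path and parameters below are placeholders).
from llama_cpp import Llama

LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>", "<EOT>", "[/INST]"]

llm = Llama(model_path="mistral-7b-instruct.Q5_K_M.gguf", n_gpu_layers=35)

def generate_local(prompt, max_tokens=256):
    output = ""
    # llama-cpp-python yields OpenAI-style chunks when stream=True.
    for response in llm(prompt, max_tokens=max_tokens, stream=True):
        character = response["choices"][0]["text"]
        if character in LLM_STOP_WORDS:
            # The model emitted a stop marker such as "[/INST]":
            # treat it as end of context and stop yielding.
            return
        output += character
        yield output
```

llama-cpp-python also accepts a stop= argument that truncates generation at such sequences; the explicit in-loop check in app.py applies the same rule to both the hosted InferenceClient stream and the local stream.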