Update app.py
app.py
CHANGED
@@ -98,17 +98,35 @@ def predict_chat(message: str, history: list):
             prompt_input += f"Assistant: {msg['content']}\n"
     prompt_input += "Assistant:"
 
-    # FIXED: Use
-
-
-
-
-
-
-
-
-
-
+    # FIXED: Use the correct ctransformers method - call model() directly for streaming
+    try:
+        for token in model(
+            prompt_input,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE,
+            top_k=TOP_K,
+            top_p=TOP_P,
+            do_sample=DO_SAMPLE,
+            repetition_penalty=1.1,
+            stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
+            stream=True
+        ):
+            generated_text += token
+            yield generated_text
+    except Exception as e:
+        print(f"Error in GGUF generation: {e}")
+        # Fallback to non-streaming generation
+        output = model(
+            prompt_input,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=TEMPERATURE,
+            top_k=TOP_K,
+            top_p=TOP_P,
+            do_sample=DO_SAMPLE,
+            repetition_penalty=1.1,
+            stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
+        )
+        yield output
         generated_text += token
         yield generated_text
 
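For reference, a minimal standalone sketch of the ctransformers pattern this fix relies on: a GGUF model loaded through ctransformers can be called directly, and passing stream=True makes the call return a generator of decoded text pieces. The repo name, model file, and parameter values below are illustrative placeholders, not this Space's actual configuration, and the sketch sticks to arguments ctransformers' call interface accepts (sampling there is governed by temperature/top_k/top_p rather than a do_sample switch).

from ctransformers import AutoModelForCausalLM

# Placeholder GGUF checkpoint - swap in whichever model the Space loads.
model = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-Chat-GGUF",
    model_file="llama-2-7b-chat.Q4_K_M.gguf",
    model_type="llama",
)

prompt_input = "User: Hi there!\nAssistant:"
generated_text = ""

# Calling the model object directly with stream=True returns a generator
# that yields decoded text pieces as they are produced.
for token in model(
    prompt_input,
    max_new_tokens=256,
    temperature=0.7,
    top_k=40,
    top_p=0.95,
    repetition_penalty=1.1,
    stop=["User:", "<|endoftext|>"],
    stream=True,
):
    generated_text += token
    print(token, end="", flush=True)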
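Note that predict_chat yields the accumulated generated_text on every token, not just the newest piece. That matches how Gradio streams chat output: each yielded value replaces the message displayed so far. A hypothetical sketch of the wiring, assuming the Space uses gr.ChatInterface with dict-style history (consistent with the msg['role']/msg['content'] accesses in the prompt-building code; the rest of app.py is not shown in this diff):

import gradio as gr

def predict_chat(message: str, history: list):
    # Stand-in for the real prompt building + streaming generation above.
    reply = "Hello! How can I help?"
    partial = ""
    for ch in reply:
        partial += ch
        # Yield the full text so far; Gradio re-renders the bubble each time.
        yield partial

# type="messages" makes history a list of {"role": ..., "content": ...} dicts.
demo = gr.ChatInterface(predict_chat, type="messages")

if __name__ == "__main__":
    demo.launch()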