Update app.py
Browse files
app.py
CHANGED
@@ -66,7 +66,7 @@ import multiprocessing
|
|
66 |
|
67 |
import llama_cpp
|
68 |
|
69 |
-
def
|
70 |
|
71 |
|
72 |
llama_cpp.llama_backend_init(numa=False)
|
@@ -283,6 +283,15 @@ def respond(
|
|
283 |
top_k: int,
|
284 |
repeat_penalty: float,
|
285 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
"""
|
287 |
Respond to a message using the Gemma3 model via Llama.cpp.
|
288 |
|
@@ -320,7 +329,7 @@ def respond(
|
|
320 |
llm_model = model
|
321 |
|
322 |
trans(message)
|
323 |
-
yield "done"
|
324 |
|
325 |
provider = LlamaCppPythonProvider(llm)
|
326 |
|
@@ -367,7 +376,7 @@ def respond(
|
|
367 |
outputs = ""
|
368 |
for output in stream:
|
369 |
outputs += output
|
370 |
-
yield outputs
|
371 |
|
372 |
# Handle exceptions that may occur during the process
|
373 |
except Exception as e:
|
|
|
66 |
|
67 |
import llama_cpp
|
68 |
|
69 |
+
def low_level():
|
70 |
|
71 |
|
72 |
llama_cpp.llama_backend_init(numa=False)
|
|
|
283 |
top_k: int,
|
284 |
repeat_penalty: float,
|
285 |
):
|
286 |
+
llama = Llama("models/madlad400-3b-mt-q8_0.gguf")
|
287 |
+
tokens = llama.tokenize(message.encode("utf-8"))
|
288 |
+
llama.encode(tokens)
|
289 |
+
tokens = [llama.decoder_start_token()]
|
290 |
+
for token in llama.generate(tokens, top_k=40, top_p=0.95, temp=1, repeat_penalty=1.0):
|
291 |
+
yield (llama.detokenize([token]))
|
292 |
+
if token == llama.token_eos():
|
293 |
+
break
|
294 |
+
|
295 |
"""
|
296 |
Respond to a message using the Gemma3 model via Llama.cpp.
|
297 |
|
|
|
329 |
llm_model = model
|
330 |
|
331 |
trans(message)
|
332 |
+
#yield "done"
|
333 |
|
334 |
provider = LlamaCppPythonProvider(llm)
|
335 |
|
|
|
376 |
outputs = ""
|
377 |
for output in stream:
|
378 |
outputs += output
|
379 |
+
#yield outputs
|
380 |
|
381 |
# Handle exceptions that may occur during the process
|
382 |
except Exception as e:
|