Update app.py
app.py CHANGED
@@ -82,8 +82,8 @@ def respond(
         if llama == None:
             llama = Llama("models/t5-query-reformulation-RL-q8_0.gguf",flash_attn=False,
                   n_gpu_layers=0,
-                  n_batch=
-                  n_ctx=
+                  n_batch=32,
+                  n_ctx=512,
                   n_threads=2,
                   n_threads_batch=2)
 
@@ -91,11 +91,14 @@ def respond(
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
         outputs =""
-
-
-
-
-
+        iteration = 5
+        for i in range(iteration):
+            for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
+                outputs+= llama.detokenize([token]).decode()
+                yield outputs
+                if token == llama.token_eos():
+                    break
+            outputs+="\n"
         return outputs
     except Exception as e:
         # Custom exception handling