Update app.py
app.py CHANGED
@@ -28,8 +28,8 @@ os.makedirs("models",exist_ok=True)
 
 
 hf_hub_download(
-    repo_id="
-    filename="
+    repo_id="pszemraj/flan-t5-large-grammar-synthesis",
+    filename="ggml-model-Q6_K.gguf",
     local_dir="./models",
 )
 
@@ -85,11 +85,10 @@ def respond(
     try:
         global llama
         if llama == None:
-            model_id = "
+            model_id = "ggml-model-Q6_K.gguf"
             llama = Llama(f"models/{model_id}",flash_attn=False,
                 n_gpu_layers=0,
-
-                n_ctx=512,
+
                 n_threads=2,
                 n_threads_batch=2)
 
@@ -97,14 +96,14 @@ def respond(
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
         outputs =""
-        iteration =
+        iteration = 1
         for i in range(iteration):
             for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
                 outputs+= llama.detokenize([token]).decode()
                 yield outputs
                 if token == llama.token_eos():
                     break
-        outputs+="\n"
+        #outputs+="\n"
         return outputs
     except Exception as e:
         # Custom exception handling
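For context, this commit wires pszemraj/flan-t5-large-grammar-synthesis (a T5-style encoder-decoder in GGUF form) into a lazily initialized llama-cpp-python instance. The sketch below replays the same flow outside the Space: download the quantized weights, load them on CPU, encode the input, then stream tokens from the decoder. The prompt string and sampling values are illustrative assumptions, not taken from the commit; the calls themselves (encode, decoder_start_token, generate, detokenize, token_eos) mirror the ones in respond() and assume a llama-cpp-python build with encoder-decoder (T5) support.

# Minimal standalone sketch of the pipeline app.py sets up.
# Assumes a llama-cpp-python build with T5/encoder-decoder support,
# which the encode()/decoder_start_token() calls in respond() rely on.
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

hf_hub_download(
    repo_id="pszemraj/flan-t5-large-grammar-synthesis",
    filename="ggml-model-Q6_K.gguf",
    local_dir="./models",
)

llama = Llama(
    "models/ggml-model-Q6_K.gguf",
    flash_attn=False,
    n_gpu_layers=0,      # CPU-only, as in the Space
    n_threads=2,
    n_threads_batch=2,
)

# T5 is encoder-decoder: run the encoder over the prompt first, then
# generate starting from the decoder start token (same order as respond()).
prompt = "he go to school yesterday"   # illustrative input, not from the commit
llama.encode(llama.tokenize(prompt.encode()))

tokens = [llama.decoder_start_token()]
outputs = ""
for token in llama.generate(tokens, top_k=40, top_p=0.95,
                            temp=1.0, repeat_penalty=1.0):  # assumed sampling values
    if token == llama.token_eos():
        break
    outputs += llama.detokenize([token]).decode()
print(outputs)

Note that this sketch breaks before detokenizing the EOS token, while respond() detokenizes every token and then checks for EOS; either order works, the app's version just streams partial output via yield as each token arrives.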