Update app.py
app.py CHANGED
@@ -22,7 +22,7 @@ from exception import CustomExceptionHandling
 cache_file = "docs_processed.joblib"
 if os.path.exists(cache_file):
     docs_processed = joblib.load(cache_file)
-    print("Loaded docs_processed from cache.")
+    #print("Loaded docs_processed from cache.")
 else:
     knowledge_base = datasets.load_dataset("m-ric/huggingface_doc", split="train")
     source_docs = [
@@ -134,7 +134,7 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
         raise ValueError("llama not initialized")
     try:
         tokens = llama.tokenize(f"{message}".encode("utf-8"))
-        print(f"text length={len(tokens)}")
+        #print(f"text length={len(tokens)}")
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
 
@@ -146,7 +146,7 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
         top_k = 40
         top_p = 0.95
         repeat_penalty = 1.2
-
+
         for i in range(iteration):
             for token in llama.generate(tokens, top_k=top_k, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
                 outputs+= llama.detokenize([token]).decode()
@@ -187,7 +187,7 @@ Search Query:
     global llama
     if llama == None:
         model_id = f"flan-t5-{t5_size}.Q8_0.gguf"
-        llama = Llama(f"models/{model_id}",flash_attn=False,
+        llama = Llama(f"models/{model_id}",flash_attn=False,verbose=False,
             n_gpu_layers=0,
             n_threads=2,
             n_threads_batch=2
@@ -258,7 +258,7 @@ def respond(
 
     query = to_query(message)
     document = retriever_tool(query=query)
-    print(document)
+    #print(document)
     answer(document,message)
     response = ""
     #do direct in here
@@ -270,9 +270,9 @@ def respond(
 
 # Create a chat interface
 # Set the title and description
-title = "
+title = "llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
 description = """
-- I use forked [
+- I use forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) which support T5 on server and it's doesn't support new models(like gemma3)
 - Search query generation(query reformulation) Tasks - I use flan-t5-base (large make better result,but too large for just this task)
 - Qwen2.5-0.5B as good as small-size.
 - anyway google T5 series on CPU is amazing
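For readers unfamiliar with the startup path the first hunk touches: docs_processed is rebuilt only when the joblib cache file is missing. Below is a minimal sketch of that cache-or-rebuild pattern; process_docs() and the placeholder document list are hypothetical stand-ins for the processing work app.py does before caching, which this diff does not show.

import os
import joblib

cache_file = "docs_processed.joblib"

def process_docs(rows):
    # Hypothetical stand-in for the splitting/embedding work done before caching.
    return [row.strip() for row in rows]

if os.path.exists(cache_file):
    # Reuse the previously processed documents, as in the diff.
    docs_processed = joblib.load(cache_file)
else:
    # In app.py the rows come from datasets.load_dataset("m-ric/huggingface_doc", split="train");
    # a small placeholder list keeps this sketch self-contained.
    source_docs = ["first document ", " second document"]
    docs_processed = process_docs(source_docs)
    joblib.dump(docs_processed, cache_file)  # persist so the next start can skip the rebuild

Because joblib serializes arbitrary picklable Python objects, the processed document list can be restored in a single load call on the next start of the Space.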
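The remaining hunks silence debug prints inside generate_t5 and add verbose=False to the Llama constructor. Pulling together the calls visible across those hunks, here is a minimal sketch of the T5 encode-then-decode flow on the forked llama-cpp-python (the t5 branch linked in the description); the temperature value, the eos stop check, and the max_new_tokens cap are assumptions for illustration, not the app's exact logic.

from llama_cpp import Llama  # the T5-capable fork referenced in the description

# Constructor arguments mirror the diff; verbose=False is the setting this commit adds.
llama = Llama(
    "models/flan-t5-base.Q8_0.gguf",  # i.e. t5_size = "base", as the description states
    flash_attn=False,
    verbose=False,
    n_gpu_layers=0,
    n_threads=2,
    n_threads_batch=2,
)

def generate_t5(llama, message, temperature=0.5, max_new_tokens=128):
    # The prompt must fit in the context window (default n_ctx=512), as app.py's comment notes.
    tokens = llama.tokenize(f"{message}".encode("utf-8"))
    llama.encode(tokens)                    # run the T5 encoder (fork-specific call)
    tokens = [llama.decoder_start_token()]  # seed the decoder
    outputs = ""
    for token in llama.generate(
        tokens, top_k=40, top_p=0.95, temp=temperature, repeat_penalty=1.2
    ):
        if token == llama.token_eos():      # assumed stop condition
            break
        outputs += llama.detokenize([token]).decode()
        max_new_tokens -= 1
        if max_new_tokens <= 0:
            break
    return outputs

print(generate_t5(llama, "Summarize: llama.cpp can now run T5 models on CPU."))

Setting verbose=False suppresses llama.cpp's model-loading and timing output, which fits with the commented-out print() calls elsewhere in the commit: both keep the Space logs quiet outside of actual errors.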