update
app.py
CHANGED
@@ -78,12 +78,12 @@ retriever_tool = RetrieverTool(docs_processed)
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 
-
 hf_hub_download(
     repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
     filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
     local_dir="./models",
 )
+
 t5_size="base"
 hf_hub_download(
     repo_id=f"Felladrin/gguf-flan-t5-{t5_size}",
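Note that huggingface_token is read from the environment but never passed to the downloads above. A minimal sketch of how it could be wired in, assuming huggingface_hub is installed and the token is only needed for gated or private repos (hf_hub_download returns the resolved local file path):

import os
from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(
    repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
    filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
    local_dir="./models",
    token=os.getenv("HUGGINGFACE_TOKEN"),  # assumption: optional for public repos
)
print(gguf_path)  # e.g. ./models/Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf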
@@ -92,8 +92,13 @@ hf_hub_download(
 )
 
 # Set the title and description
-title = "Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
-description = """My Best CPU Rag Solution
+title = "Llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """## My Best CPU RAG Solution
+- I use a forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) server, which does not support newer models
+- For search-query generation (query reformulation) I use flan-t5-base (the large variant gives better results, but is too big for just this task)
+- Qwen2.5-0.5B performs well for its small size
+- Google's T5 series on CPU is amazing
+"""
 
 
 
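The new description is a markdown string; gr.ChatInterface renders the title and description above the chat area. A minimal sketch of that wiring, assuming gradio is installed (the echo fn is a placeholder, not the app's real handler):

import gradio as gr

preview = gr.ChatInterface(
    fn=lambda message, history: message,  # placeholder echo handler
    title=title,
    description=description,  # markdown is rendered in the UI
)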
@@ -231,9 +236,9 @@ def answer(document:str,question:str,model:str="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_
     llm_model = model
     #provider = LlamaCppPythonProvider(llm)
 
-
+
     #answer = to_answer(provider,document,question)
-    return result['choices'][0]['text']
+    #return result['choices'][0]['text']
 
 
 def respond(
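With the return commented out, answer() now returns None and generation moves into respond() below. For reference, the shape the old line assumed is llama-cpp-python's completion dict, e.g. (sketch, not the app's code):

result = llm("prompt text", max_tokens=256)  # llama_cpp.Llama.__call__ returns a completion dict
text = result['choices'][0]['text']          # the generated text lives here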
@@ -267,7 +272,15 @@ def respond(
 
     query = to_query(message)
     document = retriever_tool(query=query)
-
+    print(document)
+    answer(document,message)
+    response = ""
+    #do it directly here
+    for chunk in llm(system_message%(document,message),max_tokens=2048*4,stream=True,top_k=top_k, top_p=top_p, temperature=temperature, repeat_penalty=repeat_penalty):
+        text = chunk['choices'][0]['text']
+        #print(text, end='', flush=True)  # incremental display
+        response += text
+        yield response
 
 # Create a chat interface
 demo = gr.ChatInterface(
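The new loop streams straight from the llama-cpp-python model and yields the growing response string, which is what gr.ChatInterface expects from a generator fn; answer(document,message) is now called only for its side effects, since its return was commented out above. A self-contained sketch of the same streaming pattern, assuming the GGUF path from the download step and a stock llama_cpp.Llama:

from llama_cpp import Llama

llm = Llama(model_path="./models/Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf")
response = ""
for chunk in llm("Answer briefly: what is RAG?", max_tokens=128, stream=True):
    response += chunk['choices'][0]['text']  # each chunk carries a text fragment
print(response)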
@@ -287,7 +300,7 @@ demo = gr.ChatInterface(
             info="Select the AI model to use for chat",visible=False
         ),
         gr.Textbox(
-            value=
+            value=qwen_prompt,
             label="System Prompt",
             info="Define the AI assistant's personality and behavior",
             lines=2,visible=False
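qwen_prompt is not defined in this diff; given its use as system_message%(document,message) in respond(), it is presumably a %-style template defined elsewhere in app.py with two %s slots, along these lines (hypothetical example, not the author's actual prompt):

qwen_prompt = "Context:\n%s\n\nQuestion: %s\nAnswer:"  # hypothetical: two %s slots for document and message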