Qwen2.5-0.5B-Rag-Thinking-Flan-T5

Sleeping

App Files Files Community

Akjava committed 22 days ago

Commit

29d67eb

verified ·

1 Parent(s): 22ce07c

Update app.py

Browse files

Files changed (1) hide show

app.py +100 -140

app.py CHANGED Viewed

@@ -12,28 +12,13 @@ import subprocess
 import sys
 import joblib
 from llama_cpp import Llama
-from llama_cpp_agent import LlamaCppAgent
-from llama_cpp_agent import MessagesFormatterType
-from llama_cpp_agent.providers import LlamaCppPythonProvider
-from llama_cpp_agent.chat_history import BasicChatHistory
-from llama_cpp_agent.chat_history.messages import Roles
 import gradio as gr
 from huggingface_hub import hf_hub_download
 from typing import List, Tuple,Dict,Optional
 from logger import logging
 from exception import CustomExceptionHandling
-from smolagents.gradio_ui import GradioUI
-from smolagents import (
-    CodeAgent,
-    GoogleSearchTool,
-    Model,
-    Tool,
-    LiteLLMModel,
-    ToolCallingAgent,
-    ChatMessage,tool,MessageRole
-)
 cache_file = "docs_processed.joblib"
 if os.path.exists(cache_file):
     docs_processed = joblib.load(cache_file)
@@ -91,24 +76,25 @@ retriever_tool = RetrieverTool(docs_processed)
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 hf_hub_download(
-    repo_id="bartowski/google_gemma-3-4b-it-GGUF",
-    filename="google_gemma-3-4b-it-Q4_K_M.gguf",
     local_dir="./models",
 )
 hf_hub_download(
-    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
-    filename="google_gemma-3-1b-it-Q5_K_M.gguf",
     local_dir="./models",
 )
 # Set the title and description
-title = "Gemma3-4B llama.cpp on cpu rag"
-description = """This is prompt version rag.\n fast and stable than [smolagent version](https://huggingface.co/spaces/Akjava/Gemma3-1B-llamacpp-cpu-rag-smolagents).but the prompt needs significant improvement."""
-llm = None
-llm_model = None
 query_system = """
@@ -140,38 +126,101 @@ Search Query: transformer model history
 def clean_text(text):
     cleaned = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII chars
     cleaned = re.sub(r'[^a-zA-Z0-9_\- ]', '', cleaned) #Then your original rule
     return cleaned
-def to_query(provider,question):
     try:
-        query_agent = LlamaCppAgent(
-                provider,
-                system_prompt=f"{query_system}",
-                predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
-                debug_output=False,
-            )
-        message="""
 Now, rewrite the following question:
 User Question: %s
 Search Query:
-    """%question
-        settings = provider.get_provider_default_settings()
-        messages = BasicChatHistory()
-        result = query_agent.get_chat_response(
-                message,
-                llm_sampling_settings=settings,
-                chat_history=messages,
-                returns_streaming_generator=False,
-                print_output=False,
-            )
-        return clean_text(result)
     except Exception as e:
         # Custom exception handling
         raise CustomExceptionHandling(e, sys) from e
 def respond(
     message: str,
@@ -186,7 +235,6 @@ def respond(
 ):
     """
     Respond to a message using the Gemma3 model via Llama.cpp.
     Args:
         - message (str): The message to respond to.
         - history (List[Tuple[str, str]]): The chat history.
@@ -197,101 +245,13 @@ def respond(
         - top_p (float): The top-p of the model.
         - top_k (int): The top-k of the model.
         - repeat_penalty (float): The repetition penalty of the model.
     Returns:
         str: The response to the message.
     """
     if model is None:#
         return
-    try:
-        # Load the global variables
-        global llm
-        global llm_model
-        # Load the model
-        if llm is None or llm_model != model:
-            llm = Llama(
-                model_path=f"models/{model}",
-                flash_attn=False,
-                n_gpu_layers=0,
-                n_batch=16,
-                n_ctx=2048,
-                n_threads=2,
-                n_threads_batch=2,
-                verbose=False
-            )
-            llm_model = model
-        provider = LlamaCppPythonProvider(llm)
-        query = to_query(provider,message)
-        text = retriever_tool(query=f"{query}")
-        #very sensitive against prompt
-        retriever_system="""
-        You are an AI assistant that answers questions based on below retrievered documents.
-Documents:
----
-%s
----
-Question: %s
-Answer:
-        """ % (text,message)
-        # Create the agent
-        agent = LlamaCppAgent(
-            provider,
-            #system_prompt=f"{retriever_system}",
-            system_prompt="you are kind assistant",
-            predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
-            debug_output=False,
-        )
-        # Set the settings like temperature, top-k, top-p, max tokens, etc.
-        settings = provider.get_provider_default_settings()
-        settings.temperature = temperature
-        settings.top_k = top_k
-        settings.top_p = top_p
-        settings.max_tokens = max_tokens
-        settings.repeat_penalty = repeat_penalty
-        settings.stream = True
-        messages = BasicChatHistory()
-        # Add the chat history
-        for msn in history:
-            user = {"role": Roles.user, "content": msn[0]}
-            assistant = {"role": Roles.assistant, "content": msn[1]}
-            messages.add_message(user)
-            messages.add_message(assistant)
-        # Get the response stream
-        stream = agent.get_chat_response(
-            retriever_system,
-            #retriever_system+text,
-            #retriever_system+text,
-            llm_sampling_settings=settings,
-            chat_history=messages,
-            returns_streaming_generator=True,
-            print_output=False,
-        )
-        # Log the success
-        logging.info("Response stream generated successfully")
-        # Generate the response
-        outputs = ""
-        for output in stream:
-            outputs += output
-            yield outputs
-    # Handle exceptions that may occur during the process
-    except Exception as e:
-        # Custom exception handling
-        raise CustomExceptionHandling(e, sys) from e
 # Create a chat interface
 demo = gr.ChatInterface(
@@ -303,12 +263,12 @@ demo = gr.ChatInterface(
     additional_inputs=[
         gr.Dropdown(
             choices=[
-                "google_gemma-3-4b-it-Q4_K_M.gguf",
-                "google_gemma-3-1b-it-Q5_K_M.gguf",
             ],
-            value="google_gemma-3-4b-it-Q4_K_M.gguf",
             label="Model",
-            info="Select the AI model to use for chat",
         ),
         gr.Textbox(
             value="You are a helpful assistant.",

 import sys
 import joblib
 from llama_cpp import Llama
 import gradio as gr
 from huggingface_hub import hf_hub_download
 from typing import List, Tuple,Dict,Optional
 from logger import logging
 from exception import CustomExceptionHandling
 cache_file = "docs_processed.joblib"
 if os.path.exists(cache_file):
     docs_processed = joblib.load(cache_file)
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 hf_hub_download(
+    repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
+    filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
     local_dir="./models",
 )
# Size variant of the Flan-T5 GGUF model. The same value selects both the
# repository and the file inside it, so the two can never drift apart.
t5_size = "base"
hf_hub_download(
    repo_id=f"Felladrin/gguf-flan-t5-{t5_size}",
    # Previously interpolated `{size}`, an undefined name that raised
    # NameError as soon as the module was imported.
    filename=f"flan-t5-{t5_size}.Q8_0.gguf",
    local_dir="./models",
)
 # Set the title and description
+title = "Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """My Best CPU Rag Solution"""
 query_system = """
 def clean_text(text):
     cleaned = re.sub(r'[^\x00-\x7F]+', '', text)  # Remove non-ASCII chars
     cleaned = re.sub(r'[^a-zA-Z0-9_\- ]', '', cleaned) #Then your original rule
+    cleaned = cleaned.replace("---","")
     return cleaned
def generate_t5(llama, message, *, max_tokens=512, temperature=0.5, top_k=40,
                top_p=0.95, repeat_penalty=1.2):
    """Generate text from an encoder-decoder (T5-style) model.

    The prompt is tokenized and passed to ``llama.encode``; decoding then
    starts from ``llama.decoder_start_token()`` and stops at the model's
    end-of-sequence token or after ``max_tokens`` generated tokens.

    Args:
        llama: Loaded model object providing ``tokenize``, ``encode``,
            ``decoder_start_token``, ``generate``, ``detokenize`` and
            ``token_eos``.
        message: Prompt text. The original author notes that it must be
            smaller than the model context (default 512 tokens).
        max_tokens: Upper bound on generated tokens, so a model that never
            emits end-of-sequence cannot loop forever.
        temperature: Sampling temperature passed to ``generate`` as ``temp``.
        top_k: Top-k sampling value passed to ``generate``.
        top_p: Top-p sampling value passed to ``generate``.
        repeat_penalty: Repetition penalty passed to ``generate``.

    Returns:
        The generated text, without the end-of-sequence token.

    Raises:
        ValueError: If ``llama`` is ``None``.
        CustomExceptionHandling: Wraps any error raised while generating.
    """
    if llama is None:
        raise ValueError("llama not initialized")
    try:
        prompt_tokens = llama.tokenize(f"{message}".encode("utf-8"))
        print(f"text length={len(prompt_tokens)}")
        llama.encode(prompt_tokens)
        print("stepped")
        # TODO support stream
        eos_token = llama.token_eos()
        pieces = []
        token_stream = llama.generate(
            [llama.decoder_start_token()],
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        )
        for token in token_stream:
            # Stop before detokenizing so the EOS marker never reaches the
            # output, and enforce the generation budget.
            if token == eos_token or len(pieces) >= max_tokens:
                break
            pieces.append(llama.detokenize([token]))
        # Decode once over the joined bytes: a multi-byte UTF-8 character may
        # be split across tokens, which made per-token decoding raise.
        return b"".join(pieces).decode("utf-8", errors="replace")
    except Exception as e:
        raise CustomExceptionHandling(e, sys) from e
def to_query(question):
    """Rewrite a user question into a short keyword search query.

    A few-shot prompt is filled with ``question`` and passed to the Flan-T5
    model through ``generate_t5``; the output is sanitised by ``clean_text``.
    The model is loaded from ``models/`` on first use and kept in the
    module-level ``llama`` global for later calls.

    Args:
        question: The user's question.

    Returns:
        The cleaned search query text.

    Raises:
        CustomExceptionHandling: Wraps any error raised while loading the
            model or generating the query.
    """
    prompt = """
You are a query rewriter. Your task is to convert a user's question into a concise search query suitable for information retrieval.
The goal is to identify the most important keywords for a search engine.
Here are some examples:
User Question: What is transformer?
Search Query: transformer
User Question: How does a transformer model work in natural language processing?
Search Query: transformer model natural language processing
User Question: What are the advantages of using transformers over recurrent neural networks?
Search Query: transformer vs recurrent neural network advantages
User Question: Explain the attention mechanism in transformers.
Search Query: transformer attention mechanism
User Question: What are the different types of transformer architectures?
Search Query: transformer architectures
User Question: What is the history of the transformer model?
Search Query: transformer model history
---
Now, rewrite the following question:
User Question: %s
Search Query:
""" % question
    global llama
    try:
        # Look the global up through globals(): nothing visible assigns
        # `llama` before the first call, so reading it directly would raise
        # NameError instead of triggering the lazy load.
        if globals().get("llama") is None:
            model_id = f"flan-t5-{t5_size}.Q8_0.gguf"
            llama = Llama(
                f"models/{model_id}",
                flash_attn=False,
                n_gpu_layers=0,
                n_threads=2,
                n_threads_batch=2,
            )
        query = generate_t5(llama, prompt)
        return clean_text(query)
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e
def answer(document: str, question: str, model: str = "Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf") -> str:
    """Answer ``question`` from ``document`` with the given GGUF model.

    The model is loaded from ``models/<model>`` and kept in the module-level
    ``llm`` / ``llm_model`` globals. It is reloaded only when no model is
    loaded yet or a different ``model`` file is requested, instead of reading
    the weights from disk on every call.

    Args:
        document: Retrieved text, substituted first into ``qwen_prompt``.
        question: The user's question, substituted second into ``qwen_prompt``.
        model: File name of the GGUF model inside ``models/``.

    Returns:
        The ``text`` field of the first completion choice.
    """
    global llm
    global llm_model
    # Used for both the context window and the completion budget.
    context_size = 2048 * 4
    module_state = globals()
    # Read through globals() because nothing visible initialises `llm` or
    # `llm_model` before the first call.
    if module_state.get("llm") is None or module_state.get("llm_model") != model:
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=False,
            n_gpu_layers=0,
            n_batch=1024,
            n_ctx=context_size,
            n_threads=2,
            n_threads_batch=2,
            verbose=False,
        )
        llm_model = model
    result = llm(qwen_prompt % (document, question), max_tokens=context_size)
    return result['choices'][0]['text']
 def respond(
     message: str,
 ):
     """
     Respond to a message using the Gemma3 model via Llama.cpp.
     Args:
         - message (str): The message to respond to.
         - history (List[Tuple[str, str]]): The chat history.
         - top_p (float): The top-p of the model.
         - top_k (int): The top-k of the model.
         - repeat_penalty (float): The repetition penalty of the model.
     Returns:
         str: The response to the message.
     """
     if model is None:#
         return
+    return to_query(message)
 # Create a chat interface
 demo = gr.ChatInterface(
     additional_inputs=[
         gr.Dropdown(
             choices=[
+                "Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
             ],
+            value="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
             label="Model",
+            info="Select the AI model to use for chat",visible=False
         ),
         gr.Textbox(
             value="You are a helpful assistant.",