update
app.py
CHANGED
@@ -78,12 +78,12 @@ retriever_tool = RetrieverTool(docs_processed)
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 
-
 hf_hub_download(
     repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
     filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
     local_dir="./models",
 )
+
 t5_size="base"
 hf_hub_download(
     repo_id=f"Felladrin/gguf-flan-t5-{t5_size}",
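Note that huggingface_token is read from the environment but never passed to the downloads above. A minimal sketch of how it could be wired in, assuming huggingface_hub is installed and the token is only needed for gated or private repos (hf_hub_download returns the resolved local file path):

import os
from huggingface_hub import hf_hub_download

gguf_path = hf_hub_download(
    repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
    filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
    local_dir="./models",
    token=os.getenv("HUGGINGFACE_TOKEN"),  # assumption: optional for public repos
)
print(gguf_path)  # e.g. ./models/Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf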
@@ -92,8 +92,13 @@ hf_hub_download(
 )
 
 # Set the title and description
-title = "Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
-description = """My Best CPU Rag Solution
+title = "Llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """## My Best CPU RAG Solution
+- I use a forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) server, which does not support newer models
+- For search-query generation (query reformulation) I use flan-t5-base (the large variant gives better results, but is too big for just this task)
+- Qwen2.5-0.5B performs well for its small size
+- Google's T5 series on CPU is amazing
+"""
 
 
 
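The new description is a markdown string; gr.ChatInterface renders the title and description above the chat area. A minimal sketch of that wiring, assuming gradio is installed (the echo fn is a placeholder, not the app's real handler):

import gradio as gr

preview = gr.ChatInterface(
    fn=lambda message, history: message,  # placeholder echo handler
    title=title,
    description=description,  # markdown is rendered in the UI
)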
@@ -231,9 +236,9 @@ def answer(document:str,question:str,model:str="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_
     llm_model = model
     #provider = LlamaCppPythonProvider(llm)
 
-
+
     #answer = to_answer(provider,document,question)
-    return result['choices'][0]['text']
+    #return result['choices'][0]['text']
 
 
 def respond(
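With the return commented out, answer() now returns None and generation moves into respond() below. For reference, the shape the old line assumed is llama-cpp-python's completion dict, e.g. (sketch, not the app's code):

result = llm("prompt text", max_tokens=256)  # llama_cpp.Llama.__call__ returns a completion dict
text = result['choices'][0]['text']          # the generated text lives here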
@@ -267,7 +272,15 @@ def respond(
 
     query = to_query(message)
     document = retriever_tool(query=query)
-
+    print(document)
+    answer(document,message)
+    response = ""
+    #do it directly here
+    for chunk in llm(system_message%(document,message),max_tokens=2048*4,stream=True,top_k=top_k, top_p=top_p, temperature=temperature, repeat_penalty=repeat_penalty):
+        text = chunk['choices'][0]['text']
+        #print(text, end='', flush=True)  # incremental display
+        response += text
+        yield response
 
 # Create a chat interface
 demo = gr.ChatInterface(
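The new loop streams straight from the llama-cpp-python model and yields the growing response string, which is what gr.ChatInterface expects from a generator fn; answer(document,message) is now called only for its side effects, since its return was commented out above. A self-contained sketch of the same streaming pattern, assuming the GGUF path from the download step and a stock llama_cpp.Llama:

from llama_cpp import Llama

llm = Llama(model_path="./models/Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf")
response = ""
for chunk in llm("Answer briefly: what is RAG?", max_tokens=128, stream=True):
    response += chunk['choices'][0]['text']  # each chunk carries a text fragment
print(response)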
@@ -287,7 +300,7 @@ demo = gr.ChatInterface(
             info="Select the AI model to use for chat",visible=False
         ),
         gr.Textbox(
-            value=
+            value=qwen_prompt,
             label="System Prompt",
             info="Define the AI assistant's personality and behavior",
             lines=2,visible=False
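qwen_prompt is not defined in this diff; given its use as system_message%(document,message) in respond(), it is presumably a %-style template defined elsewhere in app.py with two %s slots, along these lines (hypothetical example, not the author's actual prompt):

qwen_prompt = "Context:\n%s\n\nQuestion: %s\nAnswer:"  # hypothetical: two %s slots for document and message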