update
app.py CHANGED
```diff
@@ -78,12 +78,12 @@ retriever_tool = RetrieverTool(docs_processed)
 # Download gguf model files
 huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 
-
 hf_hub_download(
     repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
     filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
     local_dir="./models",
 )
+
 t5_size="base"
 hf_hub_download(
     repo_id=f"Felladrin/gguf-flan-t5-{t5_size}",
```
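For reference, `hf_hub_download` returns the local path of the fetched file, so the two calls above only stage the GGUF weights under `./models`. A minimal sketch of how such a file could then be loaded, assuming the stock `llama_cpp` API (the forked build this Space uses may differ, and `n_ctx`/`n_threads` are illustrative):

```python
# Sketch only: loading the downloaded GGUF with stock llama-cpp-python.
# n_ctx and n_threads are illustrative, not values from this commit.
import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# hf_hub_download returns the absolute path to the cached file; a token
# is only required for gated or private repositories.
model_path = hf_hub_download(
    repo_id="mradermacher/Qwen2.5-0.5B-Rag-Thinking-i1-GGUF",
    filename="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_K.gguf",
    local_dir="./models",
    token=os.getenv("HUGGINGFACE_TOKEN"),
)

llm = Llama(model_path=model_path, n_ctx=4096, n_threads=4)
```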
```diff
@@ -92,8 +92,13 @@ hf_hub_download(
 )
 
 # Set the title and description
-title = "Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
-description = """My Best CPU Rag Solution
+title = "Llama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """## My Best CPU RAG Solution
+- I use the forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) server, so newer models are not supported
+- For the search-query generation (query reformulation) task I use flan-t5-base (flan-t5-large gives better results, but is too large for just this task)
+- Qwen2.5-0.5B is good for its small size.
+- Anyway, Google's T5 series on CPU is amazing
+"""
 
 
 
```
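The query-reformulation step the description mentions can be illustrated outside the Space. This sketch uses `transformers` instead of the forked llama-cpp-python T5 server the app actually runs, and the instruction wording is an assumption:

```python
# Illustration of query reformulation with flan-t5-base via transformers;
# the Space itself serves a GGUF flan-t5 through forked llama-cpp-python.
# The prompt wording below is an assumption, not copied from app.py.
from transformers import pipeline

reformulate = pipeline("text2text-generation", model="google/flan-t5-base")

message = "how do i make the rag app answer from my own documents"
prompt = f"Rewrite as a short search query: {message}"
query = reformulate(prompt, max_new_tokens=32)[0]["generated_text"]
print(query)  # condensed query handed to the retriever
```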
```diff
@@ -231,9 +236,9 @@ def answer(document:str,question:str,model:str="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_
     llm_model = model
     #provider = LlamaCppPythonProvider(llm)
 
-
+
     #answer = to_answer(provider,document,question)
-    return result['choices'][0]['text']
+    #return result['choices'][0]['text']
 
 
 def respond(
```
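The now-commented `return result['choices'][0]['text']` reflects llama-cpp-python's OpenAI-style completion dict, where the generated text sits under `choices[0]['text']`. A minimal sketch, assuming a loaded `Llama` instance named `llm`:

```python
# Shape of a non-streaming llama-cpp-python completion, matching the
# commented-out return above; `llm` is assumed to be a loaded Llama.
result = llm("Answer briefly: what is RAG?", max_tokens=128)
print(result["choices"][0]["text"])  # generated text, OpenAI-style dict
```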
```diff
@@ -267,7 +272,15 @@ def respond(
 
     query = to_query(message)
     document = retriever_tool(query=query)
-
+    print(document)
+    answer(document,message)
+    response = ""
+    #do direct in here
+    for chunk in llm(system_message%(document,message),max_tokens=2048*4,stream=True,top_k=top_k, top_p=top_p, temperature=temperature, repeat_penalty=repeat_penalty):
+        text = chunk['choices'][0]['text']
+        #print(text, end='', flush=True) # print incrementally
+        response += text
+        yield response
 
 # Create a chat interface
 demo = gr.ChatInterface(
```
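`respond()` now generates directly: with `stream=True`, llama-cpp-python's call returns an iterator of partial completion chunks, and re-yielding the accumulated string lets Gradio repaint the message as it grows. A standalone sketch of the same pattern (prompt and sampling values are placeholders):

```python
# Same streaming pattern as respond(), reduced to its core; `llm` is an
# already-loaded Llama and the sampling values are placeholders.
def stream_answer(llm, prompt):
    response = ""
    # stream=True turns the call into an iterator of partial completions.
    for chunk in llm(prompt, max_tokens=512, stream=True, temperature=0.7):
        response += chunk["choices"][0]["text"]
        yield response  # Gradio re-renders the growing string each yield

last = ""
for last in stream_answer(llm, "Explain RAG in one sentence."):
    pass
print(last)  # the complete answer
```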
```diff
@@ -287,7 +300,7 @@ demo = gr.ChatInterface(
         info="Select the AI model to use for chat",visible=False
     ),
     gr.Textbox(
-        value=
+        value=qwen_prompt,
         label="System Prompt",
         info="Define the AI assistant's personality and behavior",
         lines=2,visible=False
```
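Filling in `value=qwen_prompt` gives the hidden System Prompt textbox a concrete default that `respond()` receives as an extra argument. A self-contained sketch of that wiring, where `qwen_prompt` is a stand-in template, not the Space's actual prompt:

```python
# Sketch of wiring a hidden system-prompt Textbox into gr.ChatInterface;
# qwen_prompt here is a stand-in template, not the Space's real prompt.
import gradio as gr

qwen_prompt = "Context:\n%s\n\nQuestion:\n%s\nAnswer from the context."

def respond(message, history, system_message):
    # ChatInterface passes additional_inputs after (message, history);
    # a generator function streams partial replies to the UI.
    yield f"(would answer {message!r} using the hidden prompt)"

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value=qwen_prompt, label="System Prompt",
                   info="Define the AI assistant's personality and behavior",
                   lines=2, visible=False),
    ],
)

if __name__ == "__main__":
    demo.launch()
```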