release
app.py
CHANGED
@@ -91,14 +91,7 @@ hf_hub_download(
     local_dir="./models",
 )
 
-
-title = "Lhama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
-description = """## My Best CPU Rag Solution
-- I use forked [lhamacpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) server and this doesn't support new model
-- search query generation(query reformulation) Tasks - I use flan-t5-base (large make better result,but too large for just this task)
-- Qwen2.5-0.5B as good as small-size.
-- anyway google T5 series on CPU is amazing
-"""
+
 
 
 
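For orientation, the hf_hub_download call this hunk sits inside is what places the GGUF files under ./models before the UI is built. A minimal sketch of that pattern, using a placeholder repo_id and filename rather than the Space's real ones:

# Sketch: download a GGUF model into ./models with huggingface_hub.
# repo_id and filename are placeholders, not the Space's actual values.
from huggingface_hub import hf_hub_download

model_path = hf_hub_download(
    repo_id="example-user/example-gguf-repo",  # placeholder repository
    filename="model.Q6_K.gguf",                # placeholder quantized file
    local_dir="./models",
)
print(model_path)  # resolves to a path inside ./models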
@@ -142,13 +135,12 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
     try:
         tokens = llama.tokenize(f"{message}".encode("utf-8"))
         print(f"text length={len(tokens)}")
-        #print(tokens)
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
 
 
         outputs =""
-
+
         iteration = 1
         temperature = 0.5
         top_k = 40
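The generate_t5 hunk only shows the encoder setup, so for context here is a rough sketch of how a T5-style encode-then-decode loop is typically written against the forked llama-cpp-python named in the description. encode() and decoder_start_token() appear in the diff itself; the eval()/sample()/detokenize()/token_eos() calls and the max_iterations cap are assumptions based on the library's usual low-level API, not the Space's exact code.

# Hedged sketch of a T5 encoder-decoder loop (not the Space's exact generate_t5).
def generate_t5_sketch(llama, message, max_iterations=512,
                       temperature=0.5, top_k=40, top_p=0.95, repeat_penalty=1.2):
    tokens = llama.tokenize(f"{message}".encode("utf-8"))
    llama.encode(tokens)                     # run the T5 encoder over the prompt
    tokens = [llama.decoder_start_token()]   # seed the decoder
    outputs = ""
    for _ in range(max_iterations):
        llama.eval(tokens)                   # feed the latest decoder token(s)
        token = llama.sample(top_k=top_k, top_p=top_p,
                             temp=temperature, repeat_penalty=repeat_penalty)
        if token == llama.token_eos():       # stop at end-of-sequence
            break
        outputs += llama.detokenize([token]).decode("utf-8", errors="ignore")
        tokens = [token]                     # only the new token is fed back
    return outputs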
@@ -234,12 +226,6 @@ def answer(document:str,question:str,model:str="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_
         verbose=False
     )
     llm_model = model
-    #provider = LlamaCppPythonProvider(llm)
-
-
-    #answer = to_answer(provider,document,question)
-    #return result['choices'][0]['text']
-
 
 def respond(
     message: str,
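answer() keeps a module-level llm_model name next to the loaded Llama instance so the GGUF file is only reloaded when a different model is requested. A hedged sketch of that caching idiom; the globals, the n_ctx value, and the ./models path layout are assumptions, since the diff does not show them:

# Hedged sketch of "reload only when the requested model changes".
from llama_cpp import Llama

llm = None
llm_model = None

def get_llm(model: str) -> Llama:
    global llm, llm_model
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"./models/{model}",  # GGUF file downloaded earlier
            n_ctx=2048,                      # assumed context size
            verbose=False,
        )
        llm_model = model                    # remember which model is loaded
    return llm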
@@ -276,13 +262,25 @@ def respond(
     answer(document,message)
     response = ""
     #do direct in here
-    for chunk in llm(system_message%(document,message),max_tokens=
+    for chunk in llm(system_message%(document,message),max_tokens=max_tokens,stream=True,top_k=top_k, top_p=top_p, temperature=temperature, repeat_penalty=repeat_penalty):
         text = chunk['choices'][0]['text']
-        #print(text, end='', flush=True) # 逐次表示
         response += text
         yield response
 
+
 # Create a chat interface
+# Set the title and description
+title = "Lhama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """
+- I use forked [lhamacpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) which support T5 on server and it's doesn't support new models(like gemma3)
+- Search query generation(query reformulation) Tasks - I use flan-t5-base (large make better result,but too large for just this task)
+- Qwen2.5-0.5B as good as small-size.
+- anyway google T5 series on CPU is amazing
+## Huggingface Free CPU Limitation
+- When duplicating a space, the build process can occasionally become stuck, requiring a manual restart to finish.
+- Spaces may unexpectedly stop functioning or even be deleted, leading to the need to rework them. Refer to [issue](https://github.com/huggingface/hub-docs/issues/1633) for more information.
+"""
+
 demo = gr.ChatInterface(
     respond,
     examples=[["What is the Diffuser?"], ["Tell me About Huggingface."], ["How to upload dataset?"]],
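The rewritten loop streams completion chunks from llama-cpp-python and yields the accumulated text, which is the generator contract gr.ChatInterface expects for progressive output. A self-contained sketch of that pattern; the prompt template below is an invented stand-in for the Space's system_message, which this diff does not show in full:

# Hedged sketch of streaming generation for a Gradio chat callback.
PROMPT = "Answer using only this document:\n%s\n\nQuestion: %s\nAnswer:"  # stand-in template

def stream_answer(llm, document, message, max_tokens=2048,
                  temperature=0.5, top_k=40, top_p=0.95, repeat_penalty=1.2):
    response = ""
    for chunk in llm(PROMPT % (document, message),
                     max_tokens=max_tokens, stream=True,
                     top_k=top_k, top_p=top_p,
                     temperature=temperature, repeat_penalty=repeat_penalty):
        response += chunk['choices'][0]['text']  # each chunk carries a small text delta
        yield response                           # Gradio re-renders the growing answer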
@@ -306,9 +304,9 @@ demo = gr.ChatInterface(
             lines=2,visible=False
         ),
         gr.Slider(
-            minimum=
-            maximum=
-            value=
+            minimum=1024,
+            maximum=8192,
+            value=2048,
             step=1,
             label="Max Tokens",
             info="Maximum length of response (higher = longer replies)",
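The slider now has concrete bounds (1024 to 8192, default 2048). For context, a hedged sketch of how such a Max Tokens slider reaches the chat callback through gr.ChatInterface's additional_inputs; the echo-style callback is a placeholder, and the Space's real interface passes more inputs than shown here:

# Hedged sketch: wiring a Max Tokens slider into gr.ChatInterface via additional_inputs.
import gradio as gr

def respond(message, history, max_tokens):
    # Placeholder callback: the real Space streams tokens from llama-cpp-python here.
    yield f"(would generate up to {max_tokens} tokens for: {message})"

demo = gr.ChatInterface(
    respond,
    examples=[["What is the Diffuser?"], ["Tell me About Huggingface."]],
    additional_inputs=[
        gr.Slider(
            minimum=1024, maximum=8192, value=2048, step=1,
            label="Max Tokens",
            info="Maximum length of response (higher = longer replies)",
        ),
    ],
)

if __name__ == "__main__":
    demo.launch()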