Akjava committed
Commit ad44bc4 · 1 Parent(s): 9cbac54
Files changed (1):
  1. app.py +19 -21
app.py CHANGED
@@ -91,14 +91,7 @@ hf_hub_download(
     local_dir="./models",
 )
 
-# Set the title and description
-title = "Lhama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
-description = """## My Best CPU Rag Solution
-- I use forked [lhamacpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) server and this doesn't support new model
-- search query generation(query reformulation) Tasks - I use flan-t5-base (large make better result,but too large for just this task)
-- Qwen2.5-0.5B as good as small-size.
-- anyway google T5 series on CPU is amazing
-"""
+
 
 
 
@@ -142,13 +135,12 @@ def generate_t5(llama,message):#text size must be smaller than ctx(default=512)
     try:
         tokens = llama.tokenize(f"{message}".encode("utf-8"))
         print(f"text length={len(tokens)}")
-        #print(tokens)
         llama.encode(tokens)
         tokens = [llama.decoder_start_token()]
 
 
         outputs =""
-        #TODO support stream
+
         iteration = 1
         temperature = 0.5
         top_k = 40
@@ -234,12 +226,6 @@ def answer(document:str,question:str,model:str="Qwen2.5-0.5B-Rag-Thinking.i1-Q6_
         verbose=False
     )
     llm_model = model
-    #provider = LlamaCppPythonProvider(llm)
-
-
-    #answer = to_answer(provider,document,question)
-    #return result['choices'][0]['text']
-
 
 def respond(
     message: str,
@@ -276,13 +262,25 @@ def respond(
     answer(document,message)
     response = ""
     #do direct in here
-    for chunk in llm(system_message%(document,message),max_tokens=2048*4,stream=True,top_k=top_k, top_p=top_p, temperature=temperature, repeat_penalty=repeat_penalty):
+    for chunk in llm(system_message%(document,message),max_tokens=max_tokens,stream=True,top_k=top_k, top_p=top_p, temperature=temperature, repeat_penalty=repeat_penalty):
         text = chunk['choices'][0]['text']
-        #print(text, end='', flush=True)  # print each chunk as it arrives
         response += text
         yield response
 
+
 # Create a chat interface
+# Set the title and description
+title = "Lhama.cpp Qwen2.5-0.5B-Rag-Thinking-Flan-T5"
+description = """
+- I use a forked [llama-cpp-python](https://github.com/fairydreaming/llama-cpp-python/tree/t5) that supports T5 on the server; it does not support newer models (like gemma3)
+- Search-query generation (query reformulation): I use flan-t5-base (flan-t5-large gives better results, but is too large for just this task)
+- Qwen2.5-0.5B performs well for its small size.
+- In any case, Google's T5 series on CPU is impressive
+## Hugging Face Free CPU Limitations
+- When duplicating a Space, the build can occasionally get stuck and needs a manual restart to finish.
+- Spaces may unexpectedly stop working or even be deleted, requiring a rebuild. See [this issue](https://github.com/huggingface/hub-docs/issues/1633) for details.
+"""
+
 demo = gr.ChatInterface(
     respond,
     examples=[["What is the Diffuser?"], ["Tell me About Huggingface."], ["How to upload dataset?"]],
@@ -306,9 +304,9 @@ demo = gr.ChatInterface(
             lines=2,visible=False
         ),
         gr.Slider(
-            minimum=512,
-            maximum=2048,
-            value=1024,
+            minimum=1024,
+            maximum=8192,
+            value=2048,
             step=1,
             label="Max Tokens",
             info="Maximum length of response (higher = longer replies)",