Luigi committed
Commit 8c3c2b9 · 1 Parent(s): c09049b

support thinking models and stream the model's thought as it is generated
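In short: format_conversation() now passes enable_thinking=True through apply_chat_template, so chat templates that understand that flag (Qwen3-style templates, for example) leave the model's reasoning turned on, and chat_response() splits the streamed output on <think>...</think> so the thought appears in its own chat bubble while the final answer streams into a separate one.

A minimal sketch of the prompt-side call outside the app — the model id is illustrative, and it assumes a checkpoint whose chat template honours enable_thinking:

from transformers import AutoTokenizer

# Illustrative checkpoint; any tokenizer whose chat template supports
# the enable_thinking flag would behave the same way.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Why is the sky blue?"},
]

# Mirrors format_conversation() after this commit: render to text, keep the
# generation prompt, and leave the <think> section enabled.
prompt = tok.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=True,
)
print(prompt)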

Files changed (1)
  1. app.py +59 -10
app.py CHANGED
@@ -4,6 +4,7 @@ import gc
 import threading
 from itertools import islice
 from datetime import datetime
+import re # for parsing <think> blocks
 import gradio as gr
 import torch
 from transformers import pipeline, TextIteratorStreamer
@@ -98,7 +99,7 @@ def retrieve_context(query, max_results=6, max_chars=600):
 def format_conversation(history, system_prompt, tokenizer):
     if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
         messages = [{"role": "system", "content": system_prompt.strip()}] + history
-        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=True)
     else:
         # Fallback for base LMs without chat template
         prompt = system_prompt.strip() + "\n"
@@ -178,25 +179,73 @@ def chat_response(user_msg, chat_history, system_prompt,
                 'top_p': top_p,
                 'repetition_penalty': repeat_penalty,
                 'streamer': streamer,
-                'return_full_text': False
+                'return_full_text': False,
             }
         )
         gen_thread.start()
 
-        assistant_text = ''
-        # Prepare assistant placeholder
-        history.append({'role': 'assistant', 'content': ''})
+        # Buffers for thought vs answer
+        thought_buf = ''
+        answer_buf = ''
+        in_thought = False
+
+        # Stream tokens
         for chunk in streamer:
             if cancel_event.is_set():
                 break
-            assistant_text += chunk
-            history[-1]['content'] = assistant_text
-            # Show debug only once
-            yield history, debug # Show search results during streaming
+            text = chunk
+
+            # Detect start of thinking
+            if not in_thought and '<think>' in text:
+                in_thought = True
+                # Insert thought placeholder
+                history.append({
+                    'role': 'assistant',
+                    'content': '',
+                    'metadata': {'title': '💭 Thought'}
+                })
+                # Capture after opening tag
+                after = text.split('<think>', 1)[1]
+                thought_buf += after
+                # If closing tag in same chunk
+                if '</think>' in thought_buf:
+                    before, after2 = thought_buf.split('</think>', 1)
+                    history[-1]['content'] = before.strip()
+                    in_thought = False
+                    # Start answer buffer
+                    answer_buf = after2
+                    history.append({'role': 'assistant', 'content': answer_buf})
+                else:
+                    history[-1]['content'] = thought_buf
+                yield history, debug
+                continue
+
+            # Continue thought streaming
+            if in_thought:
+                thought_buf += text
+                if '</think>' in thought_buf:
+                    before, after2 = thought_buf.split('</think>', 1)
+                    history[-1]['content'] = before.strip()
+                    in_thought = False
+                    # Start answer buffer
+                    answer_buf = after2
+                    history.append({'role': 'assistant', 'content': answer_buf})
+                else:
+                    history[-1]['content'] = thought_buf
+                yield history, debug
+                continue
+
+            # Stream answer
+            if not answer_buf:
+                history.append({'role': 'assistant', 'content': ''})
+            answer_buf += text
+            history[-1]['content'] = answer_buf
+            yield history, debug
+
         gen_thread.join()
         yield history, debug + prompt_debug
     except Exception as e:
-        history[-1]['content'] = f"Error: {e}"
+        history.append({'role': 'assistant', 'content': f"Error: {e}"})
         yield history, debug
     finally:
         gc.collect()
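For reference, the tag-splitting state machine in chat_response() can be exercised on its own. The following is a minimal sketch rather than code from app.py: split_think_stream() is a made-up helper, chunks stands in for the TextIteratorStreamer, and the function yields ('thought', text) / ('answer', text) snapshots the same way the loop above grows the two history entries:

def split_think_stream(chunks):
    """Yield ('thought', text) and ('answer', text) snapshots from a stream
    that may contain one <think>...</think> block (illustration only)."""
    thought, answer, in_thought = '', '', False
    for text in chunks:
        # Opening tag: switch to thought mode and keep only what follows it.
        if not in_thought and '<think>' in text:
            in_thought = True
            text = text.split('<think>', 1)[1]
        if in_thought:
            thought += text
            if '</think>' in thought:
                # Closing tag: everything after it starts the answer.
                thought, rest = thought.split('</think>', 1)
                in_thought = False
                answer += rest
                yield ('thought', thought.strip())
                yield ('answer', answer)
            else:
                yield ('thought', thought)
            continue
        # Plain answer text (also the path for models that never emit <think>).
        answer += text
        yield ('answer', answer)

# Tiny driver: simulate chunks that split the tags across boundaries.
demo = ['<think>The user asks', ' about the sky.</think>', 'Rayleigh ', 'scattering.']
for kind, text in split_think_stream(demo):
    print(kind, repr(text))

In the app itself each yield hands Gradio the whole history, and the thought message's metadata={'title': '💭 Thought'} is what the Chatbot component renders as a collapsible thought block, so rewriting history[-1]['content'] on every chunk is enough to make both bubbles grow live.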