Luigi committed on
Commit 9d3ca6c · Parent: 4522453

fix reasoning model's thought process display

Files changed (1)
  1. app.py +69 -41
app.py CHANGED
@@ -6,11 +6,34 @@ import gc
 import shutil
 import re
 
-# Set a threshold for required free storage (in bytes) before downloading a new model.
-# Adjust this value according to the expected size of your models.
+# ----- Custom CSS for pretty formatting of internal reasoning -----
+CUSTOM_CSS = """
+<style>
+/* Styles for the internal reasoning bullet list */
+ul.think-list {
+    margin: 0.5em 0 1em 1.5em;
+    padding: 0;
+    list-style-type: disc;
+}
+ul.think-list li {
+    margin-bottom: 0.5em;
+}
+
+/* Container style for the "in progress" internal reasoning */
+.chat-assistant {
+    background-color: #f9f9f9;
+    padding: 1em;
+    border-radius: 5px;
+    margin-bottom: 1em;
+}
+</style>
+"""
+st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
+
+# ----- Set a threshold for required free storage (in bytes) -----
 REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB
 
-# Available models
+# ----- Available models -----
 MODELS = {
     "Qwen2.5-7B-Instruct (Q2_K)": {
         "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
@@ -49,7 +72,7 @@ MODELS = {
     },
 }
 
-# Sidebar for model selection and settings
+# ----- Sidebar settings -----
 with st.sidebar:
     st.header("⚙️ Settings")
     selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
@@ -78,11 +101,11 @@ with st.sidebar:
     except Exception as e:
         st.error(f"Disk usage error: {e}")
 
-# Model info
+# ----- Model info -----
 selected_model = MODELS[selected_model_name]
 model_path = os.path.join("models", selected_model["filename"])
 
-# Initialize session state variables if not present
+# ----- Session state initialization -----
 if "model_name" not in st.session_state:
     st.session_state.model_name = None
 if "llm" not in st.session_state:
@@ -92,10 +115,10 @@ if "chat_history" not in st.session_state:
 if "pending_response" not in st.session_state:
     st.session_state.pending_response = False
 
-# Ensure model directory exists
+# ----- Ensure model directory exists -----
 os.makedirs("models", exist_ok=True)
 
-# Function to clean up old models
+# ----- Functions for model management -----
 def cleanup_old_models():
     for f in os.listdir("models"):
         if f.endswith(".gguf") and f != selected_model["filename"]:
@@ -110,7 +133,7 @@ def download_model():
         repo_id=selected_model["repo_id"],
         filename=selected_model["filename"],
         local_dir="./models",
-        local_dir_use_symlinks=False,
+        local_dir_use_symlinks=False,  # Deprecated parameter; harmless warning.
     )
 
 def try_load_model(path):
@@ -130,9 +153,8 @@ def try_load_model(path):
         return str(e)
 
 def validate_or_download_model():
-    # Download model if it doesn't exist locally.
+    # Download model if not present locally.
     if not os.path.exists(model_path):
-        # Check free space and cleanup old models only if free space is insufficient.
         free_space = shutil.disk_usage(".").free
         if free_space < REQUIRED_SPACE_BYTES:
             st.info("Insufficient storage detected. Cleaning up old models to free up space.")
@@ -146,7 +168,6 @@ def validate_or_download_model():
             os.remove(model_path)
         except Exception:
             pass
-        # Check storage again before re-downloading.
        free_space = shutil.disk_usage(".").free
        if free_space < REQUIRED_SPACE_BYTES:
            st.info("Insufficient storage detected on re-download attempt. Cleaning up old models to free up space.")
@@ -159,7 +180,7 @@ def validate_or_download_model():
         return result
     return result
 
-# Load model if changed
+# ----- Load model if changed -----
 if st.session_state.model_name != selected_model_name:
     if st.session_state.llm is not None:
         del st.session_state.llm
@@ -169,43 +190,38 @@ if st.session_state.model_name != selected_model_name:
 
 llm = st.session_state.llm
 
-# Display title and caption
+# ----- Display title and caption -----
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
 
-# Render the full chat history
+# ----- Render full chat history -----
 for chat in st.session_state.chat_history:
     with st.chat_message(chat["role"]):
         st.markdown(chat["content"])
-        # For assistant messages, if there's internal reasoning, display it behind an expander
+        # For assistant messages, if there's completed internal reasoning, display it behind an expander.
         if chat.get("role") == "assistant" and chat.get("thinking"):
             with st.expander("🧠 Model's Internal Reasoning"):
                 for t in chat["thinking"]:
                     st.markdown(t.strip())
 
-# Chat input widget
+# ----- Chat input widget -----
 user_input = st.chat_input("Ask something...")
 
 if user_input:
-    # Block new input if a response is still pending
     if st.session_state.pending_response:
         st.warning("Please wait for the assistant to finish responding.")
     else:
-        # Append and render the user's message
         st.session_state.chat_history.append({"role": "user", "content": user_input})
         with st.chat_message("user"):
             st.markdown(user_input)
 
-        # Mark that we are waiting for a response
         st.session_state.pending_response = True
 
         MAX_TURNS = 8
-        # Use the latest MAX_TURNS * 2 messages (system prompt plus conversation)
        trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
        messages = [{"role": "system", "content": system_prompt}] + trimmed_history
 
-        # Create a container for the assistant's streaming message with two placeholders:
-        # one for visible output and one for the think part.
+        # ----- Streaming the assistant response -----
        with st.chat_message("assistant"):
            visible_placeholder = st.empty()
            thinking_placeholder = st.empty()
@@ -219,34 +235,46 @@ if user_input:
                 repeat_penalty=repeat_penalty,
                 stream=True,
             )
-            # Stream and update the assistant's message in real time
+
             for chunk in stream:
                 if "choices" in chunk:
                     delta = chunk["choices"][0]["delta"].get("content", "")
                     full_response += delta
-                    # Update visible response by filtering out think parts
+
+                    # Determine if there is an open (in-progress) <think> block
+                    open_think = re.search(r"<think>([^<]*)$", full_response, flags=re.DOTALL)
+                    in_progress = open_think.group(1).strip() if open_think else ""
+
+                    # Create the visible response by removing any complete <think>...</think> blocks,
+                    # and also removing any in-progress (unclosed) <think> content.
                     visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+                    visible_response = re.sub(r"<think>.*$", "", visible_response, flags=re.DOTALL)
                     visible_placeholder.markdown(visible_response)
-                    # Extract and pretty format internal reasoning (if any) while streaming
-                    thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
-                    if thinking:
-                        thinking_display = "\n\n".join(f"- {t.strip()}" for t in thinking)
-                        thinking_placeholder.markdown(f"**Internal Reasoning (in progress):**\n\n{thinking_display}")
+
+                    # If there's an in-progress thinking part, display it in a pretty style
+                    if in_progress:
+                        # You can further format in_progress as you like; here we wrap it in a styled div.
+                        thinking_html = f"""
+                        <div class="chat-assistant">
+                            <strong>Internal Reasoning (in progress):</strong>
+                            <br>{in_progress}
+                        </div>
+                        """
+                        thinking_placeholder.markdown(thinking_html, unsafe_allow_html=True)
                     else:
                         thinking_placeholder.empty()
-            # After streaming completes, process the final full response:
-            visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
-            thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+
+            # After streaming completes:
+            # Extract all completed <think> blocks (the final internal reasoning that was closed)
+            final_thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+            # The final visible response: remove any <think> blocks or any in-progress open block.
+            final_visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+            final_visible = re.sub(r"<think>.*$", "", final_visible, flags=re.DOTALL)
+
             st.session_state.chat_history.append({
                 "role": "assistant",
-                "content": visible_response,
-                "thinking": thinking
+                "content": final_visible,
+                "thinking": final_thinking
             })
-            # Display the final internal reasoning behind an expander if available
-            if thinking:
-                with st.expander("🧠 Model's Internal Reasoning"):
-                    for t in thinking:
-                        st.markdown(t.strip())
 
-            # Clear the pending flag once done
         st.session_state.pending_response = False
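
For reference, the core of this fix, separating completed <think>...</think> reasoning, an unclosed in-progress block, and the user-visible text, can be exercised outside Streamlit. The sketch below is illustrative rather than part of app.py: the helper name split_think_blocks is hypothetical, but the regular expressions are the ones introduced in the diff above.

import re

def split_think_blocks(full_response: str):
    """Return (visible_text, completed_thoughts, in_progress_thought)."""
    # Completed <think>...</think> blocks: the model's finished reasoning.
    completed = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
    # An unclosed <think> block still being streamed, if any.
    # Note: [^<]* stops at the first "<", so a literal "<" inside an open block
    # would truncate the live preview; closed-block extraction is unaffected.
    open_match = re.search(r"<think>([^<]*)$", full_response, flags=re.DOTALL)
    in_progress = open_match.group(1).strip() if open_match else ""
    # Visible text: drop completed blocks, then any trailing unclosed block.
    visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
    visible = re.sub(r"<think>.*$", "", visible, flags=re.DOTALL)
    return visible.strip(), completed, in_progress

if __name__ == "__main__":
    # Mid-stream sample: one finished thought, one still open.
    partial = "Hello <think>step 1 done</think> world <think>still thin"
    visible, completed, in_progress = split_think_blocks(partial)
    print(visible)      # "Hello  world"
    print(completed)    # ["step 1 done"]
    print(in_progress)  # "still thin"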