fix reasoning model's thought process display
app.py CHANGED
@@ -6,11 +6,34 @@ import gc
 import shutil
 import re
 
-#
-
+# ----- Custom CSS for pretty formatting of internal reasoning -----
+CUSTOM_CSS = """
+<style>
+/* Styles for the internal reasoning bullet list */
+ul.think-list {
+    margin: 0.5em 0 1em 1.5em;
+    padding: 0;
+    list-style-type: disc;
+}
+ul.think-list li {
+    margin-bottom: 0.5em;
+}
+
+/* Container style for the "in progress" internal reasoning */
+.chat-assistant {
+    background-color: #f9f9f9;
+    padding: 1em;
+    border-radius: 5px;
+    margin-bottom: 1em;
+}
+</style>
+"""
+st.markdown(CUSTOM_CSS, unsafe_allow_html=True)
+
+# ----- Set a threshold for required free storage (in bytes) -----
 REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB
 
-# Available models
+# ----- Available models -----
 MODELS = {
     "Qwen2.5-7B-Instruct (Q2_K)": {
         "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
@@ -49,7 +72,7 @@ MODELS = {
     },
 }
 
-# Sidebar
+# ----- Sidebar settings -----
 with st.sidebar:
     st.header("⚙️ Settings")
     selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
@@ -78,11 +101,11 @@ with st.sidebar:
     except Exception as e:
         st.error(f"Disk usage error: {e}")
 
-# Model info
+# ----- Model info -----
 selected_model = MODELS[selected_model_name]
 model_path = os.path.join("models", selected_model["filename"])
 
-#
+# ----- Session state initialization -----
 if "model_name" not in st.session_state:
     st.session_state.model_name = None
 if "llm" not in st.session_state:
@@ -92,10 +115,10 @@ if "chat_history" not in st.session_state:
 if "pending_response" not in st.session_state:
     st.session_state.pending_response = False
 
-# Ensure model directory exists
+# ----- Ensure model directory exists -----
 os.makedirs("models", exist_ok=True)
 
-#
+# ----- Functions for model management -----
 def cleanup_old_models():
     for f in os.listdir("models"):
         if f.endswith(".gguf") and f != selected_model["filename"]:
@@ -110,7 +133,7 @@ def download_model():
         repo_id=selected_model["repo_id"],
         filename=selected_model["filename"],
         local_dir="./models",
-        local_dir_use_symlinks=False,
+        local_dir_use_symlinks=False,  # Deprecated parameter; harmless warning.
     )
 
 def try_load_model(path):
@@ -130,9 +153,8 @@ def try_load_model(path):
         return str(e)
 
 def validate_or_download_model():
-    # Download model if
+    # Download model if not present locally.
     if not os.path.exists(model_path):
-        # Check free space and cleanup old models only if free space is insufficient.
         free_space = shutil.disk_usage(".").free
         if free_space < REQUIRED_SPACE_BYTES:
             st.info("Insufficient storage detected. Cleaning up old models to free up space.")
@@ -146,7 +168,6 @@ def validate_or_download_model():
                 os.remove(model_path)
             except Exception:
                 pass
-        # Check storage again before re-downloading.
         free_space = shutil.disk_usage(".").free
         if free_space < REQUIRED_SPACE_BYTES:
             st.info("Insufficient storage detected on re-download attempt. Cleaning up old models to free up space.")
@@ -159,7 +180,7 @@ def validate_or_download_model():
         return result
     return result
 
-# Load model if changed
+# ----- Load model if changed -----
 if st.session_state.model_name != selected_model_name:
     if st.session_state.llm is not None:
         del st.session_state.llm
@@ -169,43 +190,38 @@ if st.session_state.model_name != selected_model_name:
 
 llm = st.session_state.llm
 
-# Display title and caption
+# ----- Display title and caption -----
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
 
-# Render
+# ----- Render full chat history -----
 for chat in st.session_state.chat_history:
     with st.chat_message(chat["role"]):
         st.markdown(chat["content"])
-        # For assistant messages, if there's internal reasoning, display it behind an expander
+        # For assistant messages, if there's completed internal reasoning, display it behind an expander.
         if chat.get("role") == "assistant" and chat.get("thinking"):
             with st.expander("🧠 Model's Internal Reasoning"):
                 for t in chat["thinking"]:
                     st.markdown(t.strip())
 
-# Chat input widget
+# ----- Chat input widget -----
 user_input = st.chat_input("Ask something...")
 
 if user_input:
-    # Block new input if a response is still pending
     if st.session_state.pending_response:
         st.warning("Please wait for the assistant to finish responding.")
     else:
-        # Append and render the user's message
         st.session_state.chat_history.append({"role": "user", "content": user_input})
         with st.chat_message("user"):
             st.markdown(user_input)
 
-        # Mark that we are waiting for a response
         st.session_state.pending_response = True
 
         MAX_TURNS = 8
-        # Use the latest MAX_TURNS * 2 messages (system prompt plus conversation)
         trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
         messages = [{"role": "system", "content": system_prompt}] + trimmed_history
 
-        #
-        # one for visible output and one for the think part.
+        # ----- Streaming the assistant response -----
         with st.chat_message("assistant"):
             visible_placeholder = st.empty()
             thinking_placeholder = st.empty()
@@ -219,34 +235,46 @@ if user_input:
                 repeat_penalty=repeat_penalty,
                 stream=True,
             )
-
+
             for chunk in stream:
                 if "choices" in chunk:
                     delta = chunk["choices"][0]["delta"].get("content", "")
                     full_response += delta
-
+
+                    # Determine if there is an open (in-progress) <think> block
+                    open_think = re.search(r"<think>([^<]*)$", full_response, flags=re.DOTALL)
+                    in_progress = open_think.group(1).strip() if open_think else ""
+
+                    # Create the visible response by removing any complete <think>...</think> blocks,
+                    # and also removing any in-progress (unclosed) <think> content.
                     visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+                    visible_response = re.sub(r"<think>.*$", "", visible_response, flags=re.DOTALL)
                     visible_placeholder.markdown(visible_response)
-
-
-                    if
-
-
+
+                    # If there's an in-progress thinking part, display it in a pretty style
+                    if in_progress:
+                        # You can further format in_progress as you like; here we wrap it in a styled div.
+                        thinking_html = f"""
+                        <div class="chat-assistant">
+                            <strong>Internal Reasoning (in progress):</strong>
+                            <br>{in_progress}
+                        </div>
+                        """
+                        thinking_placeholder.markdown(thinking_html, unsafe_allow_html=True)
                     else:
                         thinking_placeholder.empty()
-
-
-
+
+            # After streaming completes:
+            # Extract all completed <think> blocks (the final internal reasoning that was closed)
+            final_thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+            # The final visible response: remove any <think> blocks or any in-progress open block.
+            final_visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+            final_visible = re.sub(r"<think>.*$", "", final_visible, flags=re.DOTALL)
+
             st.session_state.chat_history.append({
                 "role": "assistant",
-                "content":
-                "thinking":
+                "content": final_visible,
+                "thinking": final_thinking
             })
-            # Display the final internal reasoning behind an expander if available
-            if thinking:
-                with st.expander("🧠 Model's Internal Reasoning"):
-                    for t in thinking:
-                        st.markdown(t.strip())
 
-        # Clear the pending flag once done
         st.session_state.pending_response = False
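
The heart of the fix is the <think>-tag handling applied to the partially streamed text: completed reasoning blocks are collected, an unclosed block at the tail of the stream is treated as "in progress", and both are stripped from the visible answer. Below is a minimal standalone sketch of that parsing, reusing the same regular expressions the commit adds; the helper name split_think and the sample string are illustrative only, not part of the app.

import re

def split_think(full_response):
    # Completed reasoning: everything inside matched <think>...</think> pairs.
    completed = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
    # An unclosed <think> block at the end of the stream, if any.
    open_match = re.search(r"<think>([^<]*)$", full_response, flags=re.DOTALL)
    in_progress = open_match.group(1).strip() if open_match else ""
    # Visible answer: drop completed blocks first, then any unclosed tail.
    visible = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
    visible = re.sub(r"<think>.*$", "", visible, flags=re.DOTALL)
    return visible, completed, in_progress

# Illustrative partial stream: one finished block, one still being generated.
sample = "<think>check the units</think>The answer is 42. <think>should I elabo"
visible, completed, in_progress = split_think(sample)
print(visible)      # "The answer is 42. "
print(completed)    # ["check the units"]
print(in_progress)  # "should I elabo"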
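
While tokens are still arriving, the in-progress reasoning is rendered by pushing a small HTML fragment, styled by the .chat-assistant rule from CUSTOM_CSS, into the st.empty() placeholder, and the placeholder is cleared once no open block remains. A sketch of that rendering step under those assumptions, factored into a hypothetical helper (render_in_progress is not a name used in the app):

import streamlit as st

def render_in_progress(placeholder, in_progress):
    # placeholder is an st.empty() slot; styling comes from the .chat-assistant CSS class
    # injected earlier via st.markdown(CUSTOM_CSS, unsafe_allow_html=True).
    if in_progress:
        placeholder.markdown(
            f"""
            <div class="chat-assistant">
                <strong>Internal Reasoning (in progress):</strong>
                <br>{in_progress}
            </div>
            """,
            unsafe_allow_html=True,
        )
    else:
        placeholder.empty()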