Luigi committed on
Commit 4522453 · 1 Parent(s): 6c77ec7
Files changed (1)
  1. app.py +80 -21
app.py CHANGED
@@ -6,6 +6,10 @@ import gc
 import shutil
 import re
 
+# Set a threshold for required free storage (in bytes) before downloading a new model.
+# Adjust this value according to the expected size of your models.
+REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB
+
 # Available models
 MODELS = {
     "Qwen2.5-7B-Instruct (Q2_K)": {
@@ -68,8 +72,8 @@ with st.sidebar:
     if st.button("📦 Show Disk Usage"):
         try:
             usage = shutil.disk_usage(".")
-            used = usage.used / (1024**3)
-            free = usage.free / (1024**3)
+            used = usage.used / (1024 ** 3)
+            free = usage.free / (1024 ** 3)
             st.info(f"Disk Used: {used:.2f} GB | Free: {free:.2f} GB")
         except Exception as e:
             st.error(f"Disk usage error: {e}")
@@ -78,11 +82,15 @@ with st.sidebar:
 selected_model = MODELS[selected_model_name]
 model_path = os.path.join("models", selected_model["filename"])
 
-# Init state
+# Initialize session state variables if not present
 if "model_name" not in st.session_state:
     st.session_state.model_name = None
 if "llm" not in st.session_state:
     st.session_state.llm = None
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+if "pending_response" not in st.session_state:
+    st.session_state.pending_response = False
 
 # Ensure model directory exists
 os.makedirs("models", exist_ok=True)
@@ -107,13 +115,28 @@ def download_model():
 
 def try_load_model(path):
     try:
-        return Llama(model_path=path, n_ctx=1024, n_threads=2, n_threads_batch=2, n_batch=4, n_gpu_layers=0, use_mlock=False, use_mmap=True, verbose=False)
+        return Llama(
+            model_path=path,
+            n_ctx=1024,
+            n_threads=2,
+            n_threads_batch=2,
+            n_batch=4,
+            n_gpu_layers=0,
+            use_mlock=False,
+            use_mmap=True,
+            verbose=False,
+        )
     except Exception as e:
         return str(e)
 
 def validate_or_download_model():
+    # Download model if it doesn't exist locally.
     if not os.path.exists(model_path):
-        cleanup_old_models()
+        # Check free space and cleanup old models only if free space is insufficient.
+        free_space = shutil.disk_usage(".").free
+        if free_space < REQUIRED_SPACE_BYTES:
+            st.info("Insufficient storage detected. Cleaning up old models to free up space.")
+            cleanup_old_models()
         download_model()
 
     result = try_load_model(model_path)
@@ -121,9 +144,13 @@ def validate_or_download_model():
         st.warning(f"Initial load failed: {result}\nAttempting re-download...")
         try:
             os.remove(model_path)
-        except:
+        except Exception:
             pass
-        cleanup_old_models()
+        # Check storage again before re-downloading.
+        free_space = shutil.disk_usage(".").free
+        if free_space < REQUIRED_SPACE_BYTES:
+            st.info("Insufficient storage detected on re-download attempt. Cleaning up old models to free up space.")
+            cleanup_old_models()
         download_model()
         result = try_load_model(model_path)
         if isinstance(result, str):
@@ -142,29 +169,46 @@ if st.session_state.model_name != selected_model_name:
 
 llm = st.session_state.llm
 
-# Chat history state
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = []
-
+# Display title and caption
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
 
+# Render the full chat history
+for chat in st.session_state.chat_history:
+    with st.chat_message(chat["role"]):
+        st.markdown(chat["content"])
+        # For assistant messages, if there's internal reasoning, display it behind an expander
+        if chat.get("role") == "assistant" and chat.get("thinking"):
+            with st.expander("🧠 Model's Internal Reasoning"):
+                for t in chat["thinking"]:
+                    st.markdown(t.strip())
+
+# Chat input widget
 user_input = st.chat_input("Ask something...")
 
 if user_input:
-    if st.session_state.chat_history and st.session_state.chat_history[-1]["role"] == "user":
-        st.warning("Please wait for the assistant to respond before sending another message.")
+    # Block new input if a response is still pending
+    if st.session_state.pending_response:
+        st.warning("Please wait for the assistant to finish responding.")
    else:
+        # Append and render the user's message
        st.session_state.chat_history.append({"role": "user", "content": user_input})
-
        with st.chat_message("user"):
            st.markdown(user_input)
 
+        # Mark that we are waiting for a response
+        st.session_state.pending_response = True
+
        MAX_TURNS = 8
-        trimmed_history = st.session_state.chat_history[-MAX_TURNS * 2:]
+        # Use the latest MAX_TURNS * 2 messages (system prompt plus conversation)
+        trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
        messages = [{"role": "system", "content": system_prompt}] + trimmed_history
 
+        # Create a container for the assistant's streaming message with two placeholders:
+        # one for visible output and one for the think part.
        with st.chat_message("assistant"):
+            visible_placeholder = st.empty()
+            thinking_placeholder = st.empty()
            full_response = ""
            stream = llm.create_chat_completion(
                messages=messages,
@@ -175,19 +219,34 @@ if user_input:
                repeat_penalty=repeat_penalty,
                stream=True,
            )
-
+            # Stream and update the assistant's message in real time
            for chunk in stream:
                if "choices" in chunk:
                    delta = chunk["choices"][0]["delta"].get("content", "")
                    full_response += delta
-
+                    # Update visible response by filtering out think parts
+                    visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+                    visible_placeholder.markdown(visible_response)
+                    # Extract and pretty format internal reasoning (if any) while streaming
+                    thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+                    if thinking:
+                        thinking_display = "\n\n".join(f"- {t.strip()}" for t in thinking)
+                        thinking_placeholder.markdown(f"**Internal Reasoning (in progress):**\n\n{thinking_display}")
+                    else:
+                        thinking_placeholder.empty()
+            # After streaming completes, process the final full response:
            visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
-            st.markdown(visible_response)
-
-            st.session_state.chat_history.append({"role": "assistant", "content": full_response})
-
            thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+            st.session_state.chat_history.append({
+                "role": "assistant",
+                "content": visible_response,
+                "thinking": thinking
+            })
+            # Display the final internal reasoning behind an expander if available
            if thinking:
                with st.expander("🧠 Model's Internal Reasoning"):
                    for t in thinking:
                        st.markdown(t.strip())
+
+        # Clear the pending flag once done
+        st.session_state.pending_response = False
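
Note: the storage guard added in this commit reduces to a single `shutil.disk_usage` comparison against the 5 GB `REQUIRED_SPACE_BYTES` threshold. The standalone sketch below isolates that check; `has_required_space` is a hypothetical helper name used only for illustration, and `cleanup_old_models()` is defined elsewhere in app.py, so it is only referenced in a comment.

```python
import shutil

REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB, same threshold the commit introduces

def has_required_space(path: str = ".", required: int = REQUIRED_SPACE_BYTES) -> bool:
    """Return True if the filesystem holding `path` has at least `required` bytes free."""
    return shutil.disk_usage(path).free >= required

# Mirrors validate_or_download_model(): only clean up when space is actually short.
if not has_required_space():
    # cleanup_old_models() lives elsewhere in app.py and is not reproduced here.
    print("Insufficient storage: old models would be cleaned up before downloading.")
```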
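
For the streaming display, the commit keeps the raw model output in `full_response` but strips `<think>...</think>` spans from what the user sees and collects them for the reasoning expander. The following is a minimal sketch of that regex split, runnable outside Streamlit; the sample string is invented for illustration.

```python
import re

# Invented sample; in app.py, full_response is accumulated from llm.create_chat_completion(stream=True).
full_response = (
    "<think>The user wants a one-line summary; keep it short.</think>"
    "llama.cpp runs GGUF models locally with modest RAM."
)

# Same regexes as in the diff: hide the think spans from the visible reply...
visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
# ...and collect them separately for the "🧠 Model's Internal Reasoning" expander.
thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)

print(visible_response.strip())  # llama.cpp runs GGUF models locally with modest RAM.
print(thinking)                  # ['The user wants a one-line summary; keep it short.']
```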