Spaces: Running on Zero
bugfix
app.py CHANGED
@@ -6,6 +6,10 @@ import gc
 import shutil
 import re
 
+# Set a threshold for required free storage (in bytes) before downloading a new model.
+# Adjust this value according to the expected size of your models.
+REQUIRED_SPACE_BYTES = 5 * 1024 ** 3  # 5 GB
+
 # Available models
 MODELS = {
     "Qwen2.5-7B-Instruct (Q2_K)": {
@@ -68,8 +72,8 @@ with st.sidebar:
     if st.button("📦 Show Disk Usage"):
         try:
             usage = shutil.disk_usage(".")
-            used = usage.used / (1024**3)
-            free = usage.free / (1024**3)
+            used = usage.used / (1024 ** 3)
+            free = usage.free / (1024 ** 3)
             st.info(f"Disk Used: {used:.2f} GB | Free: {free:.2f} GB")
         except Exception as e:
             st.error(f"Disk usage error: {e}")
@@ -78,11 +82,15 @@ with st.sidebar:
 selected_model = MODELS[selected_model_name]
 model_path = os.path.join("models", selected_model["filename"])
 
-#
+# Initialize session state variables if not present
 if "model_name" not in st.session_state:
     st.session_state.model_name = None
 if "llm" not in st.session_state:
     st.session_state.llm = None
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+if "pending_response" not in st.session_state:
+    st.session_state.pending_response = False
 
 # Ensure model directory exists
 os.makedirs("models", exist_ok=True)
@@ -107,13 +115,28 @@ def download_model():
 
 def try_load_model(path):
     try:
-        return Llama(
+        return Llama(
+            model_path=path,
+            n_ctx=1024,
+            n_threads=2,
+            n_threads_batch=2,
+            n_batch=4,
+            n_gpu_layers=0,
+            use_mlock=False,
+            use_mmap=True,
+            verbose=False,
+        )
     except Exception as e:
         return str(e)
 
 def validate_or_download_model():
+    # Download model if it doesn't exist locally.
     if not os.path.exists(model_path):
-
+        # Check free space and cleanup old models only if free space is insufficient.
+        free_space = shutil.disk_usage(".").free
+        if free_space < REQUIRED_SPACE_BYTES:
+            st.info("Insufficient storage detected. Cleaning up old models to free up space.")
+            cleanup_old_models()
         download_model()
 
     result = try_load_model(model_path)
@@ -121,9 +144,13 @@ def validate_or_download_model():
         st.warning(f"Initial load failed: {result}\nAttempting re-download...")
         try:
             os.remove(model_path)
-        except:
+        except Exception:
             pass
-
+        # Check storage again before re-downloading.
+        free_space = shutil.disk_usage(".").free
+        if free_space < REQUIRED_SPACE_BYTES:
+            st.info("Insufficient storage detected on re-download attempt. Cleaning up old models to free up space.")
+            cleanup_old_models()
         download_model()
         result = try_load_model(model_path)
         if isinstance(result, str):
@@ -142,29 +169,46 @@ if st.session_state.model_name != selected_model_name:
 
 llm = st.session_state.llm
 
-#
-if "chat_history" not in st.session_state:
-    st.session_state.chat_history = []
-
+# Display title and caption
 st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
 st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
 
+# Render the full chat history
+for chat in st.session_state.chat_history:
+    with st.chat_message(chat["role"]):
+        st.markdown(chat["content"])
+        # For assistant messages, if there's internal reasoning, display it behind an expander
+        if chat.get("role") == "assistant" and chat.get("thinking"):
+            with st.expander("🧠 Model's Internal Reasoning"):
+                for t in chat["thinking"]:
+                    st.markdown(t.strip())
+
+# Chat input widget
 user_input = st.chat_input("Ask something...")
 
 if user_input:
-    if
-
+    # Block new input if a response is still pending
+    if st.session_state.pending_response:
+        st.warning("Please wait for the assistant to finish responding.")
     else:
+        # Append and render the user's message
         st.session_state.chat_history.append({"role": "user", "content": user_input})
-
         with st.chat_message("user"):
             st.markdown(user_input)
 
+        # Mark that we are waiting for a response
+        st.session_state.pending_response = True
+
         MAX_TURNS = 8
-
+        # Use the latest MAX_TURNS * 2 messages (system prompt plus conversation)
+        trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
         messages = [{"role": "system", "content": system_prompt}] + trimmed_history
 
+        # Create a container for the assistant's streaming message with two placeholders:
+        # one for visible output and one for the think part.
         with st.chat_message("assistant"):
+            visible_placeholder = st.empty()
+            thinking_placeholder = st.empty()
            full_response = ""
            stream = llm.create_chat_completion(
                messages=messages,
@@ -175,19 +219,34 @@ if user_input:
                 repeat_penalty=repeat_penalty,
                 stream=True,
             )
-
+            # Stream and update the assistant's message in real time
             for chunk in stream:
                 if "choices" in chunk:
                     delta = chunk["choices"][0]["delta"].get("content", "")
                     full_response += delta
-
+                    # Update visible response by filtering out think parts
+                    visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
+                    visible_placeholder.markdown(visible_response)
+                    # Extract and pretty format internal reasoning (if any) while streaming
+                    thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+                    if thinking:
+                        thinking_display = "\n\n".join(f"- {t.strip()}" for t in thinking)
+                        thinking_placeholder.markdown(f"**Internal Reasoning (in progress):**\n\n{thinking_display}")
+                    else:
+                        thinking_placeholder.empty()
+            # After streaming completes, process the final full response:
             visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
-            st.markdown(visible_response)
-
-            st.session_state.chat_history.append({"role": "assistant", "content": full_response})
-
             thinking = re.findall(r"<think>(.*?)</think>", full_response, flags=re.DOTALL)
+            st.session_state.chat_history.append({
+                "role": "assistant",
+                "content": visible_response,
+                "thinking": thinking
+            })
+            # Display the final internal reasoning behind an expander if available
             if thinking:
                 with st.expander("🧠 Model's Internal Reasoning"):
                     for t in thinking:
                         st.markdown(t.strip())
+
+        # Clear the pending flag once done
+        st.session_state.pending_response = False
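
Note: the new validate_or_download_model() path calls cleanup_old_models(), which is not shown in this diff and is presumably defined elsewhere in app.py. For readers following along, here is a minimal sketch of what such a helper might look like, assuming every downloaded GGUF file lives under models/ and that anything other than the currently selected model_path may be deleted; the name, signature, and behavior below are assumptions, not the Space's actual implementation.

# Hypothetical sketch of cleanup_old_models() -- assumed helper, not the Space's actual code.
# Assumes all downloaded models are plain files under "models/" and that any file other
# than the currently selected model_path may be removed to reclaim disk space.
import os
from typing import Optional

def cleanup_old_models(models_dir: str = "models", keep_path: Optional[str] = None) -> None:
    if not os.path.isdir(models_dir):
        return
    for name in os.listdir(models_dir):
        path = os.path.join(models_dir, name)
        # Never delete the model we are about to (re)load.
        if keep_path and os.path.abspath(path) == os.path.abspath(keep_path):
            continue
        if os.path.isfile(path):
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup; skip files that cannot be removed.
                pass

In the diff the helper is called with no arguments, right before download_model() fetches a fresh copy of the selected model, so a default of clearing everything under models/ would be consistent with those call sites.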