Luigi committed on
Commit 248f5a7 · 1 Parent(s): a7fdfe6

Code simplification

Files changed (1)
  1. app.py +97 -115
app.py CHANGED
@@ -1,61 +1,49 @@
  import streamlit as st

  from llama_cpp import Llama
  from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
  from huggingface_hub import hf_hub_download
- import os, gc, shutil, re
- from itertools import islice
- from duckduckgo_search import DDGS # Latest class-based interface

- # ----- Custom CSS for pretty formatting of internal reasoning -----
- CUSTOM_CSS = """
- <style>
- /* Styles for the internal reasoning bullet list */
- ul.think-list {
-     margin: 0.5em 0 1em 1.5em;
-     padding: 0;
-     list-style-type: disc;
- }
- ul.think-list li {
-     margin-bottom: 0.5em;
- }

- /* Container style for the "in progress" internal reasoning */
- .chat-assistant {
-     background-color: #f9f9f9;
-     padding: 1em;
-     border-radius: 5px;
-     margin-bottom: 1em;
- }
  </style>
- """
- st.markdown(CUSTOM_CSS, unsafe_allow_html=True)

- # ----- Set a threshold for required free storage (in bytes) -----
  REQUIRED_SPACE_BYTES = 5 * 1024 ** 3 # 5 GB

- # ----- Function to perform DuckDuckGo search and retrieve concise context -----
  def retrieve_context(query, max_results=2, max_chars_per_result=150):
-     """
-     Query DuckDuckGo for the given search query and return a concatenated context string.
-     Uses the DDGS().text() generator (with region, safesearch, and timelimit parameters)
-     and limits the results using islice. Each result's title and snippet are combined into context.
-     """
      try:
          with DDGS() as ddgs:
-             results_gen = ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y")
-             results = list(islice(results_gen, max_results))
              context = ""
-             if results:
-                 for i, result in enumerate(results, start=1):
-                     title = result.get("title", "No Title")
-                     snippet = result.get("body", "")[:max_chars_per_result]
-                     context += f"Result {i}:\nTitle: {title}\nSnippet: {snippet}\n\n"
              return context.strip()
      except Exception as e:
          st.error(f"Error during retrieval: {e}")
          return ""

- # ----- Available models -----
  MODELS = {
      "Qwen2.5-0.5B-Instruct (Q4_K_M)": {
          "repo_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
@@ -109,40 +97,26 @@ with st.sidebar:
      st.header("⚙️ Settings")
      selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
      system_prompt_base = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
-     max_tokens = st.slider("Max tokens", 64, 1024, 256, step=32) # Adjust for lower memory usage
      temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
      top_k = st.slider("Top-K", 1, 100, 40)
      top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
      repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)
-
-     # Checkbox to enable the DuckDuckGo search feature (disabled by default)
      enable_search = st.checkbox("Enable Web Search", value=False)
-
-     if st.button("📦 Show Disk Usage"):
-         try:
-             usage = shutil.disk_usage(".")
-             used = usage.used / (1024 ** 3)
-             free = usage.free / (1024 ** 3)
-             st.info(f"Disk Used: {used:.2f} GB | Free: {free:.2f} GB")
-         except Exception as e:
-             st.error(f"Disk usage error: {e}")

- # ----- Define selected model and path -----
  selected_model = MODELS[selected_model_name]
  model_path = os.path.join("models", selected_model["filename"])
-
- # Ensure model directory exists
  os.makedirs("models", exist_ok=True)

- # ----- Helper functions for model management -----
  def try_load_model(path):
      try:
          return Llama(
              model_path=path,
-             n_ctx=512, # Reduced context window to save memory
-             n_threads=2, # Fewer threads for resource-constrained environments
              n_threads_batch=1,
-             n_batch=64, # Lower batch size to conserve memory
              n_gpu_layers=0,
              use_mlock=False,
              use_mmap=True,
@@ -164,13 +138,12 @@ def download_model():

  def validate_or_download_model():
      if not os.path.exists(model_path):
-         free_space = shutil.disk_usage(".").free
-         if free_space < REQUIRED_SPACE_BYTES:
              st.info("Insufficient storage. Consider cleaning up old models.")
          download_model()
      result = try_load_model(model_path)
      if isinstance(result, str):
-         st.warning(f"Initial load failed: {result}\nAttempting re-download...")
          try:
              os.remove(model_path)
          except Exception:
@@ -180,20 +153,8 @@ def validate_or_download_model():
      if isinstance(result, str):
          st.error(f"Model still failed after re-download: {result}")
          st.stop()
-         return result
      return result

- # ----- Session state initialization -----
- if "model_name" not in st.session_state:
-     st.session_state.model_name = None
- if "llm" not in st.session_state:
-     st.session_state.llm = None
- if "chat_history" not in st.session_state:
-     st.session_state.chat_history = []
- if "pending_response" not in st.session_state:
-     st.session_state.pending_response = False
-
- # ----- Load model if changed -----
  if st.session_state.model_name != selected_model_name:
      if st.session_state.llm is not None:
          del st.session_state.llm
@@ -203,40 +164,32 @@ if st.session_state.model_name != selected_model_name:

  llm = st.session_state.llm

- # ----- Display title and caption -----
  st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
  st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")

- # Render existing chat history
  for chat in st.session_state.chat_history:
      with st.chat_message(chat["role"]):
          st.markdown(chat["content"])

- # ----- Chat input and integrated RAG with memory optimizations -----
  user_input = st.chat_input("Ask something...")
-
  if user_input:
      if st.session_state.pending_response:
          st.warning("Please wait for the assistant to finish responding.")
      else:
-         # Display the raw user input immediately in the chat view.
          with st.chat_message("user"):
              st.markdown(user_input)
-
-         # Append the plain user message to chat history for display purposes.
-         # (We will later override the last user message in the API call with the augmented version.)
          st.session_state.chat_history.append({"role": "user", "content": user_input})
          st.session_state.pending_response = True

-         # Retrieve extra context from web search if enabled
-         if enable_search:
-             retrieved_context = retrieve_context(user_input, max_results=2, max_chars_per_result=150)
-         else:
-             retrieved_context = ""
          st.sidebar.markdown("### Retrieved Context" if enable_search else "Web Search Disabled")
          st.sidebar.text(retrieved_context or "No context found.")

-         # Build an augmented user query by merging the system prompt (and search context when available)
          if enable_search and retrieved_context:
              augmented_user_input = (
                  f"{system_prompt_base.strip()}\n\n"
@@ -247,39 +200,68 @@ if user_input:
          else:
              augmented_user_input = f"{system_prompt_base.strip()}\n\nUser Query: {user_input}"

-         # Limit conversation history to the last MAX_TURNS turns (user/assistant pairs)
          MAX_TURNS = 2
          trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
-
-         # Replace the last user message (which is plain) with the augmented version for model input.
          if trimmed_history and trimmed_history[-1]["role"] == "user":
              messages = trimmed_history[:-1] + [{"role": "user", "content": augmented_user_input}]
          else:
              messages = trimmed_history + [{"role": "user", "content": augmented_user_input}]

-         # Generate response with the LLM in a streaming fashion
-         with st.chat_message("assistant"):
-             visible_placeholder = st.empty()
-             full_response = ""
-             stream = llm.create_chat_completion(
-                 messages=messages,
-                 max_tokens=max_tokens,
-                 temperature=temperature,
-                 top_k=top_k,
-                 top_p=top_p,
-                 repeat_penalty=repeat_penalty,
-                 stream=True,
-             )
-             for chunk in stream:
-                 if "choices" in chunk:
-                     delta = chunk["choices"][0]["delta"].get("content", "")
-                     full_response += delta
-                     # Clean internal reasoning markers before display
-                     visible_response = re.sub(r"<think>.*?</think>", "", full_response, flags=re.DOTALL)
-                     visible_response = re.sub(r"<think>.*$", "", visible_response, flags=re.DOTALL)
-                     visible_placeholder.markdown(visible_response)
-
-         # Append the assistant's response to conversation history.
-         st.session_state.chat_history.append({"role": "assistant", "content": full_response})
          st.session_state.pending_response = False
-         gc.collect() # Free memory
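
Aside: both the old and new versions share the same retrieval helper, which wraps duckduckgo_search's DDGS.text() and folds each hit's title and truncated snippet into one context string. Below is a minimal standalone sketch of that lookup outside Streamlit (the st.error handling is omitted since there is no Streamlit session); the __main__ query is purely illustrative and not part of the commit.

from itertools import islice

from duckduckgo_search import DDGS


def retrieve_context(query, max_results=2, max_chars_per_result=150):
    # Mirror of the app's helper: take the first few results and build a
    # "Result N / Title / Snippet" block for each one.
    with DDGS() as ddgs:
        results = list(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))
    context = ""
    for i, result in enumerate(results, start=1):
        title = result.get("title", "No Title")
        snippet = result.get("body", "")[:max_chars_per_result]
        context += f"Result {i}:\nTitle: {title}\nSnippet: {snippet}\n\n"
    return context.strip()


if __name__ == "__main__":
    print(retrieve_context("llama.cpp GGUF quantization"))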
 
  import streamlit as st
+ import os, gc, shutil, re, time, threading, queue
+ from itertools import islice
  from llama_cpp import Llama
  from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
  from huggingface_hub import hf_hub_download
+ from duckduckgo_search import DDGS

+ # ---- Initialize session state ----
+ if "chat_history" not in st.session_state:
+     st.session_state.chat_history = []
+ if "pending_response" not in st.session_state:
+     st.session_state.pending_response = False
+ if "model_name" not in st.session_state:
+     st.session_state.model_name = None
+ if "llm" not in st.session_state:
+     st.session_state.llm = None

+ # ---- Custom CSS ----
+ st.markdown("""
+ <style>
+ ul.think-list { margin: 0.5em 0 1em 1.5em; padding: 0; list-style-type: disc; }
+ ul.think-list li { margin-bottom: 0.5em; }
+ .chat-assistant { background-color: #f9f9f9; padding: 1em; border-radius: 5px; margin-bottom: 1em; }
  </style>
+ """, unsafe_allow_html=True)

+ # ---- Required storage space ----
  REQUIRED_SPACE_BYTES = 5 * 1024 ** 3 # 5 GB

+ # ---- Function to retrieve web search context ----
  def retrieve_context(query, max_results=2, max_chars_per_result=150):
      try:
          with DDGS() as ddgs:
+             results = list(islice(ddgs.text(query, region="wt-wt", safesearch="off", timelimit="y"), max_results))
              context = ""
+             for i, result in enumerate(results, start=1):
+                 title = result.get("title", "No Title")
+                 snippet = result.get("body", "")[:max_chars_per_result]
+                 context += f"Result {i}:\nTitle: {title}\nSnippet: {snippet}\n\n"
              return context.strip()
      except Exception as e:
          st.error(f"Error during retrieval: {e}")
          return ""

+ # ---- Model definitions ----
  MODELS = {
      "Qwen2.5-0.5B-Instruct (Q4_K_M)": {
          "repo_id": "Qwen/Qwen2.5-0.5B-Instruct-GGUF",

      st.header("⚙️ Settings")
      selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
      system_prompt_base = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
+     max_tokens = st.slider("Max tokens", 64, 1024, 256, step=32)
      temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
      top_k = st.slider("Top-K", 1, 100, 40)
      top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
      repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)
      enable_search = st.checkbox("Enable Web Search", value=False)

+ # ---- Define selected model and manage its download/load ----
  selected_model = MODELS[selected_model_name]
  model_path = os.path.join("models", selected_model["filename"])
  os.makedirs("models", exist_ok=True)

  def try_load_model(path):
      try:
          return Llama(
              model_path=path,
+             n_ctx=512, # Reduced context window
+             n_threads=2,
              n_threads_batch=1,
+             n_batch=64,
              n_gpu_layers=0,
              use_mlock=False,
              use_mmap=True,

  def validate_or_download_model():
      if not os.path.exists(model_path):
+         if shutil.disk_usage(".").free < REQUIRED_SPACE_BYTES:
              st.info("Insufficient storage. Consider cleaning up old models.")
          download_model()
      result = try_load_model(model_path)
      if isinstance(result, str):
+         st.warning(f"Initial load failed: {result}\nRe-downloading...")
          try:
              os.remove(model_path)
          except Exception:

      if isinstance(result, str):
          st.error(f"Model still failed after re-download: {result}")
          st.stop()
      return result

  if st.session_state.model_name != selected_model_name:
      if st.session_state.llm is not None:
          del st.session_state.llm

  llm = st.session_state.llm

+ # ---- Display title and existing chat history ----
  st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
  st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")

  for chat in st.session_state.chat_history:
      with st.chat_message(chat["role"]):
          st.markdown(chat["content"])

+ # ---- Chat input and processing ----
  user_input = st.chat_input("Ask something...")
  if user_input:
      if st.session_state.pending_response:
          st.warning("Please wait for the assistant to finish responding.")
      else:
+         # Display user input and update chat history
          with st.chat_message("user"):
              st.markdown(user_input)
          st.session_state.chat_history.append({"role": "user", "content": user_input})
          st.session_state.pending_response = True

+         # Optionally retrieve extra context
+         retrieved_context = retrieve_context(user_input, max_results=2, max_chars_per_result=150) if enable_search else ""
          st.sidebar.markdown("### Retrieved Context" if enable_search else "Web Search Disabled")
          st.sidebar.text(retrieved_context or "No context found.")

+         # Build augmented query
          if enable_search and retrieved_context:
              augmented_user_input = (
                  f"{system_prompt_base.strip()}\n\n"

          else:
              augmented_user_input = f"{system_prompt_base.strip()}\n\nUser Query: {user_input}"

+         # Limit conversation history (last 2 pairs)
          MAX_TURNS = 2
          trimmed_history = st.session_state.chat_history[-(MAX_TURNS * 2):]
          if trimmed_history and trimmed_history[-1]["role"] == "user":
              messages = trimmed_history[:-1] + [{"role": "user", "content": augmented_user_input}]
          else:
              messages = trimmed_history + [{"role": "user", "content": augmented_user_input}]

+         # ---- Set up a placeholder for the response and queue for streaming tokens ----
+         visible_placeholder = st.empty()
+         response_queue = queue.Queue()
+
+         # Function to stream LLM response and push incremental updates into the queue
+         def stream_response(msgs, max_tokens, temp, topk, topp, repeat_penalty):
+             final_text = ""
+             try:
+                 stream = llm.create_chat_completion(
+                     messages=msgs,
+                     max_tokens=max_tokens,
+                     temperature=temp,
+                     top_k=topk,
+                     top_p=topp,
+                     repeat_penalty=repeat_penalty,
+                     stream=True,
+                 )
+                 for chunk in stream:
+                     if "choices" in chunk:
+                         delta = chunk["choices"][0]["delta"].get("content", "")
+                         final_text += delta
+                         response_queue.put(delta)
+                         if chunk["choices"][0].get("finish_reason", ""):
+                             break
+             except Exception as e:
+                 response_queue.put(f"\nError: {e}")
+             response_queue.put(None) # Signal completion
+
+         # Start streaming in a separate thread
+         stream_thread = threading.Thread(
+             target=stream_response,
+             args=(messages, max_tokens, temperature, top_k, top_p, repeat_penalty),
+             daemon=True
+         )
+         stream_thread.start()
+
+         # Poll the queue in the main thread until the stream finishes or the timeout is reached
+         final_response = ""
+         timeout = 120 # seconds
+         start_time = time.time()
+         while True:
+             try:
+                 update = response_queue.get(timeout=0.1)
+                 if update is None:
+                     break
+                 final_response += update
+                 visible_response = re.sub(r"<think>.*?</think>", "", final_response, flags=re.DOTALL)
+                 visible_response = re.sub(r"<think>.*$", "", visible_response, flags=re.DOTALL)
+                 visible_placeholder.markdown(visible_response)
+             except queue.Empty:
+                 if time.time() - start_time > timeout:
+                     st.error("Response generation timed out.")
+                     break
+
+         st.session_state.chat_history.append({"role": "assistant", "content": final_response})
          st.session_state.pending_response = False
+         gc.collect()
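
Aside: the rewritten response path is a standard producer/consumer arrangement: a daemon thread iterates the streamed llm.create_chat_completion(..., stream=True) output and pushes each text delta into a queue.Queue, while the main Streamlit script drains the queue and re-renders the partial answer until it sees the None sentinel or hits the timeout. Below is a minimal, self-contained sketch of that pattern using only the standard library; fake_token_stream stands in for the llama.cpp call, and every name in it is illustrative rather than part of the commit.

import queue
import threading
import time


def fake_token_stream():
    # Stand-in for llm.create_chat_completion(..., stream=True): yields text deltas.
    for token in ["Hello", ",", " ", "world", "!"]:
        time.sleep(0.05)
        yield token


def producer(q):
    # Worker thread: forward each delta, then a None sentinel to signal completion.
    try:
        for delta in fake_token_stream():
            q.put(delta)
    except Exception as exc:
        q.put(f"\nError: {exc}")
    q.put(None)


def consume(q, timeout_s=5.0):
    # Main thread: drain the queue until the sentinel arrives or the timeout is hit.
    text = ""
    start = time.time()
    while True:
        try:
            update = q.get(timeout=0.1)
        except queue.Empty:
            if time.time() - start > timeout_s:
                break  # give up; the daemon worker is simply abandoned
            continue
        if update is None:
            break
        text += update
        # In the app, the partial text would be re-rendered here (placeholder.markdown).
    return text


if __name__ == "__main__":
    q = queue.Queue()
    threading.Thread(target=producer, args=(q,), daemon=True).start()
    print(consume(q))

The None sentinel is what lets the consumer tell "stream finished" apart from "no token yet", which is why the app's polling loop checks update is None before appending to the visible response.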