Spaces:

Luigi
/

ZeroGPU-LLM-Inference

Running on Zero

App Files Files Community

Luigi committed on Apr 9

Commit

0ff6c39

1 Parent(s): ef1afaf

Add app.py & requirements.txt

Browse files

Files changed (2) hide show

app.py +77 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import streamlit as st
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+# Where the quantized GGUF weights live on the Hugging Face Hub and on disk.
+MODEL_REPO_ID = "TheBloke/Qwen2.5-3B-Instruct-GGUF"
+MODEL_FILENAME = "Qwen2.5-3B-Instruct.Q4_K_M.gguf"
+MODEL_DIR = "./models"
+# Load the model (on first run)
+@st.cache_resource
+def load_model():
+    """Fetch the GGUF weights if necessary and return the loaded Llama model.
+
+    The result is cached by ``st.cache_resource``, so the download check and
+    the model load run once rather than on every script rerun.
+
+    Returns:
+        A ``Llama`` instance with a 2048-token context and no layers
+        offloaded to the GPU (``n_gpu_layers=0``).
+    """
+    # Streamlit re-executes the whole script on every interaction; keeping the
+    # Hub lookup inside the cached function avoids repeating it each time.
+    # Using the returned path also removes the second hard-coded copy of the
+    # file location.
+    model_path = hf_hub_download(
+        repo_id=MODEL_REPO_ID,
+        filename=MODEL_FILENAME,
+        local_dir=MODEL_DIR,
+    )
+    return Llama(
+        model_path=model_path,
+        n_ctx=2048,
+        n_threads=6,
+        n_batch=8,
+        n_gpu_layers=0,
+        use_mlock=False,
+        use_mmap=True,
+        verbose=False,
+    )
+# Obtain the model handle returned by the cached loader.
+llm = load_model()
+# Start with an empty conversation if this session has none yet.
+if "chat_history" not in st.session_state:
+    st.session_state["chat_history"] = []
+# Page header.
+st.title("🧠 Qwen2.5-3B-Instruct (Streamlit + GGUF)")
+st.caption("Powered by `llama.cpp` and `llama-cpp-python` | 4-bit Q4_K_M inference")
+# Generation settings, placed in the sidebar.
+st.sidebar.header("⚙️ Settings")
+system_prompt = st.sidebar.text_area(
+    "System Prompt",
+    value="You are a helpful assistant.",
+    height=80,
+)
+max_tokens = st.sidebar.slider("Max tokens", 64, 2048, 512, step=32)
+temperature = st.sidebar.slider("Temperature", 0.1, 2.0, 0.7)
+top_k = st.sidebar.slider("Top-K", 1, 100, 40)
+top_p = st.sidebar.slider("Top-P", 0.1, 1.0, 0.95)
+repeat_penalty = st.sidebar.slider("Repetition Penalty", 1.0, 2.0, 1.1)
+# Streamlit reruns the script from the top on every interaction, so replay the
+# stored conversation first; otherwise earlier turns disappear from the page.
+for past_message in st.session_state.chat_history:
+    with st.chat_message(past_message["role"]):
+        st.markdown(past_message["content"])
+# Input box
+user_input = st.chat_input("Ask something...")
+if user_input:
+    # Record the new user turn (after the replay above, so it is not drawn twice).
+    st.session_state.chat_history.append({"role": "user", "content": user_input})
+    # Display user message
+    with st.chat_message("user"):
+        st.markdown(user_input)
+    # Construct the prompt: system prompt followed by the full conversation.
+    messages = [{"role": "system", "content": system_prompt}] + st.session_state.chat_history
+    # Stream response
+    with st.chat_message("assistant"):
+        full_response = ""
+        # Placeholder that is overwritten as each streamed piece arrives.
+        response_area = st.empty()
+        stream = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repeat_penalty=repeat_penalty,
+            stream=True,
+        )
+        for chunk in stream:
+            if "choices" in chunk:
+                # A delta may carry no "content" key or a None value;
+                # treat both as an empty piece.
+                delta = chunk["choices"][0]["delta"].get("content") or ""
+                full_response += delta
+                response_area.markdown(full_response)
+        # Store the completed reply so the next rerun can replay it.
+        st.session_state.chat_history.append({"role": "assistant", "content": full_response})

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+llama-cpp-python==0.2.73
+llama-cpp-agent
+huggingface_hub
+streamlit