Luigi committed
Commit cd26609 · 1 Parent(s): 56919fd

provide more models, secure memory usage

Files changed (2)
  1. README.md +25 -4
  2. app.py +69 -31
README.md CHANGED
@@ -1,6 +1,6 @@
  ---
- title: Qwen2.5 7B Instruct Llama.cpp
- emoji: 🌍
+ title: Multi-GGUF LLM Inference
+ emoji: 🧠
  colorFrom: pink
  colorTo: purple
  sdk: streamlit
@@ -8,7 +8,28 @@ sdk_version: 1.44.1
  app_file: app.py
  pinned: false
  license: apache-2.0
- short_description: Run Qwen2.5-7B on Llama.cpp
+ short_description: Run multiple GGUF models (Qwen2.5, Gemma-3, Phi-4) via llama.cpp
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ This Streamlit app lets you run **chat-based inference** on different GGUF models with `llama.cpp` and `llama-cpp-python`.
+
+ ### 🔄 Supported Models:
+ - `Qwen/Qwen2.5-7B-Instruct-GGUF` → `qwen2.5-7b-instruct-q2_k.gguf`
+ - `unsloth/gemma-3-4b-it-GGUF` → `gemma-3-4b-it-Q5_K_M.gguf`
+ - `unsloth/Phi-4-mini-instruct-GGUF` → `Phi-4-mini-instruct-Q5_K_M.gguf`
+
+ ### ⚙️ Features:
+ - Model selection in sidebar
+ - Custom system prompt and generation parameters
+ - Chat-style UI with streaming responses
+
+ ### 🧠 Memory-Safe Design (for HuggingFace Spaces):
+ - Only **one model is loaded at a time** (no persistent memory bloat)
+ - Uses **manual unloading and `gc.collect()`** to free memory when switching
+ - Reduces `n_ctx` context length to stay under 16 GB RAM limit
+ - Automatically downloads models only when needed
+ - Trims history to the **last 8 user-assistant turns** to avoid context overflow
+
+ Perfect for deploying multi-GGUF chat models on **free-tier HuggingFace Spaces**!
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
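
The memory-safe design listed in the README comes down to one pattern: keep a single `Llama` handle in `st.session_state`, and drop it plus run `gc.collect()` before a different model is loaded. A condensed sketch of that pattern is below; the `load_gguf` helper is an illustrative name, not code from this commit.

```python
import gc

import streamlit as st
from llama_cpp import Llama


def load_gguf(model_path: str, model_name: str) -> Llama:
    """Keep at most one GGUF model resident; swap it out when the selection changes."""
    if st.session_state.get("model_name") != model_name:
        old = st.session_state.pop("llm", None)
        if old is not None:
            del old       # drop the last reference to the previous llama.cpp weights
            gc.collect()  # reclaim the memory before allocating the next model
        st.session_state.llm = Llama(
            model_path=model_path,
            n_ctx=1024,   # small context window keeps the Space under its RAM limit
        )
        st.session_state.model_name = model_name
    return st.session_state.llm
```

In the commit itself this logic is written inline rather than wrapped in a helper, as the app.py diff below shows.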
app.py CHANGED
@@ -1,19 +1,66 @@
  import streamlit as st
  from llama_cpp import Llama
  from huggingface_hub import hf_hub_download
+ import os
+ import gc

- hf_hub_download(
-     repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
-     filename="qwen2.5-7b-instruct-q2_k.gguf",
-     local_dir="./models",
- )
-
- # Load the model (on first run)
- @st.cache_resource
- def load_model():
-     return Llama(
-         model_path="models/qwen2.5-7b-instruct-q2_k.gguf",
-         n_ctx=2048,
+ # Available models
+ MODELS = {
+     "Qwen2.5-7B-Instruct (Q2_K)": {
+         "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
+         "filename": "qwen2.5-7b-instruct-q2_k.gguf",
+         "description": "Qwen2.5-7B Instruct (Q2_K)"
+     },
+     "Gemma-3-4B-IT (Q5_K_M)": {
+         "repo_id": "unsloth/gemma-3-4b-it-GGUF",
+         "filename": "gemma-3-4b-it-Q5_K_M.gguf",
+         "description": "Gemma 3 4B IT (Q5_K_M)"
+     },
+     "Phi-4-mini-Instruct (Q5_K_M)": {
+         "repo_id": "unsloth/Phi-4-mini-instruct-GGUF",
+         "filename": "Phi-4-mini-instruct-Q5_K_M.gguf",
+         "description": "Phi-4 Mini Instruct (Q5_K_M)"
+     },
+ }
+
+ with st.sidebar:
+     st.header("⚙️ Settings")
+     selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
+     system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
+     max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
+     temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
+     top_k = st.slider("Top-K", 1, 100, 40)
+     top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
+     repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)
+
+ # Model info
+ selected_model = MODELS[selected_model_name]
+ model_path = os.path.join("models", selected_model["filename"])
+
+ # Initialize model cache state
+ if "model_name" not in st.session_state:
+     st.session_state.model_name = None
+ if "llm" not in st.session_state:
+     st.session_state.llm = None
+
+ # Download model if needed
+ if not os.path.exists(model_path):
+     hf_hub_download(
+         repo_id=selected_model["repo_id"],
+         filename=selected_model["filename"],
+         local_dir="./models",
+         local_dir_use_symlinks=False,
+     )
+
+ # Load model only if it changed
+ if st.session_state.model_name != selected_model_name:
+     if st.session_state.llm is not None:
+         # Clean up old model to free memory
+         del st.session_state.llm
+         gc.collect()
+     st.session_state.llm = Llama(
+         model_path=model_path,
+         n_ctx=1024,  # Reduced for RAM safety
          n_threads=2,
          n_threads_batch=2,
          n_batch=4,
@@ -22,43 +69,34 @@ def load_model():
          use_mmap=True,
          verbose=False,
      )
+     st.session_state.model_name = selected_model_name

- llm = load_model()
+ llm = st.session_state.llm

- # Session state for chat history
+ # Chat history state
  if "chat_history" not in st.session_state:
      st.session_state.chat_history = []

- st.title("🧠 Qwen2.5-7B-Instruct (Streamlit + GGUF)")
- st.caption("Powered by `llama.cpp` and `llama-cpp-python` | 2-bit Q2_K inference")
-
- with st.sidebar:
-     st.header("⚙️ Settings")
-     system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
-     max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
-     temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
-     top_k = st.slider("Top-K", 1, 100, 40)
-     top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
-     repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)
+ st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
+ st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")

- # Input box
  user_input = st.chat_input("Ask something...")

  if user_input:
-     # Add user message to chat
      st.session_state.chat_history.append({"role": "user", "content": user_input})

-     # Display user message
      with st.chat_message("user"):
          st.markdown(user_input)

-     # Construct the prompt
-     messages = [{"role": "system", "content": system_prompt}] + st.session_state.chat_history
+     # Trim conversation history to max 8 turns (user+assistant)
+     MAX_TURNS = 8
+     trimmed_history = st.session_state.chat_history[-MAX_TURNS * 2:]
+     messages = [{"role": "system", "content": system_prompt}] + trimmed_history

-     # Stream response
      with st.chat_message("assistant"):
          full_response = ""
          response_area = st.empty()
+
          stream = llm.create_chat_completion(
              messages=messages,
              max_tokens=max_tokens,
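
The diff view is truncated at the `create_chat_completion(...)` call. Purely for context (this is not the remainder of the committed file), a streaming chat completion from `llama-cpp-python` is typically consumed as sketched below, assuming the call passes `stream=True` along with the sampling sliders defined above.

```python
# Illustrative sketch only; assumes the variables defined earlier in app.py
# (llm, messages, max_tokens, temperature, top_k, top_p, repeat_penalty,
#  response_area, full_response).
stream = llm.create_chat_completion(
    messages=messages,
    max_tokens=max_tokens,
    temperature=temperature,
    top_k=top_k,
    top_p=top_p,
    repeat_penalty=repeat_penalty,
    stream=True,
)
for chunk in stream:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:
        full_response += delta["content"]      # accumulate streamed tokens
        response_area.markdown(full_response)  # live-update the chat bubble

st.session_state.chat_history.append({"role": "assistant", "content": full_response})
```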