Spaces: Running on Zero
File size: 4,156 Bytes
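# Streamlit chat app: serves small GGUF models on CPU via llama-cpp-python,
# downloading the selected model from the Hugging Face Hub on demand.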
import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import gc
import shutil
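# llama_cpp (llama-cpp-python) runs GGUF models on CPU; hf_hub_download fetches
# model files from the Hugging Face Hub; gc is used to free a model being swapped out.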
# Available models
MODELS = {
    "Qwen2.5-7B-Instruct (Q2_K)": {
        "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
        "filename": "qwen2.5-7b-instruct-q2_k.gguf",
        "description": "Qwen2.5-7B Instruct (Q2_K)"
    },
    "Gemma-3-4B-IT (Q5_K_M)": {
        "repo_id": "unsloth/gemma-3-4b-it-GGUF",
        "filename": "gemma-3-4b-it-Q5_K_M.gguf",
        "description": "Gemma 3 4B IT (Q5_K_M)"
    },
    "Phi-4-mini-Instruct (Q4_K_M)": {
        "repo_id": "unsloth/Phi-4-mini-instruct-GGUF",
        "filename": "Phi-4-mini-instruct-Q4_K_M.gguf",
        "description": "Phi-4 Mini Instruct (Q4_K_M)"
    },
}
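# Each entry maps a display name to the Hub repo and GGUF file to download;
# the quantized builds keep memory low enough for CPU-only inference.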
with st.sidebar:
    st.header("⚙️ Settings")
    selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
    system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
    max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
    temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
    top_k = st.slider("Top-K", 1, 100, 40)
    top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
    repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)
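# Streamlit reruns this script top-to-bottom on every interaction, so the sidebar
# values above are re-read for each message and affect the next generation.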
# Model info
selected_model = MODELS[selected_model_name]
model_path = os.path.join("models", selected_model["filename"])
# Make sure models dir exists
os.makedirs("models", exist_ok=True)
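# model_path points into the local models/ directory; the block below downloads
# the file there if it is missing.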
# Clear old models if new one isn't present
if not os.path.exists(model_path):
    for file in os.listdir("models"):
        if file.endswith(".gguf"):
            try:
                os.remove(os.path.join("models", file))
            except Exception as e:
                st.warning(f"Failed to delete {file}: {e}")
    # Download the selected model
    with st.spinner(f"Downloading {selected_model['filename']}..."):
        hf_hub_download(
            repo_id=selected_model["repo_id"],
            filename=selected_model["filename"],
            local_dir="./models",
            local_dir_use_symlinks=False,
        )
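# At most one .gguf is kept on disk at a time, which keeps the storage footprint
# to a single model (presumably to stay within the Space's disk limits).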
# Init state
if "model_name" not in st.session_state:
st.session_state.model_name = None
if "llm" not in st.session_state:
st.session_state.llm = None
# Load model if changed
if st.session_state.model_name != selected_model_name:
    if st.session_state.llm is not None:
        del st.session_state.llm
        gc.collect()
    st.session_state.llm = Llama(
        model_path=model_path,
        n_ctx=1024,
        n_threads=2,
        n_threads_batch=2,
        n_batch=4,
        n_gpu_layers=0,
        use_mlock=False,
        use_mmap=True,
        verbose=False,
    )
    st.session_state.model_name = selected_model_name
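# llama.cpp settings: n_gpu_layers=0 keeps inference entirely on CPU, n_ctx=1024
# bounds the combined prompt + completion length, and use_mmap=True memory-maps the
# model file instead of copying it fully into RAM. The small thread/batch counts
# suit a modest CPU allocation.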
llm = st.session_state.llm
# Chat history state
if "chat_history" not in st.session_state:
st.session_state.chat_history = []
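# Note: because Streamlit re-executes the script on every interaction, earlier
# messages are not re-rendered by the code below. A minimal sketch (not part of
# the original app) that would replay the stored history on each rerun:
#
#     for msg in st.session_state.chat_history:
#         with st.chat_message(msg["role"]):
#             st.markdown(msg["content"])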
st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
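# st.chat_input returns None until the user submits a message, so the handler
# below only runs when there is new input.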
user_input = st.chat_input("Ask something...")
if user_input:
    st.session_state.chat_history.append({"role": "user", "content": user_input})
    with st.chat_message("user"):
        st.markdown(user_input)
    # Trim conversation history to max 8 turns (user+assistant)
    MAX_TURNS = 8
    trimmed_history = st.session_state.chat_history[-MAX_TURNS * 2:]
    messages = [{"role": "system", "content": system_prompt}] + trimmed_history
    with st.chat_message("assistant"):
        full_response = ""
        response_area = st.empty()
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            stream=True,
        )
        for chunk in stream:
            if "choices" in chunk:
                delta = chunk["choices"][0]["delta"].get("content", "")
                full_response += delta
                response_area.markdown(full_response)
    st.session_state.chat_history.append({"role": "assistant", "content": full_response})
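# To try this locally (assuming streamlit, llama-cpp-python and huggingface_hub are
# installed, and this file is saved as app.py):
#
#     streamlit run app.py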