import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import gc
import shutil
# Available models
MODELS = {
    "Qwen2.5-7B-Instruct (Q2_K)": {
        "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
        "filename": "qwen2.5-7b-instruct-q2_k.gguf",
        "description": "Qwen2.5-7B Instruct (Q2_K)"
    },
    "Gemma-3-4B-IT (Q4_K_M)": {
        "repo_id": "unsloth/gemma-3-4b-it-GGUF",
        "filename": "gemma-3-4b-it-Q4_K_M.gguf",
        "description": "Gemma 3 4B IT (Q4_K_M)"
    },
    "Phi-4-mini-Instruct (Q4_K_M)": {
        "repo_id": "unsloth/Phi-4-mini-instruct-GGUF",
        "filename": "Phi-4-mini-instruct-Q4_K_M.gguf",
        "description": "Phi-4 Mini Instruct (Q4_K_M)"
    },
}
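# Note: Q2_K and Q4_K_M above are llama.cpp quantization presets; lower-bit
# quantizations mean smaller downloads and lower RAM use at the cost of output
# quality. Other llama.cpp-compatible GGUF repos on the Hub can be added to
# this registry in the same format.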
with st.sidebar:
    st.header("⚙️ Settings")
    selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
    system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
    max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
    temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
    top_k = st.slider("Top-K", 1, 100, 40)
    top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
    repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)
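# The sampling controls above (temperature, top-k, top-p, repetition penalty)
# map onto llama.cpp's sampler parameters and are forwarded unchanged to
# create_chat_completion() further below on every request.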
# Model info
selected_model = MODELS[selected_model_name]
model_path = os.path.join("models", selected_model["filename"])
# Init state
if "model_name" not in st.session_state:
st.session_state.model_name = None
if "llm" not in st.session_state:
st.session_state.llm = None
# Make sure models dir exists
os.makedirs("models", exist_ok=True)
# If the selected model file does not exist or is invalid, clean up and re-download
def validate_or_download_model():
    if not os.path.exists(model_path):
        cleanup_old_models()
        download_model()
        return
    try:
        # Dummy load with a tiny context just to confirm the file parses as a valid GGUF
        _ = Llama(model_path=model_path, n_ctx=16, n_threads=1)
    except Exception as e:
        st.warning(f"Model file was invalid or corrupt: {e}\nRedownloading...")
        try:
            # Remove the corrupt file so the fresh download can't silently reuse it
            os.remove(model_path)
        except OSError:
            pass
        cleanup_old_models()
        download_model()
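# Optional, lighter-weight check (a sketch, not wired into the flow above):
# every GGUF file starts with the 4-byte magic b"GGUF", so a header check can
# reject obviously truncated or corrupt downloads without paying for a full
# Llama() load.
def looks_like_gguf(path: str) -> bool:
    try:
        with open(path, "rb") as fh:
            return fh.read(4) == b"GGUF"
    except OSError:
        return False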
def cleanup_old_models():
    # Delete any previously downloaded GGUF files that aren't the selected one
    for f in os.listdir("models"):
        if f.endswith(".gguf") and f != selected_model["filename"]:
            try:
                os.remove(os.path.join("models", f))
            except Exception as e:
                st.warning(f"Couldn't delete old model {f}: {e}")
def download_model():
    with st.spinner(f"Downloading {selected_model['filename']}..."):
        hf_hub_download(
            repo_id=selected_model["repo_id"],
            filename=selected_model["filename"],
            local_dir="./models",
            local_dir_use_symlinks=False,  # deprecated in newer huggingface_hub releases; kept for older versions
        )
validate_or_download_model()
# Load model if changed
if st.session_state.model_name != selected_model_name:
    # Free the previously loaded model before loading a new one
    if st.session_state.llm is not None:
        del st.session_state.llm
        gc.collect()
    try:
        st.session_state.llm = Llama(
            model_path=model_path,
            n_ctx=1024,        # context window in tokens
            n_threads=2,
            n_threads_batch=2,
            n_batch=4,
            n_gpu_layers=0,    # CPU-only inference
            use_mlock=False,
            use_mmap=True,     # memory-map the GGUF file rather than reading it all into RAM
            verbose=False,
        )
    except Exception as e:
        st.error(f"Failed to load model: {e}")
        st.stop()
    st.session_state.model_name = selected_model_name
llm = st.session_state.llm
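# Alternative loading pattern (sketch only): Streamlit's st.cache_resource can
# cache one Llama instance per model path instead of managing it by hand in
# session_state:
#
#   @st.cache_resource
#   def load_llm(path: str) -> Llama:
#       return Llama(model_path=path, n_ctx=1024, n_threads=2, n_gpu_layers=0)
#
#   llm = load_llm(model_path)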
# Chat history state
if "chat_history" not in st.session_state:
st.session_state.chat_history = []
st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
# Re-render the stored conversation so earlier turns stay visible after each rerun
for msg in st.session_state.chat_history:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

user_input = st.chat_input("Ask something...")
if user_input:
    st.session_state.chat_history.append({"role": "user", "content": user_input})
    with st.chat_message("user"):
        st.markdown(user_input)

    # Trim conversation history to the last 8 turns (user + assistant pairs)
    MAX_TURNS = 8
    trimmed_history = st.session_state.chat_history[-MAX_TURNS * 2:]
    messages = [{"role": "system", "content": system_prompt}] + trimmed_history

    with st.chat_message("assistant"):
        full_response = ""
        response_area = st.empty()
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            stream=True,
        )
        # Each streamed chunk carries an incremental "delta"; append it and re-render
        for chunk in stream:
            if "choices" in chunk:
                delta = chunk["choices"][0]["delta"].get("content", "")
                full_response += delta
                response_area.markdown(full_response)

    st.session_state.chat_history.append({"role": "assistant", "content": full_response})
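# To run this app locally (assuming the script is saved as app.py):
#   pip install streamlit llama-cpp-python huggingface_hub
#   streamlit run app.py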