Spaces: Running on Zero
File size: 5,834 Bytes
import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import gc
import shutil
import subprocess
# Available models
MODELS = {
"Qwen2.5-7B-Instruct (Q2_K)": {
"repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
"filename": "qwen2.5-7b-instruct-q2_k.gguf",
"description": "Qwen2.5-7B Instruct (Q2_K)"
},
"Gemma-3-4B-IT (Q4_K_M)": {
"repo_id": "unsloth/gemma-3-4b-it-GGUF",
"filename": "gemma-3-4b-it-Q4_K_M.gguf",
"description": "Gemma 3 4B IT (Q4_K_M)"
},
"Phi-4-mini-Instruct (Q4_K_M)": {
"repo_id": "unsloth/Phi-4-mini-instruct-GGUF",
"filename": "Phi-4-mini-instruct-Q4_K_M.gguf",
"description": "Phi-4 Mini Instruct (Q4_K_M)"
},
}
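# Sidebar: model picker, system prompt, sampling controls, and disk-management utilities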
with st.sidebar:
st.header("⚙️ Settings")
selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
top_k = st.slider("Top-K", 1, 100, 40)
top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)
if st.button("🧹 Clear All Cached Models"):
try:
for f in os.listdir("models"):
if f.endswith(".gguf"):
os.remove(os.path.join("models", f))
st.success("Model cache cleared.")
except Exception as e:
st.error(f"Failed to clear models: {e}")
if st.button("📦 Show Disk Usage"):
try:
usage = shutil.disk_usage(".")
used = usage.used / (1024**3)
free = usage.free / (1024**3)
st.info(f"Disk Used: {used:.2f} GB | Free: {free:.2f} GB")
except Exception as e:
st.error(f"Disk usage error: {e}")
# Model info
selected_model = MODELS[selected_model_name]
model_path = os.path.join("models", selected_model["filename"])
# Init state
if "model_name" not in st.session_state:
st.session_state.model_name = None
if "llm" not in st.session_state:
st.session_state.llm = None
# Ensure model directory exists
os.makedirs("models", exist_ok=True)
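# Delete any cached GGUF files other than the currently selected one to conserve the Space's disk quota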
def cleanup_old_models():
    for f in os.listdir("models"):
        if f.endswith(".gguf") and f != selected_model["filename"]:
            try:
                os.remove(os.path.join("models", f))
            except Exception as e:
                st.warning(f"Couldn't delete old model {f}: {e}")
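# Fetch the selected GGUF file from the Hugging Face Hub into ./models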
def download_model():
    with st.spinner(f"Downloading {selected_model['filename']}..."):
        hf_hub_download(
            repo_id=selected_model["repo_id"],
            filename=selected_model["filename"],
            local_dir="./models",
            local_dir_use_symlinks=False,
        )
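# Attempt to load the GGUF file with conservative, CPU-only llama.cpp settings
# (small context window, 2 threads, no GPU layers); returns the error message as a
# string on failure so the caller can decide to re-download.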
def try_load_model(path):
    try:
        return Llama(model_path=path, n_ctx=1024, n_threads=2, n_threads_batch=2, n_batch=4, n_gpu_layers=0, use_mlock=False, use_mmap=True, verbose=False)
    except Exception as e:
        return str(e)
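# Ensure a loadable model: download it if missing, and if the first load fails
# (e.g. a corrupted or partial download), delete the file, re-download once, and retry.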
def validate_or_download_model():
    if not os.path.exists(model_path):
        cleanup_old_models()
        download_model()

    # First load attempt
    result = try_load_model(model_path)
    if isinstance(result, str):
        st.warning(f"Initial load failed: {result}\nAttempting re-download...")
        try:
            os.remove(model_path)
        except OSError:
            pass
        cleanup_old_models()
        download_model()
        result = try_load_model(model_path)
        if isinstance(result, str):
            st.error(f"Model still failed after re-download: {result}")
            st.stop()
        return result
    return result
# Load model if changed
if st.session_state.model_name != selected_model_name:
    if st.session_state.llm is not None:
        del st.session_state.llm
        gc.collect()
    st.session_state.llm = validate_or_download_model()
    st.session_state.model_name = selected_model_name
llm = st.session_state.llm
# Chat history state
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []
st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
user_input = st.chat_input("Ask something...")
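# Chat flow: guard against double-sends, echo the user message, stream the
# assistant's reply token by token, and persist both messages in session state.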
if user_input:
    # Don't append a new user message while the assistant's reply to the previous one is still pending
    if len(st.session_state.chat_history) % 2 == 1:
        st.warning("Please wait for the assistant to respond before sending another message.")
    else:
        st.session_state.chat_history.append({"role": "user", "content": user_input})
        with st.chat_message("user"):
            st.markdown(user_input)

        # Trim conversation history to the last 8 turns (one turn = a user message plus the assistant reply)
        MAX_TURNS = 8
        trimmed_history = st.session_state.chat_history[-MAX_TURNS * 2:]
        messages = [{"role": "system", "content": system_prompt}] + trimmed_history
with st.chat_message("assistant"):
full_response = ""
response_area = st.empty()
stream = llm.create_chat_completion(
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
top_k=top_k,
top_p=top_p,
repeat_penalty=repeat_penalty,
stream=True,
)
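            # Each streamed chunk carries an incremental "delta"; accumulate it and
            # re-render the partial reply so tokens appear as they are generated.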
            for chunk in stream:
                if "choices" in chunk:
                    delta = chunk["choices"][0]["delta"].get("content", "")
                    full_response += delta
                    response_area.markdown(full_response)

        st.session_state.chat_history.append({"role": "assistant", "content": full_response})