import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the 4-bit GGUF weights from the Hugging Face Hub (cached after the first run)
hf_hub_download(
    repo_id="Qwen/Qwen2.5-1.5B-Instruct-GGUF",
    filename="qwen2.5-1.5b-instruct-q4_k_m.gguf",
    local_dir="./models",
)


# Load the model once per process; st.cache_resource keeps it alive across reruns
@st.cache_resource
def load_model():
    return Llama(
        model_path="models/qwen2.5-1.5b-instruct-q4_k_m.gguf",
        n_ctx=1024,
        n_threads=2,
        n_threads_batch=2,
        n_batch=4,
        n_gpu_layers=0,  # CPU-only inference
        use_mlock=False,
        use_mmap=True,
        verbose=False,
    )


llm = load_model()

# Session state for chat history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

st.title("🧠 Qwen2.5-1.5B-Instruct (Streamlit + GGUF)")
st.caption("Powered by `llama.cpp` and `llama-cpp-python` | 4-bit Q4_K_M inference")

with st.sidebar:
    st.header("⚙️ Settings")
    system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
    max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
    temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
    top_k = st.slider("Top-K", 1, 100, 40)
    top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
    repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)

# Replay the conversation so far (Streamlit reruns the script on every interaction,
# so earlier messages would otherwise disappear from the page)
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Input box
user_input = st.chat_input("Ask something...")

if user_input:
    # Add user message to chat
    st.session_state.chat_history.append({"role": "user", "content": user_input})

    # Display user message
    with st.chat_message("user"):
        st.markdown(user_input)

    # Construct the prompt: system prompt followed by the full conversation
    messages = [{"role": "system", "content": system_prompt}] + st.session_state.chat_history

    # Stream the assistant response token by token
    with st.chat_message("assistant"):
        full_response = ""
        response_area = st.empty()

        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            stream=True,
        )

        for chunk in stream:
            if "choices" in chunk:
                delta = chunk["choices"][0]["delta"].get("content", "")
                full_response += delta
                response_area.markdown(full_response)

    st.session_state.chat_history.append({"role": "assistant", "content": full_response})
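
# How to run this script locally (a minimal sketch; the filename app.py is an assumption,
# adjust it to wherever you saved this file):
#   pip install streamlit llama-cpp-python huggingface_hub
#   streamlit run app.py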