import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

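# Fetch the 2-bit Q2_K GGUF weights from the Hugging Face Hub into ./models
# (hf_hub_download reuses the local copy on subsequent runs)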
hf_hub_download(
    repo_id="Qwen/Qwen2.5-7B-Instruct-GGUF",
    filename="qwen2.5-7b-instruct-q2_k.gguf",
    local_dir="./models",
)

# Load the model once and cache it across Streamlit reruns
@st.cache_resource
def load_model():
    return Llama(
        model_path="models/qwen2.5-7b-instruct-q2_k.gguf",
        n_ctx=1024,           # context window size in tokens
        n_threads=2,          # CPU threads used for generation
        n_threads_batch=2,    # CPU threads used for prompt processing
        n_batch=4,            # prompt evaluation batch size
        n_gpu_layers=0,       # keep every layer on the CPU
        use_mlock=False,      # don't pin the model in RAM
        use_mmap=True,        # memory-map the weights instead of copying them
        verbose=False,
    )

llm = load_model()

# Session state for chat history
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

st.title("🧠 Qwen2.5-7B-Instruct (Streamlit + GGUF)")
st.caption("Powered by `llama.cpp` and `llama-cpp-python` | 2-bit Q2_K inference")

with st.sidebar:
    st.header("⚙️ Settings")
    system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
    max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
    temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
    top_k = st.slider("Top-K", 1, 100, 40)
    top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
    repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)

# Re-render the conversation so far (Streamlit reruns this script on every interaction)
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Input box
user_input = st.chat_input("Ask something...")

if user_input:
    # Add user message to chat
    st.session_state.chat_history.append({"role": "user", "content": user_input})

    # Display user message
    with st.chat_message("user"):
        st.markdown(user_input)

    # Build the message list: system prompt followed by the full chat history
    messages = [{"role": "system", "content": system_prompt}] + st.session_state.chat_history

    # Stream response
    with st.chat_message("assistant"):
        full_response = ""
        response_area = st.empty()
        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            stream=True,
        )

        # Append streamed tokens to the reply and refresh the placeholder as they arrive
        for chunk in stream:
            delta = chunk["choices"][0]["delta"].get("content", "")
            if delta:
                full_response += delta
                response_area.markdown(full_response)

        st.session_state.chat_history.append({"role": "assistant", "content": full_response})