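"""Streamlit chat UI for quantized GGUF models served locally via llama-cpp-python.

The selected model is downloaded from the Hugging Face Hub into ./models on first
use, loaded once, and cached in st.session_state so it survives Streamlit reruns.
Run with: streamlit run <this file>.
"""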
import streamlit as st
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os
import gc

# Available models
MODELS = {
    "Qwen2.5-7B-Instruct (Q2_K)": {
        "repo_id": "Qwen/Qwen2.5-7B-Instruct-GGUF",
        "filename": "qwen2.5-7b-instruct-q2_k.gguf",
        "description": "Qwen2.5-7B Instruct (Q2_K)"
    },
    "Gemma-3-4B-IT (Q5_K_M)": {
        "repo_id": "unsloth/gemma-3-4b-it-GGUF",
        "filename": "gemma-3-4b-it-Q5_K_M.gguf",
        "description": "Gemma 3 4B IT (Q5_K_M)"
    },
    "Phi-4-mini-Instruct (Q5_K_M)": {
        "repo_id": "unsloth/Phi-4-mini-instruct-GGUF",
        "filename": "Phi-4-mini-instruct-Q5_K_M.gguf",
        "description": "Phi-4 Mini Instruct (Q5_K_M)"
    },
}

with st.sidebar:
    st.header("⚙️ Settings")
    selected_model_name = st.selectbox("Select Model", list(MODELS.keys()))
    system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
    max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
    temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
    top_k = st.slider("Top-K", 1, 100, 40)
    top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
    repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)

# Selected model metadata and the local path where its GGUF file is cached
selected_model = MODELS[selected_model_name]
model_path = os.path.join("models", selected_model["filename"])

# Initialize model cache state
if "model_name" not in st.session_state:
    st.session_state.model_name = None
if "llm" not in st.session_state:
    st.session_state.llm = None

# Download the GGUF file on first use of this model
if not os.path.exists(model_path):
    with st.spinner(f"Downloading {selected_model['filename']}..."):
        hf_hub_download(
            repo_id=selected_model["repo_id"],
            filename=selected_model["filename"],
            local_dir="./models",
            # Deprecated (and ignored) on newer huggingface_hub releases; kept for older ones
            local_dir_use_symlinks=False,
        )

# (Re)load the model only when the selection changes
if st.session_state.model_name != selected_model_name:
    if st.session_state.llm is not None:
        # Release the previous model so its memory can be reclaimed
        del st.session_state.llm
        gc.collect()
    with st.spinner(f"Loading {selected_model['filename']} into memory..."):
        st.session_state.llm = Llama(
            model_path=model_path,
            n_ctx=1024,          # Small context window to keep RAM usage down
            n_threads=2,         # CPU threads used for generation
            n_threads_batch=2,   # CPU threads used for prompt (batch) processing
            n_batch=4,           # Tiny prompt batch size, again to limit RAM
            n_gpu_layers=0,      # CPU-only inference
            use_mlock=False,     # Do not pin model pages in RAM
            use_mmap=True,       # Memory-map the GGUF file instead of copying it
            verbose=False,
        )
    st.session_state.model_name = selected_model_name

llm = st.session_state.llm

# Chat history state
if "chat_history" not in st.session_state:
    st.session_state.chat_history = []

st.title(f"🧠 {selected_model['description']} (Streamlit + GGUF)")
st.caption(f"Powered by `llama.cpp` | Model: {selected_model['filename']}")
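
# Re-render earlier turns so the conversation stays visible across Streamlit reruns
# (session_state persists between runs, but the script re-executes top to bottom)
for message in st.session_state.chat_history:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])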

user_input = st.chat_input("Ask something...")

if user_input:
    st.session_state.chat_history.append({"role": "user", "content": user_input})

    with st.chat_message("user"):
        st.markdown(user_input)

    # Keep only the last MAX_TURNS exchanges (2 messages each: user + assistant) to bound the prompt
    MAX_TURNS = 8
    trimmed_history = st.session_state.chat_history[-MAX_TURNS * 2:]
    messages = [{"role": "system", "content": system_prompt}] + trimmed_history

    with st.chat_message("assistant"):
        full_response = ""
        response_area = st.empty()

        stream = llm.create_chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repeat_penalty=repeat_penalty,
            stream=True,
        )

        # Accumulate streamed deltas and refresh the placeholder as tokens arrive
        for chunk in stream:
            delta = chunk["choices"][0]["delta"].get("content", "")
            if delta:
                full_response += delta
                response_area.markdown(full_response)

        st.session_state.chat_history.append({"role": "assistant", "content": full_response})