Luigi committed on
Commit 0ff6c39 · 1 Parent(s): ef1afaf

Add app.py & requirements.txt

Files changed (2):
  1. app.py +77 -0
  2. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,77 @@
+import streamlit as st
+from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
+
+hf_hub_download(
+    repo_id="TheBloke/Qwen2.5-3B-Instruct-GGUF",
+    filename="Qwen2.5-3B-Instruct.Q4_K_M.gguf",
+    local_dir="./models",
+)
+
+# Load the model (on first run)
+@st.cache_resource
+def load_model():
+    return Llama(
+        model_path="models/Qwen2.5-3B-Instruct.Q4_K_M.gguf",
+        n_ctx=2048,
+        n_threads=6,
+        n_batch=8,
+        n_gpu_layers=0,
+        use_mlock=False,
+        use_mmap=True,
+        verbose=False,
+    )
+
+llm = load_model()
+
+# Session state for chat history
+if "chat_history" not in st.session_state:
+    st.session_state.chat_history = []
+
+st.title("🧠 Qwen2.5-3B-Instruct (Streamlit + GGUF)")
+st.caption("Powered by `llama.cpp` and `llama-cpp-python` | 4-bit Q4_K_M inference")
+
+with st.sidebar:
+    st.header("⚙️ Settings")
+    system_prompt = st.text_area("System Prompt", value="You are a helpful assistant.", height=80)
+    max_tokens = st.slider("Max tokens", 64, 2048, 512, step=32)
+    temperature = st.slider("Temperature", 0.1, 2.0, 0.7)
+    top_k = st.slider("Top-K", 1, 100, 40)
+    top_p = st.slider("Top-P", 0.1, 1.0, 0.95)
+    repeat_penalty = st.slider("Repetition Penalty", 1.0, 2.0, 1.1)
+
+# Input box
+user_input = st.chat_input("Ask something...")
+
+if user_input:
+    # Add user message to chat
+    st.session_state.chat_history.append({"role": "user", "content": user_input})
+
+    # Display user message
+    with st.chat_message("user"):
+        st.markdown(user_input)
+
+    # Construct the prompt
+    messages = [{"role": "system", "content": system_prompt}] + st.session_state.chat_history
+
+    # Stream response
+    with st.chat_message("assistant"):
+        full_response = ""
+        response_area = st.empty()
+        stream = llm.create_chat_completion(
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repeat_penalty=repeat_penalty,
+            stream=True,
+        )
+
+        for chunk in stream:
+            if "choices" in chunk:
+                delta = chunk["choices"][0]["delta"].get("content", "")
+                full_response += delta
+                response_area.markdown(full_response)
+
+    st.session_state.chat_history.append({"role": "assistant", "content": full_response})
requirements.txt ADDED
@@ -0,0 +1,4 @@
+llama-cpp-python==0.2.73
+llama-cpp-agent
+huggingface_hub
+streamlit
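To reproduce the Space locally, the usual sequence would be to install the pinned dependencies with `pip install -r requirements.txt` and then launch the UI with `streamlit run app.py`; on first launch, the hf_hub_download call at the top of app.py fetches the Q4_K_M GGUF into ./models before the model is loaded.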