sitammeur committed · verified
Commit 26a8369 · 1 Parent(s): 8f0339b

Update app.py

Files changed (1)
  1. app.py +214 -212
app.py CHANGED
@@ -1,212 +1,214 @@
- # Importing required libraries
- import warnings
- warnings.filterwarnings("ignore")
-
- import json
- import subprocess
- import sys
- from llama_cpp import Llama
- from llama_cpp_agent import LlamaCppAgent
- from llama_cpp_agent import MessagesFormatterType
- from llama_cpp_agent.providers import LlamaCppPythonProvider
- from llama_cpp_agent.chat_history import BasicChatHistory
- from llama_cpp_agent.chat_history.messages import Roles
- import gradio as gr
- from huggingface_hub import hf_hub_download
- from typing import List, Tuple
- from logger import logging
- from exception import CustomExceptionHandling
-
-
- # Download gguf model files
- llm = None
- llm_model = None
-
- hf_hub_download(
-     repo_id="bartowski/SmolLM2-135M-Instruct-GGUF",
-     filename="SmolLM2-135M-Instruct-Q6_K.gguf",
-     local_dir="./models",
- )
- hf_hub_download(
-     repo_id="bartowski/SmolLM2-360M-Instruct-GGUF",
-     filename="SmolLM2-360M-Instruct-Q6_K.gguf",
-     local_dir="./models",
- )
-
- # Set the title and description
- title = "SmolLM🤗 Llama.cpp"
- description = """SmolLM2, a family of three small language models, performs well in instruction following and reasoning. The largest model significantly improves over its predecessor through advanced training techniques."""
-
-
- def respond(
-     message: str,
-     history: List[Tuple[str, str]],
-     model: str,
-     system_message: str,
-     max_tokens: int,
-     temperature: float,
-     top_p: float,
-     top_k: int,
-     repeat_penalty: float,
- ):
-     """
-     Respond to a message using the SmolLM2 model via Llama.cpp.
-
-     Args:
-         - message (str): The message to respond to.
-         - history (List[Tuple[str, str]]): The chat history.
-         - model (str): The model to use.
-         - system_message (str): The system message to use.
-         - max_tokens (int): The maximum number of tokens to generate.
-         - temperature (float): The temperature of the model.
-         - top_p (float): The top-p of the model.
-         - top_k (int): The top-k of the model.
-         - repeat_penalty (float): The repetition penalty of the model.
-
-     Returns:
-         str: The response to the message.
-     """
-     try:
-         # Load the global variables
-         global llm
-         global llm_model
-
-         # Load the model
-         if llm is None or llm_model != model:
-             llm = Llama(
-                 model_path=f"models/{model}",
-                 flash_attn=False,
-                 n_gpu_layers=0,
-                 n_batch=32,
-                 n_ctx=8192,
-             )
-             llm_model = model
-         provider = LlamaCppPythonProvider(llm)
-
-         # Create the agent
-         agent = LlamaCppAgent(
-             provider,
-             system_prompt=f"{system_message}",
-             predefined_messages_formatter_type=MessagesFormatterType.CHATML,
-             debug_output=True,
-         )
-
-         # Set the settings like temperature, top-k, top-p, max tokens, etc.
-         settings = provider.get_provider_default_settings()
-         settings.temperature = temperature
-         settings.top_k = top_k
-         settings.top_p = top_p
-         settings.max_tokens = max_tokens
-         settings.repeat_penalty = repeat_penalty
-         settings.stream = True
-
-         messages = BasicChatHistory()
-
-         # Add the chat history
-         for msn in history:
-             user = {"role": Roles.user, "content": msn[0]}
-             assistant = {"role": Roles.assistant, "content": msn[1]}
-             messages.add_message(user)
-             messages.add_message(assistant)
-
-         # Get the response stream
-         stream = agent.get_chat_response(
-             message,
-             llm_sampling_settings=settings,
-             chat_history=messages,
-             returns_streaming_generator=True,
-             print_output=False,
-         )
-
-         # Log the success
-         logging.info("Response stream generated successfully")
-
-         # Generate the response
-         outputs = ""
-         for output in stream:
-             outputs += output
-             yield outputs
-
-     # Handle exceptions that may occur during the process
-     except Exception as e:
-         # Custom exception handling
-         raise CustomExceptionHandling(e, sys) from e
-
-
- # Create a chat interface
- demo = gr.ChatInterface(
-     respond,
-     examples=[["What is the capital of France?"], ["Why is the color of the sky blue?"], ["What is gravity?"]],
-     additional_inputs_accordion=gr.Accordion(
-         label="⚙️ Parameters", open=False, render=False
-     ),
-     additional_inputs=[
-         gr.Dropdown(
-             choices=[
-                 "SmolLM2-135M-Instruct-Q6_K.gguf",
-                 "SmolLM2-360M-Instruct-Q6_K.gguf",
-             ],
-             value="SmolLM2-135M-Instruct-Q6_K.gguf",
-             label="Model",
-             info="Select the AI model to use for chat",
-         ),
-         gr.Textbox(
-             value="You are a helpful AI assistant focused on accurate and ethical responses.",
-             label="System Prompt",
-             info="Define the AI assistant's personality and behavior",
-             lines=2,
-         ),
-         gr.Slider(
-             minimum=512,
-             maximum=4096,
-             value=2048,
-             step=512,
-             label="Max Tokens",
-             info="Maximum length of response (higher = longer replies)",
-         ),
-         gr.Slider(
-             minimum=0.1,
-             maximum=2.0,
-             value=0.7,
-             step=0.1,
-             label="Temperature",
-             info="Creativity level (higher = more creative, lower = more focused)",
-         ),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p",
-             info="Nucleus sampling threshold",
-         ),
-         gr.Slider(
-             minimum=1,
-             maximum=100,
-             value=40,
-             step=1,
-             label="Top-k",
-             info="Limit vocabulary choices to top K tokens",
-         ),
-         gr.Slider(
-             minimum=1.0,
-             maximum=2.0,
-             value=1.1,
-             step=0.1,
-             label="Repetition Penalty",
-             info="Penalize repeated words (higher = less repetition)",
-         ),
-     ],
-     theme="Ocean",
-     submit_btn="Send",
-     stop_btn="Stop",
-     title=title,
-     description=description,
-     chatbot=gr.Chatbot(scale=1, show_copy_button=True),
-     flagging_mode="never",
- )
-
-
- # Launch the chat interface
- if __name__ == "__main__":
-     demo.launch(debug=False)
+ # Importing required libraries
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ import json
+ import subprocess
+ import sys
+ from llama_cpp import Llama
+ from llama_cpp_agent import LlamaCppAgent
+ from llama_cpp_agent import MessagesFormatterType
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
+ from llama_cpp_agent.chat_history import BasicChatHistory
+ from llama_cpp_agent.chat_history.messages import Roles
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from typing import List, Tuple
+ from logger import logging
+ from exception import CustomExceptionHandling
+
+
+ # Download gguf model files
+ llm = None
+ llm_model = None
+
+ hf_hub_download(
+     repo_id="bartowski/google_gemma-3-1b-it-GGUF",
+     filename="google_gemma-3-1b-it-Q6_K.gguf",
+     local_dir="./models",
+ )
+ hf_hub_download(
+     repo_id="bartowski/google_gemma-3-1b-it-GGUF",
+     filename="google_gemma-3-1b-it-Q5_K_M.gguf",
+     local_dir="./models",
+ )
+
+ # Set the title and description
+ title = "Gemma3 Llama.cpp"
+ description = """SmolLM2, a family of three small language models, performs well in instruction following and reasoning. The largest model significantly improves over its predecessor through advanced training techniques."""
+
+
+ def respond(
+     message: str,
+     history: List[Tuple[str, str]],
+     model: str,
+     system_message: str,
+     max_tokens: int,
+     temperature: float,
+     top_p: float,
+     top_k: int,
+     repeat_penalty: float,
+ ):
+     """
+     Respond to a message using the Gemma3 model via Llama.cpp.
+
+     Args:
+         - message (str): The message to respond to.
+         - history (List[Tuple[str, str]]): The chat history.
+         - model (str): The model to use.
+         - system_message (str): The system message to use.
+         - max_tokens (int): The maximum number of tokens to generate.
+         - temperature (float): The temperature of the model.
+         - top_p (float): The top-p of the model.
+         - top_k (int): The top-k of the model.
+         - repeat_penalty (float): The repetition penalty of the model.
+
+     Returns:
+         str: The response to the message.
+     """
+     try:
+         # Load the global variables
+         global llm
+         global llm_model
+
+         # Load the model
+         if llm is None or llm_model != model:
+             llm = Llama(
+                 model_path=f"models/{model}",
+                 flash_attn=False,
+                 n_gpu_layers=0,
+                 n_batch=8,
+                 n_ctx=2048,
+                 n_threads=2,
+                 n_threads_batch=2,
+             )
+             llm_model = model
+         provider = LlamaCppPythonProvider(llm)
+
+         # Create the agent
+         agent = LlamaCppAgent(
+             provider,
+             system_prompt=f"{system_message}",
+             predefined_messages_formatter_type=MessagesFormatterType.GEMMA_2,
+             debug_output=True,
+         )
+
+         # Set the settings like temperature, top-k, top-p, max tokens, etc.
+         settings = provider.get_provider_default_settings()
+         settings.temperature = temperature
+         settings.top_k = top_k
+         settings.top_p = top_p
+         settings.max_tokens = max_tokens
+         settings.repeat_penalty = repeat_penalty
+         settings.stream = True
+
+         messages = BasicChatHistory()
+
+         # Add the chat history
+         for msn in history:
+             user = {"role": Roles.user, "content": msn[0]}
+             assistant = {"role": Roles.assistant, "content": msn[1]}
+             messages.add_message(user)
+             messages.add_message(assistant)
+
+         # Get the response stream
+         stream = agent.get_chat_response(
+             message,
+             llm_sampling_settings=settings,
+             chat_history=messages,
+             returns_streaming_generator=True,
+             print_output=False,
+         )
+
+         # Log the success
+         logging.info("Response stream generated successfully")
+
+         # Generate the response
+         outputs = ""
+         for output in stream:
+             outputs += output
+             yield outputs
+
+     # Handle exceptions that may occur during the process
+     except Exception as e:
+         # Custom exception handling
+         raise CustomExceptionHandling(e, sys) from e
+
+
+ # Create a chat interface
+ demo = gr.ChatInterface(
+     respond,
+     examples=[["What is the capital of France?"], ["Tell me something about artificial intelligence."], ["What is gravity?"]],
+     additional_inputs_accordion=gr.Accordion(
+         label="⚙️ Parameters", open=False, render=False
+     ),
+     additional_inputs=[
+         gr.Dropdown(
+             choices=[
+                 "google_gemma-3-1b-it-Q6_K.gguf",
+                 "google_gemma-3-1b-it-Q5_K_M.gguf",
+             ],
+             value="google_gemma-3-1b-it-Q5_K_M.gguf",
+             label="Model",
+             info="Select the AI model to use for chat",
+         ),
+         gr.Textbox(
+             value="You are a helpful AI assistant focused on accurate and ethical responses.",
+             label="System Prompt",
+             info="Define the AI assistant's personality and behavior",
+             lines=2,
+         ),
+         gr.Slider(
+             minimum=512,
+             maximum=2048,
+             value=1024,
+             step=1,
+             label="Max Tokens",
+             info="Maximum length of response (higher = longer replies)",
+         ),
+         gr.Slider(
+             minimum=0.1,
+             maximum=2.0,
+             value=0.7,
+             step=0.1,
+             label="Temperature",
+             info="Creativity level (higher = more creative, lower = more focused)",
+         ),
+         gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             value=0.95,
+             step=0.05,
+             label="Top-p",
+             info="Nucleus sampling threshold",
+         ),
+         gr.Slider(
+             minimum=1,
+             maximum=100,
+             value=40,
+             step=1,
+             label="Top-k",
+             info="Limit vocabulary choices to top K tokens",
+         ),
+         gr.Slider(
+             minimum=1.0,
+             maximum=2.0,
+             value=1.1,
+             step=0.1,
+             label="Repetition Penalty",
+             info="Penalize repeated words (higher = less repetition)",
+         ),
+     ],
+     theme="Ocean",
+     submit_btn="Send",
+     stop_btn="Stop",
+     title=title,
+     description=description,
+     chatbot=gr.Chatbot(scale=1, show_copy_button=True),
+     flagging_mode="never",
+ )
+
+
+ # Launch the chat interface
+ if __name__ == "__main__":
+     demo.launch(debug=False)