Nymbo committed on
Commit a430d0d · verified · 1 Parent(s): 5b1509d

adding custom models support, featured models tab, information tab, better model selection logic
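In short, respond() now receives both a featured_model (the radio selection) and an optional custom_model string, and a non-empty custom entry takes precedence. A minimal sketch of that precedence rule as the diff implements it (the standalone helper name pick_model is illustrative, not part of the commit):

    def pick_model(featured_model: str, custom_model: str) -> str:
        # A non-empty "Custom Model" box overrides the featured radio choice
        return custom_model.strip() or featured_model

    pick_model("Qwen/Qwen2.5-7B-Instruct", "")                 # -> "Qwen/Qwen2.5-7B-Instruct"
    pick_model("Qwen/Qwen2.5-7B-Instruct", "myuser/my-model")  # -> "myuser/my-model"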

Files changed (1):
  1. app.py +310 -61
app.py CHANGED
@@ -1,12 +1,12 @@
+import os
 import gradio as gr
 from openai import OpenAI
-import os
 
-# Retrieve the access token from the environment variable
+# Load your Hugging Face Inference API token from environment
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")
 
-# Initialize the OpenAI client with the Hugging Face Inference API endpoint
+# Initialize the OpenAI-like client that points to the HF Inference endpoint
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
@@ -21,34 +21,48 @@ def respond(
     temperature,
     top_p,
     frequency_penalty,
-    seed
+    seed,
+    featured_model,  # Selected from "Featured Models" radio
+    custom_model     # Optional user-provided custom model path
 ):
     """
-    This function handles the chatbot response. It takes in:
-    - message: the user's new message
-    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
-    - system_message: the system prompt
-    - max_tokens: the maximum number of tokens to generate in the response
-    - temperature: sampling temperature
-    - top_p: top-p (nucleus) sampling
-    - frequency_penalty: penalize repeated tokens in the output
-    - seed: a fixed seed for reproducibility; -1 will mean 'random'
-    """
+    Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.
 
+    Parameters:
+    - message (str): The latest user message
+    - history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
+    - system_message (str): System-level instruction or context
+    - max_tokens (int): Max tokens to generate
+    - temperature (float): Sampling temperature
+    - top_p (float): Nucleus sampling (top-p)
+    - frequency_penalty (float): Penalize repeated tokens
+    - seed (int): Fixed seed; if -1 => random
+    - featured_model (str): The featured model name selected in the UI
+    - custom_model (str): A custom model path (HF repo) provided by the user
+    """
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
+    print(f"Featured Model (chosen): {featured_model}")
+    print(f"Custom Model (if any): {custom_model}")
 
-    # Convert seed to None if -1 (meaning random)
+    # Decide which model to use. If the user typed a custom model, we use that.
+    # Otherwise, we use the featured model they picked from the radio.
+    if custom_model.strip():
+        model_to_use = custom_model.strip()
+    else:
+        model_to_use = featured_model
+
+    print(f"Final model to use: {model_to_use}")
+
+    # Convert seed to None if -1 => means random
     if seed == -1:
         seed = None
 
-    # Construct the messages array required by the API
+    # Prepare the conversation
    messages = [{"role": "system", "content": system_message}]
-
-    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
@@ -59,66 +73,301 @@ def respond(
             messages.append({"role": "assistant", "content": assistant_part})
             print(f"Added assistant message to context: {assistant_part}")
 
-    # Append the latest user message
+    # Add the latest user message
     messages.append({"role": "user", "content": message})
 
-    # Start with an empty string to build the response as tokens stream in
+    # Generate the response in a streaming manner
     response = ""
-    print("Sending request to OpenAI API.")
-
-    # Make the streaming request to the HF Inference API via openai-like client
+    print("Sending request to HF Inference API via OpenAI-like client.")
     for message_chunk in client.chat.completions.create(
-        model="meta-llama/Llama-3.3-70B-Instruct", # You can update this to your specific model
+        model=model_to_use,
         max_tokens=max_tokens,
-        stream=True, # Stream the response
+        stream=True,
         temperature=temperature,
         top_p=top_p,
-        frequency_penalty=frequency_penalty, # <-- NEW
-        seed=seed, # <-- NEW
+        frequency_penalty=frequency_penalty,
+        seed=seed,
         messages=messages,
     ):
-        # Extract the token text from the response chunk
         token_text = message_chunk.choices[0].delta.content
         print(f"Received token: {token_text}")
         response += token_text
+        # Yield partial responses to get streaming in Gradio
         yield response
 
     print("Completed response generation.")
 
-# Create a Chatbot component with a specified height
-chatbot = gr.Chatbot(height=600)
-print("Chatbot interface created.")
-
-# Create the Gradio ChatInterface
-# We add two new sliders for Frequency Penalty and Seed
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="", label="System message"),
-        gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
-        gr.Slider(
-            minimum=-2.0,
-            maximum=2.0,
-            value=0.0,
-            step=0.1,
-            label="Frequency Penalty"
-        ),
-        gr.Slider(
-            minimum=-1,
-            maximum=65535, # Arbitrary upper limit for demonstration
-            value=-1,
-            step=1,
-            label="Seed (-1 for random)"
-        ),
-    ],
-    fill_height=True,
-    chatbot=chatbot,
-    theme="Nymbo/Nymbo_Theme",
-)
-print("Gradio interface initialized.")
 
+# ----------------------------
+# DEFINE THE GRADIO INTERFACE
+# ----------------------------
+def build_demo():
+    """
+    Build the entire Gradio Blocks interface, featuring:
+    - A Tab for the chatbot (with featured models, custom model)
+    - An Information tab with model table, parameter overview, etc.
+    """
+    # Define your placeholder featured models
+    featured_models_list = [
+        "meta-llama/Llama-3.3-70B-Instruct",
+        "Qwen/Qwen2.5-7B-Instruct",
+        "google/gemma-2-2b-it",
+        "microsoft/Phi-3-mini-4k-instruct",
+    ]
+
+    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+        gr.Markdown("## Serverless Text Generation Hub")
+
+        with gr.Tabs():
+            # -------------------- CHAT TAB --------------------
+            with gr.Tab("Chat"):
+                with gr.Row():
+                    with gr.Column():
+                        # "Featured Models" Accordion
+                        with gr.Accordion("Featured Models", open=False):
+                            model_search = gr.Textbox(
+                                label="Filter Featured Models",
+                                placeholder="Search featured models...",
+                                lines=1,
+                            )
+                            # Radio for selecting a featured model
+                            featured_models = gr.Radio(
+                                label="Pick a Featured Model",
+                                choices=featured_models_list,
+                                value=featured_models_list[0],
+                                interactive=True,
+                            )
+
+                            # Function to filter the model list by search text
+                            def filter_models(search_term):
+                                filtered = [
+                                    m
+                                    for m in featured_models_list
+                                    if search_term.lower() in m.lower()
+                                ]
+                                return gr.update(choices=filtered)
+
+                            # Update the radio choices when user enters text in the search box
+                            model_search.change(
+                                filter_models,
+                                inputs=model_search,
+                                outputs=featured_models,
+                            )
+
+                        # "Custom Model" text box
+                        custom_model = gr.Textbox(
+                            label="Custom Model",
+                            placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
+                            lines=1,
+                        )
+                        gr.Markdown(
+                            "If you provide a custom model path above, it will override your featured model selection."
+                        )
+
+                    with gr.Column():
+                        # Create the Gradio Chatbot
+                        chatbot = gr.Chatbot(height=600, label="Chat Output")
+
+                        # Additional controls for system prompt & generation parameters
+                        with gr.Box():
+                            system_message = gr.Textbox(
+                                value="",
+                                label="System message",
+                                placeholder="System-level instruction or context here...",
+                            )
+                            max_tokens = gr.Slider(
+                                minimum=1,
+                                maximum=4096,
+                                value=512,
+                                step=1,
+                                label="Max new tokens",
+                            )
+                            temperature = gr.Slider(
+                                minimum=0.1,
+                                maximum=4.0,
+                                value=0.7,
+                                step=0.1,
+                                label="Temperature",
+                            )
+                            top_p = gr.Slider(
+                                minimum=0.1,
+                                maximum=1.0,
+                                value=0.95,
+                                step=0.05,
+                                label="Top-P",
+                            )
+                            frequency_penalty = gr.Slider(
+                                minimum=-2.0,
+                                maximum=2.0,
+                                value=0.0,
+                                step=0.1,
+                                label="Frequency Penalty",
+                            )
+                            seed = gr.Slider(
+                                minimum=-1,
+                                maximum=65535,
+                                value=-1,
+                                step=1,
+                                label="Seed (-1 for random)",
+                            )
+
+                # We will attach a ChatInterface-like set of controls manually.
+                # Keep track of conversation state
+                state = gr.State([])  # Holds conversation as a list of (user, assistant)
+
+                # Define "user" event function
+                def user_message(user_text, history):
+                    """
+                    When the user sends a message, add it to history as (user_text, "")
+                    The assistant's response will fill the second part of the tuple later.
+                    """
+                    if not user_text:
+                        return gr.update(), history
+                    new_history = history + [(user_text, "")]  # user question, empty answer
+                    return gr.update(value=""), new_history
+
+                # Define "bot" event function
+                def bot_message(history, system_message, max_tokens, temperature, top_p,
+                                frequency_penalty, seed, featured_models, custom_model):
+                    """
+                    Generate assistant reply given the entire chat history,
+                    system prompt, and generation params. The function will stream
+                    tokens from respond().
+                    """
+                    user_text = history[-1][0] if history else ""
+                    # We'll call respond() as a generator, so we can stream back tokens.
+                    bot_stream = respond(
+                        message=user_text,
+                        history=history[:-1],
+                        system_message=system_message,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        top_p=top_p,
+                        frequency_penalty=frequency_penalty,
+                        seed=seed,
+                        featured_model=featured_models,
+                        custom_model=custom_model,
+                    )
+                    # We'll build up the assistant's reply token by token
+                    final_assistant_text = ""
+                    for token in bot_stream:
+                        final_assistant_text = token
+                        # We yield partial updates to the chatbot
+                        yield history[:-1] + [(user_text, final_assistant_text)]
+                    # Once complete, update the conversation in state
+                    history[-1] = (user_text, final_assistant_text)
+                    yield history
+
+                # Textbox for the user to type a message
+                with gr.Row():
+                    with gr.Column(scale=8):
+                        user_textbox = gr.Textbox(
+                            label="Your message",
+                            placeholder="Type your question or prompt here...",
+                            lines=2,
+                            interactive=True,
+                        )
+                    with gr.Column(scale=2):
+                        send_button = gr.Button(
+                            value="Send",
+                            variant="primary"
+                        )
+
+                # When user clicks "Send", first call user_message(), then bot_message()
+                send_button.click(
+                    fn=user_message,
+                    inputs=[user_textbox, state],
+                    outputs=[user_textbox, state],
+                ).then(
+                    fn=bot_message,
+                    inputs=[
+                        state,
+                        system_message,
+                        max_tokens,
+                        temperature,
+                        top_p,
+                        frequency_penalty,
+                        seed,
+                        featured_models,
+                        custom_model,
+                    ],
+                    outputs=chatbot,
+                )
+
+            # -------------------- INFORMATION TAB --------------------
+            with gr.Tab("Information"):
+                # Put information about featured models
+                with gr.Accordion("Featured Models", open=False):
+                    gr.HTML(
+                        """
+                        <table style="width:100%; text-align:center; margin:auto;">
+                            <tr>
+                                <th>Model Name</th>
+                                <th>Description</th>
+                                <th>Status</th>
+                            </tr>
+                            <tr>
+                                <td>meta-llama/Llama-3.3-70B-Instruct</td>
+                                <td>Powerful large model by Meta, fine-tuned to follow instructions.</td>
+                                <td>✅</td>
+                            </tr>
+                            <tr>
+                                <td>Qwen/Qwen2.5-7B-Instruct</td>
+                                <td>Instruction-tuned LLM with good accuracy and speed.</td>
+                                <td>✅</td>
+                            </tr>
+                            <tr>
+                                <td>google/gemma-2-2b-it</td>
+                                <td>Compact 2B parameter model for quick text generation tasks.</td>
+                                <td>✅</td>
+                            </tr>
+                            <tr>
+                                <td>microsoft/Phi-3-mini-4k-instruct</td>
+                                <td>Small but effective model, optimized for instruction-based tasks.</td>
+                                <td>✅</td>
+                            </tr>
+                        </table>
+                        """
+                    )
+
+                # Put general parameter info
+                with gr.Accordion("Parameters Overview", open=False):
+                    gr.Markdown(
+                        """
+                        ## Parameters Overview
+                        - **System Message**
+                          This is a special prompt that sets the behavior or context for the AI.
+
+                        - **Max New Tokens**
+                          The maximum length of the AI's reply in tokens.
+
+                        - **Temperature**
+                          Controls how random or "creative" the model is. A higher value yields more unexpected outputs.
+
+                        - **Top-P**
+                          Nucleus sampling: the model samples only from the smallest set of tokens whose cumulative probability reaches `top_p`.
+
+                        - **Frequency Penalty**
+                          Discourages the model from repeating tokens that already appeared.
+
+                        - **Seed**
+                          For reproducible outputs. If set to `-1`, a random seed is chosen each time.
+
+                        ### Model Selection
+                        - **Featured Models**
+                          A curated set of recommended or widely used LLMs you can pick from.
+                        - **Custom Model**
+                          If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override.
+
+                        ***
+                        Feel free to experiment with different settings to see how they affect the response!
+                        """
+                    )
+
+    return demo
+
+# Actually build and launch the app
 if __name__ == "__main__":
     print("Launching the demo application.")
+    demo = build_demo()
     demo.launch()
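A note on the event wiring above: send_button.click first runs user_message, which appends (user_text, "") to the state and clears the textbox, and the chained .then() runs bot_message, a generator whose successive yields rewrite the last tuple so the chatbot shows the answer growing. A minimal self-contained sketch of that yield pattern, with a stub standing in for respond() (fake_stream and the sample strings are illustrative only):

    # Stand-in for respond(): yields ever-longer partial answers
    def fake_stream():
        for partial in ("Hel", "Hello", "Hello there!"):
            yield partial

    history = [("Hi", "")]          # what user_message leaves in state
    user_text = history[-1][0]
    final_assistant_text = ""
    for token in fake_stream():
        final_assistant_text = token
        # Each intermediate value is what the Chatbot renders mid-stream
        print(history[:-1] + [(user_text, final_assistant_text)])
    history[-1] = (user_text, final_assistant_text)  # committed once streaming ends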