Nymbo committed on
Commit 4c18bfc · verified · 1 Parent(s): a430d0d
Files changed (1)
  1. app.py +61 -310
app.py CHANGED
@@ -1,12 +1,12 @@
-import os
 import gradio as gr
 from openai import OpenAI
+import os

-# Load your Hugging Face Inference API token from environment
+# Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")

-# Initialize the OpenAI-like client that points to the HF Inference endpoint
+# Initialize the OpenAI client with the Hugging Face Inference API endpoint
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
@@ -21,48 +21,34 @@ def respond(
     temperature,
     top_p,
     frequency_penalty,
-    seed,
-    featured_model,  # Selected from "Featured Models" radio
-    custom_model  # Optional user-provided custom model path
+    seed
 ):
     """
-    Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.
-
-    Parameters:
-    - message (str): The latest user message
-    - history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
-    - system_message (str): System-level instruction or context
-    - max_tokens (int): Max tokens to generate
-    - temperature (float): Sampling temperature
-    - top_p (float): Nucleus sampling (top-p)
-    - frequency_penalty (float): Penalize repeated tokens
-    - seed (int): Fixed seed; if -1 => random
-    - featured_model (str): The featured model name selected in the UI
-    - custom_model (str): A custom model path (HF repo) provided by the user
+    This function handles the chatbot response. It takes in:
+    - message: the user's new message
+    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
+    - system_message: the system prompt
+    - max_tokens: the maximum number of tokens to generate in the response
+    - temperature: sampling temperature
+    - top_p: top-p (nucleus) sampling
+    - frequency_penalty: penalize repeated tokens in the output
+    - seed: a fixed seed for reproducibility; -1 will mean 'random'
     """
+
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-    print(f"Featured Model (chosen): {featured_model}")
-    print(f"Custom Model (if any): {custom_model}")

-    # Decide which model to use. If the user typed a custom model, we use that.
-    # Otherwise, we use the featured model they picked from the radio.
-    if custom_model.strip():
-        model_to_use = custom_model.strip()
-    else:
-        model_to_use = featured_model
-
-    print(f"Final model to use: {model_to_use}")
-
-    # Convert seed to None if -1 => means random
+    # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None

-    # Prepare the conversation
+    # Construct the messages array required by the API
     messages = [{"role": "system", "content": system_message}]
+
+    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
@@ -73,301 +59,66 @@ def respond(
         messages.append({"role": "assistant", "content": assistant_part})
         print(f"Added assistant message to context: {assistant_part}")

-    # Add the latest user message
+    # Append the latest user message
     messages.append({"role": "user", "content": message})

-    # Generate the response in a streaming manner
+    # Start with an empty string to build the response as tokens stream in
     response = ""
-    print("Sending request to HF Inference API via OpenAI-like client.")
+    print("Sending request to OpenAI API.")
+
+    # Make the streaming request to the HF Inference API via openai-like client
     for message_chunk in client.chat.completions.create(
-        model=model_to_use,
+        model="meta-llama/Llama-3.3-70B-Instruct",  # You can update this to your specific model
         max_tokens=max_tokens,
-        stream=True,
+        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
-        frequency_penalty=frequency_penalty,
-        seed=seed,
+        frequency_penalty=frequency_penalty,  # <-- NEW
+        seed=seed,  # <-- NEW
         messages=messages,
     ):
+        # Extract the token text from the response chunk
         token_text = message_chunk.choices[0].delta.content
         print(f"Received token: {token_text}")
        response += token_text
-        # Yield partial responses to get streaming in Gradio
         yield response

     print("Completed response generation.")

-
-# ----------------------------
-# DEFINE THE GRADIO INTERFACE
-# ----------------------------
-def build_demo():
-    """
-    Build the entire Gradio Blocks interface, featuring:
-    - A Tab for the chatbot (with featured models, custom model)
-    - An Information tab with model table, parameter overview, etc.
-    """
-    # Define your placeholder featured models
-    featured_models_list = [
-        "meta-llama/Llama-3.3-70B-Instruct",
-        "Qwen/Qwen2.5-7B-Instruct",
-        "google/gemma-2-2b-it",
-        "microsoft/Phi-3-mini-4k-instruct",
-    ]
-
-    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-        gr.Markdown("## Serverless Text Generation Hub")
-
-        with gr.Tabs():
-            # -------------------- CHAT TAB --------------------
-            with gr.Tab("Chat"):
-                with gr.Row():
-                    with gr.Column():
-                        # "Featured Models" Accordion
-                        with gr.Accordion("Featured Models", open=False):
-                            model_search = gr.Textbox(
-                                label="Filter Featured Models",
-                                placeholder="Search featured models...",
-                                lines=1,
-                            )
-                            # Radio for selecting a featured model
-                            featured_models = gr.Radio(
-                                label="Pick a Featured Model",
-                                choices=featured_models_list,
-                                value=featured_models_list[0],
-                                interactive=True,
-                            )
-
-                        # Function to filter the model list by search text
-                        def filter_models(search_term):
-                            filtered = [
-                                m
-                                for m in featured_models_list
-                                if search_term.lower() in m.lower()
-                            ]
-                            return gr.update(choices=filtered)
-
-                        # Update the radio choices when user enters text in the search box
-                        model_search.change(
-                            filter_models,
-                            inputs=model_search,
-                            outputs=featured_models,
-                        )
-
-                        # "Custom Model" text box
-                        custom_model = gr.Textbox(
-                            label="Custom Model",
-                            placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
-                            lines=1,
-                        )
-                        gr.Markdown(
-                            "If you provide a custom model path above, it will override your featured model selection."
-                        )
-
-                    with gr.Column():
-                        # Create the Gradio Chatbot
-                        chatbot = gr.Chatbot(height=600, label="Chat Output")
-
-                        # Additional controls for system prompt & generation parameters
-                        with gr.Box():
-                            system_message = gr.Textbox(
-                                value="",
-                                label="System message",
-                                placeholder="System-level instruction or context here...",
-                            )
-                            max_tokens = gr.Slider(
-                                minimum=1,
-                                maximum=4096,
-                                value=512,
-                                step=1,
-                                label="Max new tokens",
-                            )
-                            temperature = gr.Slider(
-                                minimum=0.1,
-                                maximum=4.0,
-                                value=0.7,
-                                step=0.1,
-                                label="Temperature",
-                            )
-                            top_p = gr.Slider(
-                                minimum=0.1,
-                                maximum=1.0,
-                                value=0.95,
-                                step=0.05,
-                                label="Top-P",
-                            )
-                            frequency_penalty = gr.Slider(
-                                minimum=-2.0,
-                                maximum=2.0,
-                                value=0.0,
-                                step=0.1,
-                                label="Frequency Penalty",
-                            )
-                            seed = gr.Slider(
-                                minimum=-1,
-                                maximum=65535,
-                                value=-1,
-                                step=1,
-                                label="Seed (-1 for random)",
-                            )
-
-                # We will attach a ChatInterface-like set of controls manually.
-                # Keep track of conversation state
-                state = gr.State([])  # Holds conversation as a list of (user, assistant)
-
-                # Define "user" event function
-                def user_message(user_text, history):
-                    """
-                    When the user sends a message, add it to history as (user_text, "")
-                    The assistant's response will fill the second part of the tuple later.
-                    """
-                    if not user_text:
-                        return gr.update(), history
-                    new_history = history + [(user_text, "")]  # user question, empty answer
-                    return gr.update(value=""), new_history
-
-                # Define "bot" event function
-                def bot_message(history, system_message, max_tokens, temperature, top_p,
-                                frequency_penalty, seed, featured_models, custom_model):
-                    """
-                    Generate assistant reply given the entire chat history,
-                    system prompt, and generation params. The function will stream
-                    tokens from respond().
-                    """
-                    user_text = history[-1][0] if history else ""
-                    # We'll call respond() as a generator, so we can stream back tokens.
-                    bot_stream = respond(
-                        message=user_text,
-                        history=history[:-1],
-                        system_message=system_message,
-                        max_tokens=max_tokens,
-                        temperature=temperature,
-                        top_p=top_p,
-                        frequency_penalty=frequency_penalty,
-                        seed=seed,
-                        featured_model=featured_models,
-                        custom_model=custom_model,
-                    )
-                    # We'll build up the assistant's reply token by token
-                    final_assistant_text = ""
-                    for token in bot_stream:
-                        final_assistant_text = token
-                        # We yield partial updates to the chatbot
-                        yield history[:-1] + [(user_text, final_assistant_text)]
-                    # Once complete, update the conversation in state
-                    history[-1] = (user_text, final_assistant_text)
-                    yield history
-
-                # Textbox for the user to type a message
-                with gr.Row():
-                    with gr.Column(scale=8):
-                        user_textbox = gr.Textbox(
-                            label="Your message",
-                            placeholder="Type your question or prompt here...",
-                            lines=2,
-                            interactive=True,
-                        )
-                    with gr.Column(scale=2):
-                        send_button = gr.Button(
-                            value="Send",
-                            variant="primary"
-                        )
-
-                # When user clicks "Send", first call user_message(), then bot_message()
-                send_button.click(
-                    fn=user_message,
-                    inputs=[user_textbox, state],
-                    outputs=[user_textbox, state],
-                ).then(
-                    fn=bot_message,
-                    inputs=[
-                        state,
-                        system_message,
-                        max_tokens,
-                        temperature,
-                        top_p,
-                        frequency_penalty,
-                        seed,
-                        featured_models,
-                        custom_model,
-                    ],
-                    outputs=chatbot,
-                )
-
-            # -------------------- INFORMATION TAB --------------------
-            with gr.Tab("Information"):
-                # Put information about featured models
-                with gr.Accordion("Featured Models", open=False):
-                    gr.HTML(
-                        """
-                        <table style="width:100%; text-align:center; margin:auto;">
-                            <tr>
-                                <th>Model Name</th>
-                                <th>Description</th>
-                                <th>Status</th>
-                            </tr>
-                            <tr>
-                                <td>meta-llama/Llama-3.3-70B-Instruct</td>
-                                <td>Powerful large model by Llama, fine-tuned to follow instructions.</td>
-                                <td>✅</td>
-                            </tr>
-                            <tr>
-                                <td>Qwen/Qwen2.5-7B-Instruct</td>
-                                <td>Instruction-tuned LLM with good accuracy and speed.</td>
-                                <td>✅</td>
-                            </tr>
-                            <tr>
-                                <td>google/gemma-2-2b-it</td>
-                                <td>Compact 2B parameter model for quick text generation tasks.</td>
-                                <td>✅</td>
-                            </tr>
-                            <tr>
-                                <td>microsoft/Phi-3-mini-4k-instruct</td>
-                                <td>Small but effective model, optimized for instruction-based tasks.</td>
-                                <td>✅</td>
-                            </tr>
-                        </table>
-                        """
-                    )
-
-                # Put general parameter info
-                with gr.Accordion("Parameters Overview", open=False):
-                    gr.Markdown(
-                        """
-                        ## Parameters Overview
-                        - **System Message**
-                          This is a special prompt that sets the behavior or context for the AI.
-
-                        - **Max New Tokens**
-                          The maximum length of the AI's reply in tokens.
-
-                        - **Temperature**
-                          Controls how random or "creative" the model is. A higher value yields more unexpected outputs.
-
-                        - **Top-P**
-                          Nucleus sampling — only the tokens whose probabilities add up to `top_p` or higher are kept for generation.
-
-                        - **Frequency Penalty**
-                          Discourages the model from repeating tokens that already appeared.
-
-                        - **Seed**
-                          For reproducible outputs. If set to `-1`, a random seed is chosen each time.
-
-                        ### Model Selection
-                        - **Featured Models**
-                          A curated set of recommended or widely-used LLMs you can pick from.
-                        - **Custom Model**
-                          If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override.
-
-                        ***
-                        Feel free to experiment with different settings to see how they affect the response!
-                        """
-                    )
-
-    return demo
-
-# Actually build and launch the app
+# Create a Chatbot component with a specified height
+chatbot = gr.Chatbot(height=600)
+print("Chatbot interface created.")
+
+# Create the Gradio ChatInterface
+# We add two new sliders for Frequency Penalty and Seed
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
+        gr.Slider(
+            minimum=-2.0,
+            maximum=2.0,
+            value=0.0,
+            step=0.1,
+            label="Frequency Penalty"
+        ),
+        gr.Slider(
+            minimum=-1,
+            maximum=65535,  # Arbitrary upper limit for demonstration
+            value=-1,
+            step=1,
+            label="Seed (-1 for random)"
+        ),
+    ],
+    fill_height=True,
+    chatbot=chatbot,
+    theme="Nymbo/Nymbo_Theme",
+)
+print("Gradio interface initialized.")

 if __name__ == "__main__":
     print("Launching the demo application.")
-    demo = build_demo()
     demo.launch()
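The committed file can be exercised without the Gradio UI. Below is a minimal smoke test of the new respond() generator; this sketch is not part of the commit, and it assumes app.py is importable from the current directory, that HF_TOKEN is set to a token with Inference API access, and that the prompt strings are purely illustrative.

# smoke_test.py -- minimal check of the streaming generator (a sketch, not from the commit)
from app import respond

partial = ""
for partial in respond(
    message="Say hello in one sentence.",   # illustrative prompt
    history=[],                             # no previous (user, assistant) turns
    system_message="You are a concise assistant.",
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
    frequency_penalty=0.0,
    seed=-1,                                # -1 is mapped to None, i.e. a random seed
):
    pass                                    # each `partial` is the reply accumulated so far

print(partial)                              # the final, complete reply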
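One caveat about the streaming loop in the new file: with OpenAI-style streaming, a chunk's delta.content can be None (the final chunk often is), in which case response += token_text raises a TypeError. A more defensive variant of the loop, as a sketch (the helper name and structure are not from the commit):

# stream_reply: a defensive rewrite of the loop inside respond() (a sketch)
def stream_reply(client, messages, **gen_kwargs):
    """Yield the accumulating reply, tolerating chunks whose delta is empty."""
    response = ""
    for chunk in client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct",
        stream=True,
        messages=messages,
        **gen_kwargs,  # e.g. max_tokens, temperature, top_p, frequency_penalty, seed
    ):
        token_text = chunk.choices[0].delta.content or ""  # coerce None to ""
        response += token_text
        yield response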
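Since the commit's main point is the new Frequency Penalty and Seed controls, a small check of the seed behavior may also be useful. Again a sketch rather than part of the commit: the assertion is only expected to hold when the serving backend actually honors the seed parameter.

# seed_check.py -- does a fixed seed reproduce the same reply? (a sketch)
from app import respond

args = dict(history=[], system_message="", max_tokens=64,
            temperature=0.7, top_p=0.95, frequency_penalty=0.0, seed=42)

first = list(respond("Tell me a fun fact.", **args))[-1]
second = list(respond("Tell me a fun fact.", **args))[-1]
assert first == second  # expected when the backend honors `seed`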