import os
import gradio as gr
from openai import OpenAI

# Load your Hugging Face Inference API token from environment
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    print("Warning: HF_TOKEN is not set; Inference API requests will fail to authenticate.")

# Initialize the OpenAI-like client that points to the HF Inference endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,   # Selected from "Featured Models" radio
    custom_model      # Optional user-provided custom model path
):
    """
    Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.

    Parameters:
    - message (str): The latest user message
    - history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
    - system_message (str): System-level instruction or context
    - max_tokens (int): Max tokens to generate
    - temperature (float): Sampling temperature
    - top_p (float): Nucleus sampling (top-p)
    - frequency_penalty (float): Penalize repeated tokens
    - seed (int): Fixed seed; if -1 => random
    - featured_model (str): The featured model name selected in the UI
    - custom_model (str): A custom model path (HF repo) provided by the user
    """
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Featured Model (chosen): {featured_model}")
    print(f"Custom Model (if any): {custom_model}")

    # Decide which model to use. If the user typed a custom model, we use that.
    # Otherwise, we use the featured model they picked from the radio.
    if custom_model.strip():
        model_to_use = custom_model.strip()
    else:
        model_to_use = featured_model

    print(f"Final model to use: {model_to_use}")

    # Convert seed to None if -1 => means random
    if seed == -1:
        seed = None

    # Prepare the conversation
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Add the latest user message
    messages.append({"role": "user", "content": message})

    # Generate the response in a streaming manner
    response = ""
    print("Sending request to HF Inference API via OpenAI-like client.")
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        token_text = message_chunk.choices[0].delta.content
        # Some chunks carry no content (e.g. the final chunk); skip them
        # so the += below doesn't raise a TypeError on None.
        if token_text is None:
            continue
        print(f"Received token: {token_text}")
        response += token_text
        # Yield partial responses to get streaming in Gradio
        yield response

    print("Completed response generation.")


# ----------------------------
# DEFINE THE GRADIO INTERFACE
# ----------------------------
def build_demo():
    """
    Build the entire Gradio Blocks interface, featuring:
      - A Tab for the chatbot (with featured models, custom model)
      - An Information tab with model table, parameter overview, etc.
    """
    # Define your placeholder featured models
    featured_models_list = [
        "meta-llama/Llama-3.3-70B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "google/gemma-2-2b-it",
        "microsoft/Phi-3-mini-4k-instruct",
    ]

    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
        gr.Markdown("## Serverless Text Generation Hub")

        with gr.Tabs():
            # -------------------- CHAT TAB --------------------
            with gr.Tab("Chat"):
                with gr.Row():
                    with gr.Column():
                        # "Featured Models" Accordion
                        with gr.Accordion("Featured Models", open=False):
                            model_search = gr.Textbox(
                                label="Filter Featured Models",
                                placeholder="Search featured models...",
                                lines=1,
                            )
                            # Radio for selecting a featured model
                            featured_models = gr.Radio(
                                label="Pick a Featured Model",
                                choices=featured_models_list,
                                value=featured_models_list[0],
                                interactive=True,
                            )

                            # Function to filter the model list by search text
                            def filter_models(search_term):
                                filtered = [
                                    m
                                    for m in featured_models_list
                                    if search_term.lower() in m.lower()
                                ]
                                return gr.update(choices=filtered)

                            # Update the radio choices when user enters text in the search box
                            model_search.change(
                                filter_models,
                                inputs=model_search,
                                outputs=featured_models,
                            )
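                            # e.g. typing "qwen" narrows the radio to
                            # ["Qwen/Qwen2.5-7B-Instruct"]; clearing the box
                            # restores the full list.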

                        # "Custom Model" text box
                        custom_model = gr.Textbox(
                            label="Custom Model",
                            placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
                            lines=1,
                        )
                        gr.Markdown(
                            "If you provide a custom model path above, it will override your featured model selection."
                        )

                    with gr.Column():
                        # Create the Gradio Chatbot
                        chatbot = gr.Chatbot(height=600, label="Chat Output")

                # Additional controls for system prompt & generation parameters.
                # gr.Box was removed in Gradio 4; gr.Group is the closest
                # drop-in container.
                with gr.Group():
                    system_message = gr.Textbox(
                        value="",
                        label="System message",
                        placeholder="System-level instruction or context here...",
                    )
                    max_tokens = gr.Slider(
                        minimum=1,
                        maximum=4096,
                        value=512,
                        step=1,
                        label="Max new tokens",
                    )
                    temperature = gr.Slider(
                        minimum=0.1,
                        maximum=4.0,
                        value=0.7,
                        step=0.1,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0.1,
                        maximum=1.0,
                        value=0.95,
                        step=0.05,
                        label="Top-P",
                    )
                    frequency_penalty = gr.Slider(
                        minimum=-2.0,
                        maximum=2.0,
                        value=0.0,
                        step=0.1,
                        label="Frequency Penalty",
                    )
                    seed = gr.Slider(
                        minimum=-1,
                        maximum=65535,
                        value=-1,
                        step=1,
                        label="Seed (-1 for random)",
                    )

                # We will attach a ChatInterface-like set of controls manually.
                # Keep track of conversation state
                state = gr.State([])  # Holds conversation as a list of (user, assistant)

                # Define "user" event function
                def user_message(user_text, history):
                    """
                    When the user sends a message, add it to history as (user_text, "")
                    The assistant's response will fill the second part of the tuple later.
                    """
                    if not user_text:
                        return gr.update(), history
                    new_history = history + [(user_text, "")]  # user question, empty answer
                    return gr.update(value=""), new_history

                # Define "bot" event function
                def bot_message(history, system_message, max_tokens, temperature, top_p,
                                frequency_penalty, seed, featured_models, custom_model):
                    """
                    Generate assistant reply given the entire chat history,
                    system prompt, and generation params. The function will stream
                    tokens from respond().
                    """
                    if not history:
                        # Empty submit: user_message left history unchanged.
                        yield history
                        return
                    user_text = history[-1][0]
                    # We'll call respond() as a generator, so we can stream back tokens.
                    bot_stream = respond(
                        message=user_text,
                        history=history[:-1],
                        system_message=system_message,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        top_p=top_p,
                        frequency_penalty=frequency_penalty,
                        seed=seed,
                        featured_model=featured_models,
                        custom_model=custom_model,
                    )
                    # We'll build up the assistant's reply token by token
                    final_assistant_text = ""
                    for token in bot_stream:
                        final_assistant_text = token
                        # We yield partial updates to the chatbot
                        yield history[:-1] + [(user_text, final_assistant_text)]
                    # Once complete, update the conversation in state
                    history[-1] = (user_text, final_assistant_text)
                    yield history
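                # Note on state: bot_message mutates the gr.State list in place
                # (history[-1] = ...), so the stored conversation stays in sync
                # with the chatbot display without listing `state` as an output.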

                # Textbox for the user to type a message
                with gr.Row():
                    with gr.Column(scale=8):
                        user_textbox = gr.Textbox(
                            label="Your message",
                            placeholder="Type your question or prompt here...",
                            lines=2,
                            interactive=True,
                        )
                    with gr.Column(scale=2):
                        send_button = gr.Button(
                            value="Send",
                            variant="primary"
                        )

                # When user clicks "Send", first call user_message(), then bot_message()
                send_button.click(
                    fn=user_message,
                    inputs=[user_textbox, state],
                    outputs=[user_textbox, state],
                ).then(
                    fn=bot_message,
                    inputs=[
                        state,
                        system_message,
                        max_tokens,
                        temperature,
                        top_p,
                        frequency_penalty,
                        seed,
                        featured_models,
                        custom_model,
                    ],
                    outputs=chatbot,
                )
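                # (Optional) The same two-step chain could also be bound to the
                # Enter key via user_textbox.submit(...), which gr.Textbox
                # supports with the same fn/inputs/outputs arguments.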

            # -------------------- INFORMATION TAB --------------------
            with gr.Tab("Information"):
                # Put information about featured models
                with gr.Accordion("Featured Models", open=False):
                    gr.HTML(
                        """
                        <table style="width:100%; text-align:center; margin:auto;">
                            <tr>
                                <th>Model Name</th>
                                <th>Description</th>
                                <th>Status</th>
                            </tr>
                            <tr>
                                <td>meta-llama/Llama-3.3-70B-Instruct</td>
                                <td>Powerful large model by Meta, fine-tuned to follow instructions.</td>
                                <td>✅</td>
                            </tr>
                            <tr>
                                <td>Qwen/Qwen2.5-7B-Instruct</td>
                                <td>Instruction-tuned LLM with good accuracy and speed.</td>
                                <td>✅</td>
                            </tr>
                            <tr>
                                <td>google/gemma-2-2b-it</td>
                                <td>Compact 2B parameter model for quick text generation tasks.</td>
                                <td>✅</td>
                            </tr>
                            <tr>
                                <td>microsoft/Phi-3-mini-4k-instruct</td>
                                <td>Small but effective model, optimized for instruction-based tasks.</td>
                                <td>✅</td>
                            </tr>
                        </table>
                        """
                    )

                # Put general parameter info
                with gr.Accordion("Parameters Overview", open=False):
                    gr.Markdown(
                        """
                        ## Parameters Overview
                        - **System Message**  
                          This is a special prompt that sets the behavior or context for the AI.  
                        
                        - **Max New Tokens**  
                          The maximum length of the AI's reply in tokens.  
                        
                        - **Temperature**  
                          Controls how random or "creative" the model is. A higher value yields more unexpected outputs.  
                        
                        - **Top-P**  
                          Nucleus sampling: only the smallest set of tokens whose cumulative probability reaches `top_p` is kept for sampling.  
                        
                        - **Frequency Penalty**  
                          Discourages the model from repeating tokens that already appeared.  
                        
                        - **Seed**  
                          For reproducible outputs. If set to `-1`, a random seed is chosen each time.  
                        
                        ### Model Selection
                        - **Featured Models**  
                          A curated set of recommended or widely used LLMs you can pick from.  
                        - **Custom Model**  
                          If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override.  
                        
                        ***
                        Feel free to experiment with different settings to see how they affect the response!
                        """
                    )

    return demo

# Actually build and launch the app
if __name__ == "__main__":
    print("Launching the demo application.")
    demo = build_demo()
    demo.launch()
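    # For a temporary public link while testing, Gradio also supports
    # demo.launch(share=True).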