import gradio as gr
from openai import OpenAI
import os

# -------------------
# SERVERLESS-TEXTGEN-HUB
# -------------------
#
# This version has been updated to include an "Information" tab above the Chat tab.
# The Information tab has two accordions:
#   - "Featured Models" which displays a simple table
#   - "Parameters Overview" which contains markdown describing the settings
#
# The Chat tab contains the existing chatbot UI.

# -------------------
# SETUP AND CONFIG
# -------------------

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI-like client (Hugging Face Inference API) with your token
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    custom_model
):
    """
    This function handles the chatbot response. It takes in:
    - message: the user's new message
    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
    - custom_model: the model name actually used, set either by selecting from
      the Featured Models radio or by typing a custom model path
    """

    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Selected model (custom_model): {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Construct the messages array required by the HF Inference API
    messages = [{"role": "system", "content": system_message}]
    print("Initial messages array constructed.")

    # Add conversation history to the context
    for val in history:
        user_part = val[0]  # Extract user message from the tuple
        assistant_part = val[1]  # Extract assistant message
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})
    print("Latest user message appended.")

    # If the user provided a model, use it; otherwise fall back to the default
    model_to_use = custom_model.strip() or "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Start with an empty string to build the streamed response
    response_text = ""
    print("Sending request to Hugging Face Inference API via OpenAI-like client...")

    # Make the streaming request to the HF Inference API
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Extract the token text from the response chunk; the final chunk's
        # delta may carry None, so fall back to an empty string
        token_text = message_chunk.choices[0].delta.content or ""
        print(f"Received token: {token_text}")
        response_text += token_text
        # Yield the partial response to Gradio so it can display in real time
        yield response_text

    print("Completed response generation.")

# ----------------------
# BUILDING THE INTERFACE
# ----------------------

# We will use a "Blocks" layout with two tabs:
#   1) "Information" tab, which shows helpful info and a table of "Featured Models"
#   2) "Chat" tab, which holds our ChatInterface and associated controls

with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    
    # -----------------
    # TAB: INFORMATION
    # -----------------
    with gr.Tab("Information"):
        # You can add instructions, disclaimers, or helpful text here
        gr.Markdown("## Welcome to Serverless-TextGen-Hub - Information")

        # Accordion for Featured Models (table)
        with gr.Accordion("Featured Models (WiP)", open=False):
            gr.HTML(
                """
                <p><a href="https://huggingface.co/models?inference=warm&pipeline_tag=chat&sort=trending" target="_blank">See all available text models on Hugging Face</a></p>
                <table style="width:100%; text-align:center; margin:auto;">
                    <tr>
                        <th>Model Name</th>
                        <th>Supported</th>
                        <th>Notes</th>
                    </tr>
                    <tr>
                        <td>meta-llama/Llama-3.3-70B-Instruct</td>
                        <td>✅</td>
                        <td>Default model, if none is provided in the 'Custom Model' box.</td>
                    </tr>
                    <tr>
                        <td>meta-llama/Llama-3.2-3B-Instruct</td>
                        <td>✅</td>
                        <td>Smaller Llama-based instruct model for faster responses.</td>
                    </tr>
                    <tr>
                        <td>microsoft/Phi-3.5-mini-instruct</td>
                        <td>✅</td>
                        <td>A smaller instruct model from Microsoft.</td>
                    </tr>
                    <tr>
                        <td>Qwen/Qwen2.5-72B-Instruct</td>
                        <td>✅</td>
                        <td>Large-scale Qwen-based model.</td>
                    </tr>
                </table>
                """
            )

        # Accordion for Parameters Overview
        with gr.Accordion("Parameters Overview", open=False):
            gr.Markdown(
                """
                **Here is a brief overview of the main parameters for text generation:**

                - **Max Tokens**: The maximum number of tokens (think of these as word-pieces) the model will generate in its response.
                - **Temperature**: Controls how "creative" or random the output is. Lower values = more deterministic, higher values = more varied.
                - **Top-P**: Nucleus sampling. Top-P sets the cumulative probability mass of tokens to sample from. For example, `top_p=0.9` means "sample only from the smallest set of tokens whose probabilities add up to 90%."
                - **Frequency Penalty**: A higher penalty discourages repeated tokens, helping reduce repetitive answers.
                - **Seed**: You can set a seed for deterministic results. `-1` means random each time.

                **Featured Models** can also be selected. If you want to override the model, you may specify a custom Hugging Face model path in the "Custom Model" text box.

                ---
                If you are new to text-generation parameters, the defaults are a great place to start!
                """
            )
    
    # -----------
    # TAB: CHAT
    # -----------
    with gr.Tab("Chat"):
        gr.Markdown("## Chat with the TextGen Model")

        # Create a Chatbot component with a specified height
        chatbot = gr.Chatbot(height=600)
        print("Chatbot interface created.")

        # Create textboxes and sliders for system prompt, tokens, and other parameters
        system_message_box = gr.Textbox(
            value="",
            label="System message",
            info="You can use this to provide instructions or context to the assistant. Leave empty if not needed."
        )

        max_tokens_slider = gr.Slider(
            minimum=1,
            maximum=4096,
            value=512,
            step=1,
            label="Max new tokens",
            info="Controls the maximum length of the output. Keep an eye on your usage!"
        )

        temperature_slider = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Controls creativity. Higher values = more random replies, lower = more deterministic."
        )

        top_p_slider = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-P",
            info="Use nucleus sampling with probability mass cutoff. 1.0 includes all tokens."
        )

        frequency_penalty_slider = gr.Slider(
            minimum=-2.0,
            maximum=2.0,
            value=0.0,
            step=0.1,
            label="Frequency Penalty",
            info="Penalize repeated tokens to avoid repetition in output."
        )

        seed_slider = gr.Slider(
            minimum=-1,
            maximum=65535,
            value=-1,
            step=1,
            label="Seed (-1 for random)",
            info="Fixing a seed (0 to 65535) can make results reproducible. -1 picks a random seed each time."
        )

        # The custom_model_box is what the respond function sees as "custom_model"
        custom_model_box = gr.Textbox(
            value="",
            label="Custom Model",
            info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model."
        )

        # Function to update the custom model box when a featured model is selected
        def set_custom_model_from_radio(selected):
            print(f"Featured model selected: {selected}")
            return selected

        print("ChatInterface object created.")

        # The main ChatInterface call
        chat_interface = gr.ChatInterface(
            fn=respond,  # The function to handle responses
            additional_inputs=[
                system_message_box,
                max_tokens_slider,
                temperature_slider,
                top_p_slider,
                frequency_penalty_slider,
                seed_slider,
                custom_model_box
            ],
            fill_height=True,  # Let the chatbot fill the container height
            chatbot=chatbot,   # The Chatbot UI component
            # (the theme is inherited from the enclosing gr.Blocks)
        )

        print("Gradio interface for Chat created.")

        # -----------
        # ADDING THE "FEATURED MODELS" ACCORDION (Same logic as before)
        # -----------
        with gr.Accordion("Featured Models", open=False):
            model_search_box = gr.Textbox(
                label="Filter Models",
                placeholder="Search for a featured model...",
                lines=1
            )
            print("Model search box created.")

            # Sample list of popular text models
            models_list = [
                "meta-llama/Llama-3.3-70B-Instruct",
                "meta-llama/Llama-3.2-3B-Instruct",
                "meta-llama/Llama-3.2-1B-Instruct",
                "meta-llama/Llama-3.1-8B-Instruct",
                "NousResearch/Hermes-3-Llama-3.1-8B",
                "google/gemma-2-27b-it",
                "google/gemma-2-9b-it",
                "google/gemma-2-2b-it",
                "mistralai/Mistral-Nemo-Instruct-2407",
                "mistralai/Mixtral-8x7B-Instruct-v0.1",
                "mistralai/Mistral-7B-Instruct-v0.3",
                "Qwen/Qwen2.5-72B-Instruct",
                "Qwen/QwQ-32B-Preview",
                "PowerInfer/SmallThinker-3B-Preview",
                "HuggingFaceTB/SmolLM2-1.7B-Instruct",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "microsoft/Phi-3.5-mini-instruct",
            ]
            print("Models list initialized.")

            featured_model_radio = gr.Radio(
                label="Select a model below",
                choices=models_list,
                value="meta-llama/Llama-3.3-70B-Instruct",
                interactive=True
            )
            print("Featured models radio button created.")

            def filter_models(search_term):
                print(f"Filtering models with search term: {search_term}")
                filtered = [m for m in models_list if search_term.lower() in m.lower()]
                print(f"Filtered models: {filtered}")
                return gr.update(choices=filtered)
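
            # For example, filter_models("qwen") narrows the radio's choices to
            # ["Qwen/Qwen2.5-72B-Instruct", "Qwen/QwQ-32B-Preview"].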

            model_search_box.change(
                fn=filter_models,
                inputs=model_search_box,
                outputs=featured_model_radio
            )
            print("Model search box change event linked.")

            featured_model_radio.change(
                fn=set_custom_model_from_radio,
                inputs=featured_model_radio,
                outputs=custom_model_box
            )
            print("Featured model radio button change event linked.")

print("Gradio interface initialized.")

# ------------------------
# MAIN ENTRY POINT
# ------------------------
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()