import gradio as gr
from openai import OpenAI
import os

# =============================
#     GLOBAL SETUP / CLIENT
# =============================

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    print("Warning: HF_TOKEN is not set; requests to the Inference API will fail.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

# =============================
#     MODEL CONFIG / LOGIC
# =============================

# Sample placeholder list of "featured" models for demonstration
featured_models_list = [
    "meta-llama/Llama-2-13B-chat-hf",
    "bigscience/bloom",
    "microsoft/DialoGPT-large",
    "OpenAssistant/oasst-sft-1-pythia-12b",
    "tiiuae/falcon-7b-instruct",
    "meta-llama/Llama-3.3-70B-Instruct"
]

def filter_featured_models(search_term: str):
    """
    Returns a list of models that contain the search term (case-insensitive).
    """
    filtered = [m for m in featured_models_list if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)
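
# For example, with the list above, a search term of "llama" keeps only the
# two Llama entries (case-insensitive substring match); an empty term keeps
# every model. Illustrative sketch:
#
#   [m for m in featured_models_list if "llama" in m.lower()]
#   -> ['meta-llama/Llama-2-13B-chat-hf', 'meta-llama/Llama-3.3-70B-Instruct']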


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    custom_model,
    selected_featured_model
):
    """
    This function handles the chatbot response. It takes in:
    - message: the user's new message
    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
    - system_message: the system prompt
    - max_tokens, temperature, top_p, frequency_penalty, seed: generation params
    - custom_model: user-provided custom model path/name
    - selected_featured_model: model chosen from the featured radio list
    """
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Custom model: {custom_model}")
    print(f"Selected featured model: {selected_featured_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Construct the messages array required by the API
    messages = [{"role": "system", "content": system_message}] if system_message.strip() else []

    # Add conversation history to the context
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})
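    # At this point `messages` might look like (illustrative values):
    #   [{"role": "system", "content": "You are helpful."},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello! How can I help?"},
    #    {"role": "user", "content": "<the new message>"}]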

    # Determine which model to use:
    # 1) If custom_model is non-empty, it overrides everything.
    # 2) Otherwise, use the selected featured model from the radio button if available.
    # 3) If both are empty, fall back to the default.
    model_to_use = "meta-llama/Llama-3.3-70B-Instruct"  # Default
    if custom_model.strip() != "":
        model_to_use = custom_model.strip()
    elif selected_featured_model.strip() != "":
        model_to_use = selected_featured_model.strip()

    print(f"Model selected for inference: {model_to_use}")

    # Start building the streaming response
    response = ""
    print("Sending request to OpenAI API.")

    # Make the streaming request to the HF Inference API via openai-like client
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,  # Stream the response
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Extract the token text from the response chunk. Some chunks (e.g. the
        # role-only first chunk or the final chunk) carry no content, so guard
        # against None before concatenating.
        token_text = message_chunk.choices[0].delta.content
        if token_text is None:
            continue
        print(f"Received token: {token_text}", flush=True)
        response += token_text
        # Yield the partial response to Gradio so it can display in real-time
        yield response

    print("Completed response generation.")

# =============================
#         MAIN UI
# =============================

def build_app():
    """
    Build the Gradio Blocks interface containing:
      - A Chat tab (ChatInterface)
      - A Featured Models tab
      - An Information tab
    """
    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as main_interface:

        # We define a gr.State to hold the user's chosen featured model
        selected_featured_model_state = gr.State("")

        with gr.Tab("Chat Interface"):
            gr.Markdown("## Serverless-TextGen-Hub")

            # Here we embed the ChatInterface for streaming conversation.
            # We add an extra, hidden input for "Selected Featured Model",
            # so the user can't edit it directly but its value still flows
            # into respond().
            demo = gr.ChatInterface(
                fn=respond,
                additional_inputs=[
                    gr.Textbox(value="", label="System message", lines=2),
                    gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
                    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
                    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
                    gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty"),
                    gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)"),
                    gr.Textbox(value="", label="Custom Model", info="(Optional) Provide a custom HF model path"),
                    gr.Textbox(value="", label="Selected Featured Model (from tab)", visible=False),
                ],
                fill_height=True,
                chatbot=gr.Chatbot(height=600),
                theme="Nymbo/Nymbo_Theme",
            )

            # We want to connect the selected_featured_model_state to that hidden text box
            def set_featured_model_in_chatbox(val):
                return val

            # Whenever selected_featured_model_state changes, update the hidden
            # field in the ChatInterface (note: .change on gr.State requires a
            # Gradio release that supports State event listeners).
            selected_featured_model_state.change(
                fn=set_featured_model_in_chatbox,
                inputs=selected_featured_model_state,
                outputs=demo.additional_inputs[-1],  # The last additional input is the "Selected Featured Model"
            )

        # ==========================
        #   Featured Models Tab
        # ==========================
        with gr.Tab("Featured Models"):
            gr.Markdown("### Choose from our Featured Models")

            # A text box for searching/filtering
            model_search = gr.Textbox(
                label="Filter Models",
                placeholder="Search for a featured model..."
            )

            # A radio component listing the featured models (default to first)
            model_radio = gr.Radio(
                choices=featured_models_list,
                label="Select a model below",
                value=featured_models_list[0],
                interactive=True
            )

            # Define how to update the radio choices when the search box changes
            model_search.change(
                fn=filter_featured_models,
                inputs=model_search,
                outputs=model_radio
            )

            # Button to confirm the selection
            def select_featured_model(radio_val):
                """
                Updates the hidden state with the user-chosen featured model.
                """
                return radio_val

            choose_btn = gr.Button("Use this Featured Model", variant="primary")

            choose_btn.click(
                fn=select_featured_model,
                inputs=model_radio,
                outputs=selected_featured_model_state
            )
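
            # Flow recap: the click writes the radio value into
            # selected_featured_model_state; its .change listener (wired in the
            # Chat tab above) then copies that value into the hidden
            # "Selected Featured Model" textbox that feeds respond().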

            gr.Markdown(
                """
                **Tip**: If you type a Custom Model in the "Chat Interface" tab, it overrides the
                featured model you selected here.
                """
            )

        # ==========================
        #   Information Tab
        # ==========================
        with gr.Tab("Information"):
            gr.Markdown("## Learn More About These Models and Parameters")

            with gr.Accordion("Featured Models (Table)", open=False):
                gr.HTML(
                    """
                    <p>Below is a small sample table showing some featured models.</p>
                    <table style="width:100%; text-align:center; margin:auto;">
                        <tr>
                            <th>Model Name</th>
                            <th>Type</th>
                            <th>Notes</th>
                        </tr>
                        <tr>
                            <td>meta-llama/Llama-2-13B-chat-hf</td>
                            <td>Chat</td>
                            <td>Good for multi-turn dialogue.</td>
                        </tr>
                        <tr>
                            <td>bigscience/bloom</td>
                            <td>Language Model</td>
                            <td>Large multilingual model.</td>
                        </tr>
                        <tr>
                            <td>microsoft/DialoGPT-large</td>
                            <td>Chat</td>
                            <td>Well-known smaller chat model.</td>
                        </tr>
                    </table>
                    """
                )

            with gr.Accordion("Parameters Overview", open=False):
                gr.Markdown(
                    """
                    ### Explanation of Key Parameters

                    - **System Message**: Provides context or initial instructions to the model.  
                    - **Max Tokens**: The maximum number of tokens (roughly pieces of words) in the generated response.  
                    - **Temperature**: Higher values produce more random/creative outputs, while lower values make the output more focused and deterministic.  
                    - **Top-P**: Controls nucleus sampling. The model samples only from the smallest set of tokens whose cumulative probability exceeds this value.  
                    - **Frequency Penalty**: Penalizes repeated tokens. Positive values (like 1.0) reduce repetition in the output. Negative values can increase repetition.  
                    - **Seed**: Determines reproducibility. Set it to a fixed integer for consistent results; `-1` is random each time.  
                    - **Custom Model**: Overrides the featured model. Provide a Hugging Face text-generation model path (e.g., `meta-llama/Llama-2-13B-chat-hf`).  

                    Use these settings to guide how the model generates text. If in doubt, stick to defaults and experiment in small increments.
                    """
                )

    return main_interface

# If run as a standalone script, just launch.
if __name__ == "__main__":
    print("Building and launching the Serverless-TextGen-Hub interface...")
    ui = build_app()
    ui.launch()