Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

App Files Files Community

Nymbo committed on Jan 3

Commit

a430d0d

verified ·

1 Parent(s): 5b1509d

adding custom models support, featured models tab, information tab, better model selection logic

Browse files

Files changed (1) hide show

app.py +310 -61

app.py CHANGED Viewed

@@ -1,12 +1,12 @@
 import gradio as gr
 from openai import OpenAI
-import os
-# Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")
-# Initialize the OpenAI client with the Hugging Face Inference API endpoint
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
@@ -21,34 +21,48 @@ def respond(
     temperature,
     top_p,
     frequency_penalty,
-    seed
 ):
     """
-    This function handles the chatbot response. It takes in:
-    - message: the user's new message
-    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
-    - system_message: the system prompt
-    - max_tokens: the maximum number of tokens to generate in the response
-    - temperature: sampling temperature
-    - top_p: top-p (nucleus) sampling
-    - frequency_penalty: penalize repeated tokens in the output
-    - seed: a fixed seed for reproducibility; -1 will mean 'random'
-    """
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-    # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
-    # Construct the messages array required by the API
     messages = [{"role": "system", "content": system_message}]
-    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
@@ -59,66 +73,301 @@ def respond(
             messages.append({"role": "assistant", "content": assistant_part})
             print(f"Added assistant message to context: {assistant_part}")
-    # Append the latest user message
     messages.append({"role": "user", "content": message})
-    # Start with an empty string to build the response as tokens stream in
     response = ""
-    print("Sending request to OpenAI API.")
-    # Make the streaming request to the HF Inference API via openai-like client
     for message_chunk in client.chat.completions.create(
-        model="meta-llama/Llama-3.3-70B-Instruct",   # You can update this to your specific model
         max_tokens=max_tokens,
-        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
-        frequency_penalty=frequency_penalty,  # <-- NEW
-        seed=seed,                             # <-- NEW
         messages=messages,
     ):
-        # Extract the token text from the response chunk
         token_text = message_chunk.choices[0].delta.content
         print(f"Received token: {token_text}")
         response += token_text
         yield response
     print("Completed response generation.")
-# Create a Chatbot component with a specified height
-chatbot = gr.Chatbot(height=600)
-print("Chatbot interface created.")
-# Create the Gradio ChatInterface
-# We add two new sliders for Frequency Penalty and Seed
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="", label="System message"),
-        gr.Slider(minimum=1,   maximum=4096, value=512, step=1,   label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0,  value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0,  value=0.95, step=0.05, label="Top-P"),
-        gr.Slider(
-            minimum=-2.0,
-            maximum=2.0,
-            value=0.0,
-            step=0.1,
-            label="Frequency Penalty"
-        ),
-        gr.Slider(
-            minimum=-1,
-            maximum=65535,  # Arbitrary upper limit for demonstration
-            value=-1,
-            step=1,
-            label="Seed (-1 for random)"
-        ),
-    ],
-    fill_height=True,
-    chatbot=chatbot,
-    theme="Nymbo/Nymbo_Theme",
-)
-print("Gradio interface initialized.")
 if __name__ == "__main__":
     print("Launching the demo application.")
     demo.launch()

+import os
 import gradio as gr
 from openai import OpenAI
+# Load your Hugging Face Inference API token from environment
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")
+# Initialize the OpenAI-like client that points to the HF Inference endpoint
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
     temperature,
     top_p,
     frequency_penalty,
+    seed,
+    featured_model,   # Selected from "Featured Models" radio
+    custom_model      # Optional user-provided custom model path
 ):
     """
+    Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.
+    Parameters:
+    - message (str): The latest user message
+    - history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
+    - system_message (str): System-level instruction or context
+    - max_tokens (int): Max tokens to generate
+    - temperature (float): Sampling temperature
+    - top_p (float): Nucleus sampling (top-p)
+    - frequency_penalty (float): Penalize repeated tokens
+    - seed (int): Fixed seed; if -1 => random
+    - featured_model (str): The featured model name selected in the UI
+    - custom_model (str): A custom model path (HF repo) provided by the user
+    """
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
+    print(f"Featured Model (chosen): {featured_model}")
+    print(f"Custom Model (if any): {custom_model}")
+    # Decide which model to use. If the user typed a custom model, we use that.
+    # Otherwise, we use the featured model they picked from the radio.
+    if custom_model.strip():
+        model_to_use = custom_model.strip()
+    else:
+        model_to_use = featured_model
+    print(f"Final model to use: {model_to_use}")
+    # Convert seed to None if -1 => means random
     if seed == -1:
         seed = None
+    # Prepare the conversation
     messages = [{"role": "system", "content": system_message}]
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
             messages.append({"role": "assistant", "content": assistant_part})
             print(f"Added assistant message to context: {assistant_part}")
+    # Add the latest user message
     messages.append({"role": "user", "content": message})
+    # Generate the response in a streaming manner
     response = ""
+    print("Sending request to HF Inference API via OpenAI-like client.")
     for message_chunk in client.chat.completions.create(
+        model=model_to_use,
         max_tokens=max_tokens,
+        stream=True,
         temperature=temperature,
         top_p=top_p,
+        frequency_penalty=frequency_penalty,
+        seed=seed,
         messages=messages,
     ):
         token_text = message_chunk.choices[0].delta.content
         print(f"Received token: {token_text}")
         response += token_text
+        # Yield partial responses to get streaming in Gradio
         yield response
     print("Completed response generation.")
+# ----------------------------
+# DEFINE THE GRADIO INTERFACE
+# ----------------------------
+def build_demo():
+    """
+    Build the Gradio Blocks interface and return it without launching.
+
+    Layout:
+      - "Chat" tab: featured-model picker (with a search filter), a custom
+        model textbox, the chatbot display, generation-parameter controls,
+        and a message box with a Send button.
+      - "Information" tab: a static table of the featured models and a
+        written overview of each parameter.
+
+    Returns:
+        The constructed gr.Blocks app; the caller is responsible for
+        calling .launch() on it.
+    """
+    # Define your placeholder featured models
+    featured_models_list = [
+        "meta-llama/Llama-3.3-70B-Instruct",
+        "Qwen/Qwen2.5-7B-Instruct",
+        "google/gemma-2-2b-it",
+        "microsoft/Phi-3-mini-4k-instruct",
+    ]
+    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+        gr.Markdown("## Serverless Text Generation Hub")
+        with gr.Tabs():
+            # -------------------- CHAT TAB --------------------
+            with gr.Tab("Chat"):
+                with gr.Row():
+                    with gr.Column():
+                        # "Featured Models" Accordion
+                        with gr.Accordion("Featured Models", open=False):
+                            model_search = gr.Textbox(
+                                label="Filter Featured Models",
+                                placeholder="Search featured models...",
+                                lines=1,
+                            )
+                            # Radio for selecting a featured model
+                            featured_models = gr.Radio(
+                                label="Pick a Featured Model",
+                                choices=featured_models_list,
+                                value=featured_models_list[0],
+                                interactive=True,
+                            )
+                            def filter_models(search_term):
+                                """
+                                Return a Radio update whose choices are the featured
+                                models containing search_term (case-insensitive).
+                                An empty search term keeps every model.
+                                """
+                                needle = search_term.lower()
+                                filtered = [
+                                    m for m in featured_models_list if needle in m.lower()
+                                ]
+                                return gr.update(choices=filtered)
+                            # Update the radio choices when user enters text in the search box
+                            model_search.change(
+                                filter_models,
+                                inputs=model_search,
+                                outputs=featured_models,
+                            )
+                        # "Custom Model" text box
+                        custom_model = gr.Textbox(
+                            label="Custom Model",
+                            placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
+                            lines=1,
+                        )
+                        gr.Markdown(
+                            "If you provide a custom model path above, it will override your featured model selection."
+                        )
+                    with gr.Column():
+                        # Create the Gradio Chatbot
+                        chatbot = gr.Chatbot(height=600, label="Chat Output")
+                # Additional controls for system prompt & generation parameters.
+                # gr.Box was removed in Gradio 4; gr.Group is the supported
+                # container and is also available in Gradio 3.
+                with gr.Group():
+                    system_message = gr.Textbox(
+                        value="",
+                        label="System message",
+                        placeholder="System-level instruction or context here...",
+                    )
+                    max_tokens = gr.Slider(
+                        minimum=1,
+                        maximum=4096,
+                        value=512,
+                        step=1,
+                        label="Max new tokens",
+                    )
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=4.0,
+                        value=0.7,
+                        step=0.1,
+                        label="Temperature",
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        value=0.95,
+                        step=0.05,
+                        label="Top-P",
+                    )
+                    frequency_penalty = gr.Slider(
+                        minimum=-2.0,
+                        maximum=2.0,
+                        value=0.0,
+                        step=0.1,
+                        label="Frequency Penalty",
+                    )
+                    seed = gr.Slider(
+                        minimum=-1,
+                        maximum=65535,
+                        value=-1,
+                        step=1,
+                        label="Seed (-1 for random)",
+                    )
+                # We will attach a ChatInterface-like set of controls manually.
+                # Keep track of conversation state
+                state = gr.State([])  # Holds conversation as a list of (user, assistant)
+                def user_message(user_text, history):
+                    """
+                    Record a newly sent user message.
+
+                    Appends (user_text, "") to a copy of history; the empty second
+                    slot is filled by bot_message(). Returns (textbox update,
+                    history): the textbox is cleared on success, and both are
+                    left unchanged when user_text is empty.
+                    """
+                    if not user_text:
+                        return gr.update(), history
+                    new_history = history + [(user_text, "")]  # user question, empty answer
+                    return gr.update(value=""), new_history
+                def bot_message(history, system_message, max_tokens, temperature, top_p,
+                                frequency_penalty, seed, featured_models, custom_model):
+                    """
+                    Stream the assistant reply for the last, still unanswered turn.
+
+                    Calls respond() with the pending user message and the earlier
+                    turns, and yields (chatbot_value, state_value) pairs as the
+                    reply grows so the display and the stored conversation stay
+                    in sync.
+                    """
+                    # Nothing pending: the user sent an empty message (history is
+                    # empty or its last turn already has a reply). Skip the API
+                    # call instead of failing on history[-1] or regenerating.
+                    if not history or history[-1][1]:
+                        yield history, history
+                        return
+                    user_text = history[-1][0]
+                    earlier_turns = history[:-1]
+                    # We'll call respond() as a generator, so we can stream back tokens.
+                    bot_stream = respond(
+                        message=user_text,
+                        history=earlier_turns,
+                        system_message=system_message,
+                        max_tokens=max_tokens,
+                        temperature=temperature,
+                        top_p=top_p,
+                        frequency_penalty=frequency_penalty,
+                        seed=seed,
+                        featured_model=featured_models,
+                        custom_model=custom_model,
+                    )
+                    # respond() yields the accumulated reply so far, so each item
+                    # replaces (not extends) the assistant text of the last turn.
+                    updated_history = history
+                    for partial_reply in bot_stream:
+                        updated_history = earlier_turns + [(user_text, partial_reply)]
+                        yield updated_history, updated_history
+                    # Final emit so the turn is shown and stored even if the
+                    # stream produced no chunks at all.
+                    yield updated_history, updated_history
+                # Textbox for the user to type a message
+                with gr.Row():
+                    with gr.Column(scale=8):
+                        user_textbox = gr.Textbox(
+                            label="Your message",
+                            placeholder="Type your question or prompt here...",
+                            lines=2,
+                            interactive=True,
+                        )
+                    with gr.Column(scale=2):
+                        send_button = gr.Button(
+                            value="Send",
+                            variant="primary"
+                        )
+                # When user clicks "Send", first call user_message(), then bot_message().
+                # bot_message() writes to both the chatbot and the state so the
+                # stored history does not depend on in-place mutation of its input.
+                send_button.click(
+                    fn=user_message,
+                    inputs=[user_textbox, state],
+                    outputs=[user_textbox, state],
+                ).then(
+                    fn=bot_message,
+                    inputs=[
+                        state,
+                        system_message,
+                        max_tokens,
+                        temperature,
+                        top_p,
+                        frequency_penalty,
+                        seed,
+                        featured_models,
+                        custom_model,
+                    ],
+                    outputs=[chatbot, state],
+                )
+            # -------------------- INFORMATION TAB --------------------
+            with gr.Tab("Information"):
+                # Put information about featured models
+                with gr.Accordion("Featured Models", open=False):
+                    gr.HTML(
+                        """
+                        <table style="width:100%; text-align:center; margin:auto;">
+                            <tr>
+                                <th>Model Name</th>
+                                <th>Description</th>
+                                <th>Status</th>
+                            </tr>
+                            <tr>
+                                <td>meta-llama/Llama-3.3-70B-Instruct</td>
+                                <td>Powerful large model by Llama, fine-tuned to follow instructions.</td>
+                                <td>✅</td>
+                            </tr>
+                            <tr>
+                                <td>Qwen/Qwen2.5-7B-Instruct</td>
+                                <td>Instruction-tuned LLM with good accuracy and speed.</td>
+                                <td>✅</td>
+                            </tr>
+                            <tr>
+                                <td>google/gemma-2-2b-it</td>
+                                <td>Compact 2B parameter model for quick text generation tasks.</td>
+                                <td>✅</td>
+                            </tr>
+                            <tr>
+                                <td>microsoft/Phi-3-mini-4k-instruct</td>
+                                <td>Small but effective model, optimized for instruction-based tasks.</td>
+                                <td>✅</td>
+                            </tr>
+                        </table>
+                        """
+                    )
+                # Put general parameter info
+                with gr.Accordion("Parameters Overview", open=False):
+                    gr.Markdown(
+                        """
+                        ## Parameters Overview
+                        - **System Message**
+                          This is a special prompt that sets the behavior or context for the AI.
+                        - **Max New Tokens**
+                          The maximum length of the AI's reply in tokens.
+                        - **Temperature**
+                          Controls how random or "creative" the model is. A higher value yields more unexpected outputs.
+                        - **Top-P**
+                          Nucleus sampling — only the tokens whose probabilities add up to `top_p` or higher are kept for generation.
+                        - **Frequency Penalty**
+                          Discourages the model from repeating tokens that already appeared.
+                        - **Seed**
+                          For reproducible outputs. If set to `-1`, a random seed is chosen each time.
+                        ### Model Selection
+                        - **Featured Models**
+                          A curated set of recommended or widely-used LLMs you can pick from.
+                        - **Custom Model**
+                          If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override.
+                        ***
+                        Feel free to experiment with different settings to see how they affect the response!
+                        """
+                    )
+    return demo
+# Entry point: build the UI and launch it only when this file is run as a
+# script, so importing the module does not start the app.
 if __name__ == "__main__":
     print("Launching the demo application.")
+    demo = build_demo()
     demo.launch()