Nymbo committed on
Commit 4c18bfc · verified · 1 Parent(s): a430d0d
Files changed (1)
  1. app.py +61 -310
app.py CHANGED
@@ -1,12 +1,12 @@
-import os
 import gradio as gr
 from openai import OpenAI
+import os

-# Load your Hugging Face Inference API token from environment
+# Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")

-# Initialize the OpenAI-like client that points to the HF Inference endpoint
+# Initialize the OpenAI client with the Hugging Face Inference API endpoint
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
@@ -21,48 +21,34 @@ def respond(
     temperature,
     top_p,
     frequency_penalty,
-    seed,
-    featured_model,  # Selected from "Featured Models" radio
-    custom_model  # Optional user-provided custom model path
+    seed
 ):
     """
-    Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.
-
-    Parameters:
-    - message (str): The latest user message
-    - history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
-    - system_message (str): System-level instruction or context
-    - max_tokens (int): Max tokens to generate
-    - temperature (float): Sampling temperature
-    - top_p (float): Nucleus sampling (top-p)
-    - frequency_penalty (float): Penalize repeated tokens
-    - seed (int): Fixed seed; if -1 => random
-    - featured_model (str): The featured model name selected in the UI
-    - custom_model (str): A custom model path (HF repo) provided by the user
+    This function handles the chatbot response. It takes in:
+    - message: the user's new message
+    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
+    - system_message: the system prompt
+    - max_tokens: the maximum number of tokens to generate in the response
+    - temperature: sampling temperature
+    - top_p: top-p (nucleus) sampling
+    - frequency_penalty: penalize repeated tokens in the output
+    - seed: a fixed seed for reproducibility; -1 will mean 'random'
     """
+
     print(f"Received message: {message}")
     print(f"History: {history}")
     print(f"System message: {system_message}")
     print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-    print(f"Featured Model (chosen): {featured_model}")
-    print(f"Custom Model (if any): {custom_model}")

-    # Decide which model to use. If the user typed a custom model, we use that.
-    # Otherwise, we use the featured model they picked from the radio.
-    if custom_model.strip():
-        model_to_use = custom_model.strip()
-    else:
-        model_to_use = featured_model
-
-    print(f"Final model to use: {model_to_use}")
-
-    # Convert seed to None if -1 => means random
+    # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None

-    # Prepare the conversation
+    # Construct the messages array required by the API
     messages = [{"role": "system", "content": system_message}]
+
+    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
@@ -73,301 +59,66 @@ def respond(
         messages.append({"role": "assistant", "content": assistant_part})
         print(f"Added assistant message to context: {assistant_part}")

-    # Add the latest user message
+    # Append the latest user message
     messages.append({"role": "user", "content": message})

-    # Generate the response in a streaming manner
+    # Start with an empty string to build the response as tokens stream in
     response = ""
-    print("Sending request to HF Inference API via OpenAI-like client.")
+    print("Sending request to OpenAI API.")
+
+    # Make the streaming request to the HF Inference API via openai-like client
     for message_chunk in client.chat.completions.create(
-        model=model_to_use,
+        model="meta-llama/Llama-3.3-70B-Instruct",  # You can update this to your specific model
         max_tokens=max_tokens,
-        stream=True,
+        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
-        frequency_penalty=frequency_penalty,
-        seed=seed,
+        frequency_penalty=frequency_penalty,  # <-- NEW
+        seed=seed,  # <-- NEW
         messages=messages,
     ):
+        # Extract the token text from the response chunk
         token_text = message_chunk.choices[0].delta.content
         print(f"Received token: {token_text}")
        response += token_text
-        # Yield partial responses to get streaming in Gradio
         yield response

     print("Completed response generation.")

-
-# ----------------------------
-# DEFINE THE GRADIO INTERFACE
-# ----------------------------
-def build_demo():
-    """
-    Build the entire Gradio Blocks interface, featuring:
-    - A Tab for the chatbot (with featured models, custom model)
-    - An Information tab with model table, parameter overview, etc.
-    """
-    # Define your placeholder featured models
-    featured_models_list = [
-        "meta-llama/Llama-3.3-70B-Instruct",
-        "Qwen/Qwen2.5-7B-Instruct",
-        "google/gemma-2-2b-it",
-        "microsoft/Phi-3-mini-4k-instruct",
-    ]
-
-    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-        gr.Markdown("## Serverless Text Generation Hub")
-
-        with gr.Tabs():
-            # -------------------- CHAT TAB --------------------
-            with gr.Tab("Chat"):
-                with gr.Row():
-                    with gr.Column():
-                        # "Featured Models" Accordion
-                        with gr.Accordion("Featured Models", open=False):
-                            model_search = gr.Textbox(
-                                label="Filter Featured Models",
-                                placeholder="Search featured models...",
-                                lines=1,
-                            )
-                            # Radio for selecting a featured model
-                            featured_models = gr.Radio(
-                                label="Pick a Featured Model",
-                                choices=featured_models_list,
-                                value=featured_models_list[0],
-                                interactive=True,
-                            )
-
-                        # Function to filter the model list by search text
-                        def filter_models(search_term):
-                            filtered = [
-                                m
-                                for m in featured_models_list
-                                if search_term.lower() in m.lower()
-                            ]
-                            return gr.update(choices=filtered)
-
-                        # Update the radio choices when user enters text in the search box
-                        model_search.change(
-                            filter_models,
-                            inputs=model_search,
-                            outputs=featured_models,
-                        )
-
-                        # "Custom Model" text box
-                        custom_model = gr.Textbox(
-                            label="Custom Model",
-                            placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
-                            lines=1,
-                        )
-                        gr.Markdown(
-                            "If you provide a custom model path above, it will override your featured model selection."
-                        )
-
-                    with gr.Column():
-                        # Create the Gradio Chatbot
-                        chatbot = gr.Chatbot(height=600, label="Chat Output")
-
-                        # Additional controls for system prompt & generation parameters
-                        with gr.Box():
-                            system_message = gr.Textbox(
-                                value="",
-                                label="System message",
-                                placeholder="System-level instruction or context here...",
-                            )
-                            max_tokens = gr.Slider(
-                                minimum=1,
-                                maximum=4096,
-                                value=512,
-                                step=1,
-                                label="Max new tokens",
-                            )
-                            temperature = gr.Slider(
-                                minimum=0.1,
-                                maximum=4.0,
-                                value=0.7,
-                                step=0.1,
-                                label="Temperature",
-                            )
-                            top_p = gr.Slider(
-                                minimum=0.1,
-                                maximum=1.0,
-                                value=0.95,
-                                step=0.05,
-                                label="Top-P",
-                            )
-                            frequency_penalty = gr.Slider(
-                                minimum=-2.0,
-                                maximum=2.0,
-                                value=0.0,
-                                step=0.1,
-                                label="Frequency Penalty",
-                            )
-                            seed = gr.Slider(
-                                minimum=-1,
-                                maximum=65535,
-                                value=-1,
-                                step=1,
-                                label="Seed (-1 for random)",
-                            )
-
-                # We will attach a ChatInterface-like set of controls manually.
-                # Keep track of conversation state
-                state = gr.State([])  # Holds conversation as a list of (user, assistant)
-
-                # Define "user" event function
-                def user_message(user_text, history):
-                    """
-                    When the user sends a message, add it to history as (user_text, "")
-                    The assistant's response will fill the second part of the tuple later.
-                    """
-                    if not user_text:
-                        return gr.update(), history
-                    new_history = history + [(user_text, "")]  # user question, empty answer
-                    return gr.update(value=""), new_history
-
-                # Define "bot" event function
-                def bot_message(history, system_message, max_tokens, temperature, top_p,
-                                frequency_penalty, seed, featured_models, custom_model):
-                    """
-                    Generate assistant reply given the entire chat history,
-                    system prompt, and generation params. The function will stream
-                    tokens from respond().
-                    """
-                    user_text = history[-1][0] if history else ""
-                    # We'll call respond() as a generator, so we can stream back tokens.
-                    bot_stream = respond(
-                        message=user_text,
-                        history=history[:-1],
-                        system_message=system_message,
-                        max_tokens=max_tokens,
-                        temperature=temperature,
-                        top_p=top_p,
-                        frequency_penalty=frequency_penalty,
-                        seed=seed,
-                        featured_model=featured_models,
-                        custom_model=custom_model,
-                    )
-                    # We'll build up the assistant's reply token by token
-                    final_assistant_text = ""
-                    for token in bot_stream:
-                        final_assistant_text = token
-                        # We yield partial updates to the chatbot
-                        yield history[:-1] + [(user_text, final_assistant_text)]
-                    # Once complete, update the conversation in state
-                    history[-1] = (user_text, final_assistant_text)
-                    yield history
-
-                # Textbox for the user to type a message
-                with gr.Row():
-                    with gr.Column(scale=8):
-                        user_textbox = gr.Textbox(
-                            label="Your message",
-                            placeholder="Type your question or prompt here...",
-                            lines=2,
-                            interactive=True,
-                        )
-                    with gr.Column(scale=2):
-                        send_button = gr.Button(
-                            value="Send",
-                            variant="primary"
-                        )
-
-                # When user clicks "Send", first call user_message(), then bot_message()
-                send_button.click(
-                    fn=user_message,
-                    inputs=[user_textbox, state],
-                    outputs=[user_textbox, state],
-                ).then(
-                    fn=bot_message,
-                    inputs=[
-                        state,
-                        system_message,
-                        max_tokens,
-                        temperature,
-                        top_p,
-                        frequency_penalty,
-                        seed,
-                        featured_models,
-                        custom_model,
-                    ],
-                    outputs=chatbot,
-                )
-
-            # -------------------- INFORMATION TAB --------------------
-            with gr.Tab("Information"):
-                # Put information about featured models
-                with gr.Accordion("Featured Models", open=False):
-                    gr.HTML(
-                        """
-                        <table style="width:100%; text-align:center; margin:auto;">
-                            <tr>
-                                <th>Model Name</th>
-                                <th>Description</th>
-                                <th>Status</th>
-                            </tr>
-                            <tr>
-                                <td>meta-llama/Llama-3.3-70B-Instruct</td>
-                                <td>Powerful large model by Llama, fine-tuned to follow instructions.</td>
-                                <td>✅</td>
-                            </tr>
-                            <tr>
-                                <td>Qwen/Qwen2.5-7B-Instruct</td>
-                                <td>Instruction-tuned LLM with good accuracy and speed.</td>
-                                <td>✅</td>
-                            </tr>
-                            <tr>
-                                <td>google/gemma-2-2b-it</td>
-                                <td>Compact 2B parameter model for quick text generation tasks.</td>
-                                <td>✅</td>
-                            </tr>
-                            <tr>
-                                <td>microsoft/Phi-3-mini-4k-instruct</td>
-                                <td>Small but effective model, optimized for instruction-based tasks.</td>
-                                <td>✅</td>
-                            </tr>
-                        </table>
-                        """
-                    )
-
-                # Put general parameter info
-                with gr.Accordion("Parameters Overview", open=False):
-                    gr.Markdown(
-                        """
-                        ## Parameters Overview
-                        - **System Message**
-                          This is a special prompt that sets the behavior or context for the AI.
-
-                        - **Max New Tokens**
-                          The maximum length of the AI's reply in tokens.
-
-                        - **Temperature**
-                          Controls how random or "creative" the model is. A higher value yields more unexpected outputs.
-
-                        - **Top-P**
-                          Nucleus sampling — only the tokens whose probabilities add up to `top_p` or higher are kept for generation.
-
-                        - **Frequency Penalty**
-                          Discourages the model from repeating tokens that already appeared.
-
-                        - **Seed**
-                          For reproducible outputs. If set to `-1`, a random seed is chosen each time.
-
-                        ### Model Selection
-                        - **Featured Models**
-                          A curated set of recommended or widely-used LLMs you can pick from.
-                        - **Custom Model**
-                          If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override.
-
-                        ***
-                        Feel free to experiment with different settings to see how they affect the response!
-                        """
-                    )
-
-    return demo
-
-# Actually build and launch the app
+# Create a Chatbot component with a specified height
+chatbot = gr.Chatbot(height=600)
+print("Chatbot interface created.")
+
+# Create the Gradio ChatInterface
+# We add two new sliders for Frequency Penalty and Seed
+demo = gr.ChatInterface(
+    respond,
+    additional_inputs=[
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
+        gr.Slider(
+            minimum=-2.0,
+            maximum=2.0,
+            value=0.0,
+            step=0.1,
+            label="Frequency Penalty"
+        ),
+        gr.Slider(
+            minimum=-1,
+            maximum=65535,  # Arbitrary upper limit for demonstration
+            value=-1,
+            step=1,
+            label="Seed (-1 for random)"
+        ),
+    ],
+    fill_height=True,
+    chatbot=chatbot,
+    theme="Nymbo/Nymbo_Theme",
+)
+print("Gradio interface initialized.")

 if __name__ == "__main__":
     print("Launching the demo application.")
-    demo = build_demo()
     demo.launch()
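The committed file can be exercised without the Gradio UI. Below is a minimal smoke test of the new respond() generator; this sketch is not part of the commit, and it assumes app.py is importable from the current directory, that HF_TOKEN is set to a token with Inference API access, and that the prompt strings are purely illustrative.

# smoke_test.py -- minimal check of the streaming generator (a sketch, not from the commit)
from app import respond

partial = ""
for partial in respond(
    message="Say hello in one sentence.",   # illustrative prompt
    history=[],                             # no previous (user, assistant) turns
    system_message="You are a concise assistant.",
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
    frequency_penalty=0.0,
    seed=-1,                                # -1 is mapped to None, i.e. a random seed
):
    pass                                    # each `partial` is the reply accumulated so far

print(partial)                              # the final, complete reply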
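One caveat about the streaming loop in the new file: with OpenAI-style streaming, a chunk's delta.content can be None (the final chunk often is), in which case response += token_text raises a TypeError. A more defensive variant of the loop, as a sketch (the helper name and structure are not from the commit):

# stream_reply: a defensive rewrite of the loop inside respond() (a sketch)
def stream_reply(client, messages, **gen_kwargs):
    """Yield the accumulating reply, tolerating chunks whose delta is empty."""
    response = ""
    for chunk in client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct",
        stream=True,
        messages=messages,
        **gen_kwargs,  # e.g. max_tokens, temperature, top_p, frequency_penalty, seed
    ):
        token_text = chunk.choices[0].delta.content or ""  # coerce None to ""
        response += token_text
        yield response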
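Since the commit's main point is the new Frequency Penalty and Seed controls, a small check of the seed behavior may also be useful. Again a sketch rather than part of the commit: the assertion is only expected to hold when the serving backend actually honors the seed parameter.

# seed_check.py -- does a fixed seed reproduce the same reply? (a sketch)
from app import respond

args = dict(history=[], system_message="", max_tokens=64,
            temperature=0.7, top_p=0.95, frequency_penalty=0.0, seed=42)

first = list(respond("Tell me a fun fact.", **args))[-1]
second = list(respond("Tell me a fun fact.", **args))[-1]
assert first == second  # expected when the backend honors `seed`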