import os
import gradio as gr
from openai import OpenAI
# Load your Hugging Face Inference API token from environment
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")
# Initialize the OpenAI-like client that points to the HF Inference endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,  # Selected from "Featured Models" radio
    custom_model,  # Optional user-provided custom model path
):
"""
Respond to user messages using the Hugging Face Inference API with OpenAI-like syntax.
Parameters:
- message (str): The latest user message
- history (list of tuples): The conversation history [(user_msg, assistant_msg), ...]
- system_message (str): System-level instruction or context
- max_tokens (int): Max tokens to generate
- temperature (float): Sampling temperature
- top_p (float): Nucleus sampling (top-p)
- frequency_penalty (float): Penalize repeated tokens
- seed (int): Fixed seed; if -1 => random
- featured_model (str): The featured model name selected in the UI
- custom_model (str): A custom model path (HF repo) provided by the user
"""
print(f"Received message: {message}")
print(f"History: {history}")
print(f"System message: {system_message}")
print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
print(f"Featured Model (chosen): {featured_model}")
print(f"Custom Model (if any): {custom_model}")
# Decide which model to use. If the user typed a custom model, we use that.
# Otherwise, we use the featured model they picked from the radio.
if custom_model.strip():
model_to_use = custom_model.strip()
else:
model_to_use = featured_model
print(f"Final model to use: {model_to_use}")
# Convert seed to None if -1 => means random
if seed == -1:
seed = None
# Prepare the conversation
messages = [{"role": "system", "content": system_message}]
for val in history:
user_part = val[0]
assistant_part = val[1]
if user_part:
messages.append({"role": "user", "content": user_part})
print(f"Added user message to context: {user_part}")
if assistant_part:
messages.append({"role": "assistant", "content": assistant_part})
print(f"Added assistant message to context: {assistant_part}")
# Add the latest user message
messages.append({"role": "user", "content": message})
# Generate the response in a streaming manner
response = ""
print("Sending request to HF Inference API via OpenAI-like client.")
    for message_chunk in client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # Some stream chunks (e.g. the final one) carry no content; skip them
        token_text = message_chunk.choices[0].delta.content
        if token_text:
            print(f"Received token: {token_text}")
            response += token_text
            # Yield partial responses to get streaming in Gradio
            yield response

    print("Completed response generation.")
# ----------------------------
# DEFINE THE GRADIO INTERFACE
# ----------------------------
def build_demo():
    """
    Build the entire Gradio Blocks interface, featuring:
    - A Tab for the chatbot (with featured models, custom model)
    - An Information tab with model table, parameter overview, etc.
    """
    # Define your placeholder featured models
    featured_models_list = [
        "meta-llama/Llama-3.3-70B-Instruct",
        "Qwen/Qwen2.5-7B-Instruct",
        "google/gemma-2-2b-it",
        "microsoft/Phi-3-mini-4k-instruct",
    ]

    with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
        gr.Markdown("## Serverless Text Generation Hub")

        with gr.Tabs():
            # -------------------- CHAT TAB --------------------
            with gr.Tab("Chat"):
                with gr.Row():
                    with gr.Column():
                        # "Featured Models" Accordion
                        with gr.Accordion("Featured Models", open=False):
                            model_search = gr.Textbox(
                                label="Filter Featured Models",
                                placeholder="Search featured models...",
                                lines=1,
                            )
                            # Radio for selecting a featured model
                            featured_models = gr.Radio(
                                label="Pick a Featured Model",
                                choices=featured_models_list,
                                value=featured_models_list[0],
                                interactive=True,
                            )

                            # Function to filter the model list by search text
                            def filter_models(search_term):
                                filtered = [
                                    m
                                    for m in featured_models_list
                                    if search_term.lower() in m.lower()
                                ]
                                return gr.update(choices=filtered)

                            # Update the radio choices when the user enters text in the search box
                            model_search.change(
                                filter_models,
                                inputs=model_search,
                                outputs=featured_models,
                            )

                        # "Custom Model" text box
                        custom_model = gr.Textbox(
                            label="Custom Model",
                            placeholder="Paste a Hugging Face repo path, e.g. 'myuser/my-model'",
                            lines=1,
                        )
                        gr.Markdown(
                            "If you provide a custom model path above, it will override your featured model selection."
                        )

                    with gr.Column():
                        # Create the Gradio Chatbot
                        chatbot = gr.Chatbot(height=600, label="Chat Output")

                        # Additional controls for system prompt & generation parameters
                        # (gr.Group replaces gr.Box, which was removed in Gradio 4.x)
                        with gr.Group():
                            system_message = gr.Textbox(
                                value="",
                                label="System message",
                                placeholder="System-level instruction or context here...",
                            )
                            max_tokens = gr.Slider(
                                minimum=1,
                                maximum=4096,
                                value=512,
                                step=1,
                                label="Max new tokens",
                            )
                            temperature = gr.Slider(
                                minimum=0.1,
                                maximum=4.0,
                                value=0.7,
                                step=0.1,
                                label="Temperature",
                            )
                            top_p = gr.Slider(
                                minimum=0.1,
                                maximum=1.0,
                                value=0.95,
                                step=0.05,
                                label="Top-P",
                            )
                            frequency_penalty = gr.Slider(
                                minimum=-2.0,
                                maximum=2.0,
                                value=0.0,
                                step=0.1,
                                label="Frequency Penalty",
                            )
                            seed = gr.Slider(
                                minimum=-1,
                                maximum=65535,
                                value=-1,
                                step=1,
                                label="Seed (-1 for random)",
                            )

                # We will attach a ChatInterface-like set of controls manually.
                # Keep track of conversation state
                state = gr.State([])  # Holds conversation as a list of (user, assistant)

                # Define "user" event function
                def user_message(user_text, history):
                    """
                    When the user sends a message, add it to history as (user_text, "").
                    The assistant's response will fill the second part of the tuple later.
                    """
                    if not user_text:
                        return gr.update(), history
                    new_history = history + [(user_text, "")]  # user question, empty answer
                    return gr.update(value=""), new_history

                # Define "bot" event function
                def bot_message(history, system_message, max_tokens, temperature, top_p,
                                frequency_penalty, seed, featured_models, custom_model):
                    """
                    Generate the assistant reply given the entire chat history,
                    system prompt, and generation params. The function streams
                    tokens from respond().
                    """
                    # Nothing to do if there is no pending user message
                    if not history:
                        yield history
                        return

                    user_text = history[-1][0]

                    # Call respond() as a generator so tokens can be streamed back
                    bot_stream = respond(
                        message=user_text,
                        history=history[:-1],
                        system_message=system_message,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        top_p=top_p,
                        frequency_penalty=frequency_penalty,
                        seed=seed,
                        featured_model=featured_models,
                        custom_model=custom_model,
                    )

                    # Build up the assistant's reply token by token
                    final_assistant_text = ""
                    for token in bot_stream:
                        final_assistant_text = token
                        # Yield partial updates so the chatbot streams
                        yield history[:-1] + [(user_text, final_assistant_text)]

                    # Once complete, update the conversation in state
                    history[-1] = (user_text, final_assistant_text)
                    yield history

                # Textbox for the user to type a message
                with gr.Row():
                    with gr.Column(scale=8):
                        user_textbox = gr.Textbox(
                            label="Your message",
                            placeholder="Type your question or prompt here...",
                            lines=2,
                            interactive=True,
                        )
                    with gr.Column(scale=2):
                        send_button = gr.Button(
                            value="Send",
                            variant="primary",
                        )

                # When the user clicks "Send", first call user_message(), then bot_message()
                send_button.click(
                    fn=user_message,
                    inputs=[user_textbox, state],
                    outputs=[user_textbox, state],
                ).then(
                    fn=bot_message,
                    inputs=[
                        state,
                        system_message,
                        max_tokens,
                        temperature,
                        top_p,
                        frequency_penalty,
                        seed,
                        featured_models,
                        custom_model,
                    ],
                    outputs=chatbot,
                )

            # -------------------- INFORMATION TAB --------------------
            with gr.Tab("Information"):
                # Put information about featured models
                with gr.Accordion("Featured Models", open=False):
                    gr.HTML(
                        """
                        <table style="width:100%; text-align:center; margin:auto;">
                          <tr>
                            <th>Model Name</th>
                            <th>Description</th>
                            <th>Status</th>
                          </tr>
                          <tr>
                            <td>meta-llama/Llama-3.3-70B-Instruct</td>
                            <td>Powerful large model by Meta, fine-tuned to follow instructions.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>Qwen/Qwen2.5-7B-Instruct</td>
                            <td>Instruction-tuned LLM with good accuracy and speed.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>google/gemma-2-2b-it</td>
                            <td>Compact 2B-parameter model for quick text generation tasks.</td>
                            <td>✅</td>
                          </tr>
                          <tr>
                            <td>microsoft/Phi-3-mini-4k-instruct</td>
                            <td>Small but effective model, optimized for instruction-based tasks.</td>
                            <td>✅</td>
                          </tr>
                        </table>
                        """
                    )

                # Put general parameter info
                with gr.Accordion("Parameters Overview", open=False):
                    gr.Markdown(
                        """
                        ## Parameters Overview

                        - **System Message**
                          A special prompt that sets the behavior or context for the AI.
                        - **Max New Tokens**
                          The maximum length of the AI's reply, in tokens.
                        - **Temperature**
                          Controls how random or "creative" the model is. A higher value yields more unexpected outputs.
                        - **Top-P**
                          Nucleus sampling: the smallest set of tokens whose probabilities sum to at least `top_p` is kept for generation.
                        - **Frequency Penalty**
                          Discourages the model from repeating tokens that already appeared.
                        - **Seed**
                          For reproducible outputs. If set to `-1`, a random seed is chosen each time.

                        ### Model Selection

                        - **Featured Models**
                          A curated set of recommended or widely-used LLMs you can pick from.
                        - **Custom Model**
                          If you have a specific Hugging Face repo (e.g. `some-user/my-cool-model`), paste it here to override the featured selection.

                        ***

                        Feel free to experiment with different settings to see how they affect the response!
                        """
                    )

    return demo

# Actually build and launch the app
if __name__ == "__main__":
print("Launching the demo application.")
demo = build_demo()
demo.launch() |
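
# Usage sketch (assumptions: the `gradio` and `openai` packages are installed,
# an HF token with Inference API access is available, and the script is saved
# under the conventional Spaces filename app.py):
#
#   export HF_TOKEN=hf_xxxxxxxxxxxxxxxx
#   python app.py
#
# Gradio then prints a local URL to open in a browser.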