# Source: Hugging Face Spaces — Nymbo / Serverless-TextGen-Hub (status: Running)
# File size: 12,414 Bytes

import gradio as gr
from openai import OpenAI
import os

# --------------------------------------------------------------------------------
#  Serverless-TextGen-Hub
#  This application is a Gradio-based UI for text generation using
#  Hugging Face's serverless Inference API. We also incorporate features
#  inspired by the ImgGen-Hub, such as:
#    - A "Featured Models" accordion with text filtering.
#    - A "Custom Model" textbox for specifying a non-featured model.
#    - An "Information" tab with accordions for "Featured Models" and 
#      "Parameters Overview" containing helpful user guides.
# --------------------------------------------------------------------------------

# Retrieve the access token from environment variables
ACCESS_TOKEN = os.getenv("HF_TOKEN")  # HF_TOKEN is your Hugging Face Inference API key
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    # os.getenv returns None when the variable is unset (and "" is equally
    # unusable), so report that instead of claiming the token was loaded.
    print("Warning: HF_TOKEN environment variable is not set or is empty.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    # NEW inputs for model selection
    model_search,
    selected_model,
    custom_model
):
    """
    Stream a chat completion for the newest user message.

    This is a generator: after every streamed chunk that carries text it
    yields the full response accumulated so far.

    Parameters:
    - message: The user's newest message (string).
    - history: The list of previous messages in the conversation, each as a tuple (user_msg, assistant_msg).
    - system_message: The system prompt provided.
    - max_tokens: The maximum number of tokens to generate in the response.
    - temperature: Sampling temperature (float).
    - top_p: Top-p (nucleus) sampling (float).
    - frequency_penalty: Penalize repeated tokens in the output (float).
    - seed: A fixed seed for reproducibility; -1 means 'random'.
    - model_search: The text used to filter the "Featured Models" Radio button list (only logged here).
    - selected_model: The model selected via the "Featured Models" Radio button.
    - custom_model: If not empty, overrides selected_model with this custom path.

    Yields:
    - The response text accumulated so far (string).
    """

    # DEBUG LOGGING
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Model search text: {model_search}")
    print(f"Selected featured model: {selected_model}")
    print(f"Custom model (overrides if not empty): {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Determine the final model name to use.
    # A non-blank custom_model wins; otherwise fall back to the Radio selection.
    # `or ""` keeps a missing (None) textbox value from crashing on .strip().
    custom_name = (custom_model or "").strip()
    model_to_use = custom_name or selected_model

    # Construct the messages array required by the OpenAI-like HF API
    messages = [{"role": "system", "content": system_message}]  # System prompt
    # Add conversation history to context, skipping empty halves of a turn
    for user_part, assistant_part in ((turn[0], turn[1]) for turn in history):
        if user_part:
            messages.append({"role": "user", "content": user_part})
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})

    # Append the latest user message
    messages.append({"role": "user", "content": message})

    # Start with an empty string to build the response as tokens stream in
    response = ""
    print(f"Using model: {model_to_use}")
    print("Sending request to OpenAI API...")

    # Make the streaming request to the HF Inference API via openai-like client
    stream = client.chat.completions.create(
        model=model_to_use,             # dynamically selected above
        max_tokens=max_tokens,
        stream=True,                    # Stream the response
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    )
    for message_chunk in stream:
        # Some chunks carry no choices at all; nothing to extract from them.
        if not message_chunk.choices:
            continue
        # delta.content is None on chunks that carry no text (e.g. role-only
        # or finish chunks); concatenating None to a str would raise TypeError.
        token_text = message_chunk.choices[0].delta.content
        if token_text is None:
            continue
        response += token_text
        # As we get new tokens, we stream them back to the user
        yield response

    print("Completed response generation.")

# Chatbot display component (height=600), created at module level and later
# passed to gr.ChatInterface through its `chatbot` argument.
chatbot = gr.Chatbot(height=600)

# ------------------------------------------------------------
# Below: We define the UI with additional features integrated.
# We'll replicate some of the style from the ImgGen-Hub code:
#  - A "Featured Models" accordion with the ability to filter
#  - A "Custom Model" text box
#  - An "Information" tab with a "Featured Models" table and
#    a "Parameters Overview" containing markdown descriptions.
# ------------------------------------------------------------

# Placeholder "Featured Models" for demonstration. This list is both the
# initial set of Radio choices and the pool that filter_models searches.
# NOTE(review): entries are sent as-is as the `model` of a chat-completions
# request — confirm each one is actually served by that endpoint.
featured_models_list = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "meta-llama/Llama-2-70B-chat-hf",
    "meta-llama/Llama-2-13B-chat-hf",
    "bigscience/bloom",
    "google/flan-t5-xxl",
]

# Callback for the "Filter Models" textbox: narrows the Radio choices
def filter_models(search_term):
    """
    Return a Gradio update whose choices are the featured models matching 'search_term'.

    A model matches when 'search_term' occurs anywhere in its name,
    compared case-insensitively. The original ordering of
    featured_models_list is preserved.
    """
    matches = []
    for model_name in featured_models_list:
        if search_term.lower() in model_name.lower():
            matches.append(model_name)
    return gr.update(choices=matches)

print("Initializing Gradio interface...")  # Debug log

# Build the whole UI inside a single Blocks context named `demo`.
# Layout: a Markdown heading, a "Chat" tab (model pickers + chat interface),
# and an "Information" tab (static help content).
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:

    # Top-level heading for clarity
    gr.Markdown("# Serverless-TextGen-Hub\nA Comprehensive UI for Text Generation")

    with gr.Tab("Chat"):
        # The model-selection widgets are created first, then reused as
        # additional inputs of the ChatInterface further down.

        # Collapsed-by-default accordion holding the filter box and the model list
        with gr.Accordion("Featured Models", open=False):
            with gr.Row():
                # Free-text filter for the Radio choices below
                model_search = gr.Textbox(
                    label="Filter Models", 
                    placeholder="Search for a featured model...",
                    lines=1,
                )
            with gr.Row():
                # Radio list of featured models; starts with the full list and
                # the first entry preselected
                model_radio = gr.Radio(
                    label="Select a featured model below",
                    choices=featured_models_list,
                    value="meta-llama/Llama-3.3-70B-Instruct",
                    interactive=True,
                )
            # Whenever the filter text changes, filter_models recomputes the
            # Radio choices from featured_models_list
            model_search.change(
                filter_models,
                inputs=model_search,
                outputs=model_radio
            )

        # Textbox for specifying a custom model that overrides the featured selection if not empty
        custom_model = gr.Textbox(
            label="Custom Model Path (overrides Featured Models if not empty)",
            placeholder="e.g. meta-llama/Llama-2-13B-chat-hf",
            lines=1
        )

        # Build the chat interface itself.
        # additional_inputs is listed in the same order as respond's parameters
        # after (message, history): system_message, max_tokens, temperature,
        # top_p, frequency_penalty, seed, model_search, selected_model, custom_model.
        # Keep the two in sync when adding or reordering inputs.
        chatbot_interface = gr.ChatInterface(
            fn=respond,  # The function that generates the text
            additional_inputs=[
                gr.Textbox(
                    value="You are a helpful AI assistant.",
                    label="System message",
                    lines=2
                ),  # system_message
                gr.Slider(minimum=1,   maximum=4096, value=512,  step=1,   label="Max new tokens"),  # max_tokens
                # NOTE(review): a maximum of 4.0 may exceed what the backend accepts —
                # confirm against the API's documented temperature range.
                gr.Slider(minimum=0.1, maximum=4.0,   value=0.7,  step=0.1, label="Temperature"),      # temperature
                gr.Slider(minimum=0.1, maximum=1.0,   value=0.95, step=0.05,label="Top-P"),           # top_p
                gr.Slider(
                    minimum=-2.0,
                    maximum=2.0,
                    value=0.0,
                    step=0.1,
                    label="Frequency Penalty"
                ),  # frequency_penalty
                gr.Slider(
                    minimum=-1,
                    maximum=65535,
                    value=-1,
                    step=1,
                    label="Seed (-1 for random)"
                ),  # seed (respond maps -1 to None)
                model_search,  # model_search: only logged by respond, not used for model choice
                model_radio,   # selected_model
                custom_model   # custom_model: overrides selected_model when non-blank
            ],
            chatbot=chatbot,  # the module-level gr.Chatbot created earlier
            title="Serverless-TextGen-Hub",
            # The fill_height ensures the chat area expands
            fill_height=True
        )

    # A second tab with static help text about the featured models and parameters
    with gr.Tab("Information"):
        gr.Markdown("## Learn More About the Parameters and Models")
        
        # Accordion for "Featured Models": a hand-written HTML table of examples
        # (static content; it is not generated from featured_models_list)
        with gr.Accordion("Featured Models (WiP)", open=False):
            gr.HTML(
                """
                <p>Below is a small table of example models. In practice, you can pick from 
                thousands of available text generation models on Hugging Face. 
                <br>
                Use the <b>Filter Models</b> box under the <b>Featured Models</b> accordion 
                in the Chat tab to search by name, or enter a <b>Custom Model</b> path.</p>
                <table style="width:100%; text-align:center; margin:auto;">
                    <tr>
                        <th>Model Name</th>
                        <th>Is It Large?</th>
                        <th>Notes</th>
                    </tr>
                    <tr>
                        <td>meta-llama/Llama-3.3-70B-Instruct</td>
                        <td>Yes</td>
                        <td>Placeholder example</td>
                    </tr>
                    <tr>
                        <td>meta-llama/Llama-2-13B-chat-hf</td>
                        <td>Medium</td>
                        <td>Placeholder example</td>
                    </tr>
                    <tr>
                        <td>google/flan-t5-xxl</td>
                        <td>Yes</td>
                        <td>Placeholder example</td>
                    </tr>
                </table>
                """
            )

        # Accordion for "Parameters Overview": one Markdown section per control
        # exposed in the Chat tab
        with gr.Accordion("Parameters Overview", open=False):
            gr.Markdown(
                """
                ### Max New Tokens
                Controls how many tokens can be generated in the response. A token is roughly a word or a piece of a word. If you need longer answers, increase this.

                ### Temperature
                A higher temperature makes the AI more 'creative' and random in its responses. Lower temperature keeps it more focused and deterministic.

                ### Top-P
                This is 'nucleus sampling.' It dictates the proportion of probability mass the model considers. At 1.0, it considers all words. Lower it to focus on the most likely words.

                ### Frequency Penalty
                Penalizes repeated tokens in the output. If you see a lot of repetition, increase this slightly to reduce the repetition.

                ### Seed
                If set to -1, the randomness is different each time. Setting a specific number ensures the same result each run, making responses reproducible.

                ### Custom Model
                If this field is filled, it overrides the selection from Featured Models. This way, you can try out any model on the HF Hub, e.g. 
                <code>meta-llama/Llama-2-70B-chat-hf</code> or <code>bigscience/bloom</code>.
                """
            )

print("Gradio interface initialized.")

# ------------------------------------------------------------
# Entry point: launch the app only when this file is executed
# as a script, not when it is imported as a module.
# ------------------------------------------------------------
if __name__ == "__main__":
    print("Launching the demo application...")
    # Start the Gradio app for the `demo` Blocks with default launch settings
    demo.launch()