# Source: Hugging Face Space "Nymbo/Serverless-TextGen-Hub" (status: Running; file size: 8,292 bytes)

import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    # Report the real state instead of claiming success: os.getenv returns
    # None when HF_TOKEN is unset, and that None is passed on as the API key.
    print("Warning: HF_TOKEN environment variable is not set.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    selected_model,
):
    """
    Stream a chat completion for `message`, yielding the reply as it grows.

    Parameters:
    - message: the user's new message
    - history: the list of previous messages, each as a pair (user_msg, assistant_msg);
      empty/None halves of a pair are left out of the request
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random' and is sent as None
    - selected_model: the model to use for generating the response

    Yields:
    - the accumulated response text (a str), once for every streamed chunk
      that carries text. Nothing is yielded if the stream carries no text.
    """

    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Selected model: {selected_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Construct the messages array required by the API
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history to the context
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})

    # Start with an empty string to build the response as tokens stream in
    response = ""
    print("Sending request to OpenAI API.")

    # Make the streaming request to the HF Inference API via openai-like client
    stream = client.chat.completions.create(
        model=selected_model,  # Use the selected model
        max_tokens=max_tokens,
        stream=True,  # Stream the response
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    )
    for message_chunk in stream:
        # A chunk may arrive without any choices; there is nothing to read from it.
        if not message_chunk.choices:
            continue

        # Extract the token text from the response chunk. `delta.content` is
        # optional (e.g. role-only or finish chunks), so skip chunks without
        # text instead of concatenating None onto the response.
        token_text = message_chunk.choices[0].delta.content
        if not token_text:
            continue

        print(f"Received token: {token_text}")
        response += token_text
        yield response

    print("Completed response generation.")

# Create a standalone Chatbot component with height=600.
# NOTE(review): the name `chatbot` is assigned again inside the "Chat" tab of
# the Blocks layout further down, and the event wiring refers to that later
# object, so this instance looks unused -- confirm before removing it.
chatbot = gr.Chatbot(height=600)
print("Chatbot interface created.")

# Model IDs offered in the "Select a model below" dropdown. The same list is
# what the "Filter Models" search box narrows down, and the first entry is
# also the dropdown's initial value.
featured_models = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "google/flan-t5-xl",
    "facebook/bart-large-cnn",
    "EleutherAI/gpt-neo-2.7B",
    # Add more featured models here
]

# Create the Gradio Blocks interface
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # Tab for model selection
    with gr.Tab("Models"):
        with gr.Row():
            with gr.Column():
                with gr.Accordion("Featured Models", open=True):
                    model_search = gr.Textbox(label="Filter Models", placeholder="Search for a featured model...", lines=1)
                    model = gr.Dropdown(label="Select a model below", choices=featured_models, value="meta-llama/Llama-3.3-70B-Instruct", interactive=True)

                    def filter_models(search_term):
                        """Return a dropdown update limited to featured models containing `search_term` (case-insensitive)."""
                        # Lower-case the search term once; treat a missing value as "no filter".
                        needle = (search_term or "").lower()
                        filtered_models = [m for m in featured_models if needle in m.lower()]
                        return gr.update(choices=filtered_models)

                    model_search.change(filter_models, inputs=model_search, outputs=model)

                # When filled in, this ID is used instead of the dropdown selection (see `bot`).
                custom_model = gr.Textbox(label="Custom Model", placeholder="Enter a custom model ID here", interactive=True)

    # Tab for chat interface
    with gr.Tab("Chat"):
        with gr.Row():
            with gr.Column():
                # `container` is passed to the constructor rather than through the
                # `.style()` method, which newer Gradio releases no longer provide.
                txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter", container=False)

        # Additional parameters
        with gr.Row():
            with gr.Column():
                system_message = gr.Textbox(label="System Message", value="", lines=3)
                max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max New Tokens")
                temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
                frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
                seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")

        # Chatbot display
        chatbot = gr.Chatbot(height=600)

        # Submit button
        submit_btn = gr.Button("Submit")

    # Tab for information
    with gr.Tab("Information"):
        with gr.Row():
            gr.Markdown(
                """
                # Featured Models
                
                - **meta-llama/Llama-3.3-70B-Instruct**: A large language model from Meta.
                - **google/flan-t5-xl**: A pretrained encoder-decoder model from Google.
                - **facebook/bart-large-cnn**: A pretrained sequence-to-sequence model from Facebook.
                - **EleutherAI/gpt-neo-2.7B**: A large autoregressive language model from EleutherAI.
                
                # Parameters Overview
                
                - **System Message**: Sets the behavior and context for the assistant.
                - **Max New Tokens**: Limits the length of the generated response.
                - **Temperature**: Controls the randomness of the output. Higher values make output more random.
                - **Top-P**: Controls the diversity of text by selecting tokens that account for top-p probability mass.
                - **Frequency Penalty**: Decreases the model's likelihood to repeat the same lines.
                - **Seed**: Ensures reproducibility of results; set to -1 for random seed.
                """
            )

    # Function to handle chat submission
    def user(user_message, history):
        """
        Clear the input box and queue the submitted message as a pending turn.

        Returns a pair (new textbox value, new history). A pending turn is a
        `[user_message, None]` entry appended to the history; `bot` fills in the
        None. Blank submissions leave the history unchanged.
        """
        history = history or []  # tolerate a chatbot value of None
        if not user_message or not user_message.strip():
            return "", history
        return "", history + [[user_message, None]]

    # Function to process the chat
    def bot(history, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, selected_model, custom_model=""):
        """
        Fill in the assistant reply for the pending turn at the end of `history`.

        - history: chat turns as [user_msg, assistant_msg] pairs; the last one is
          expected to be the pending turn added by `user` (assistant_msg is None)
        - system_message, max_tokens, temperature, top_p, frequency_penalty, seed:
          forwarded unchanged to `respond`
        - selected_model: model ID chosen in the dropdown
        - custom_model: optional model ID typed by the user; when non-blank it is
          used instead of `selected_model`

        Returns the history with the last turn's reply set. If there is no
        pending turn (empty history, or the last turn already has a reply),
        the history is returned untouched and no request is made.
        """
        if not history or history[-1][1] is not None:
            return history

        # Get the last user message
        user_message = history[-1][0]

        # A non-blank custom model ID overrides the dropdown selection
        model_id = (custom_model or "").strip() or selected_model

        # Generate response
        response_iter = respond(
            user_message,
            history[:-1],  # Exclude the last user message which doesn't have a response yet
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            model_id,
        )
        # Collect the entire response: each item yielded by `respond` is the
        # text accumulated so far, so the last one is the complete reply.
        # NOTE(review): this waits for the full reply before updating the UI;
        # turning `bot` into a generator would stream it, but that presumably
        # depends on the Gradio queue being enabled -- confirm before changing.
        full_response = ""
        for resp in response_iter:
            full_response = resp
        # Update history with the bot's response
        history[-1][1] = full_response
        return history

    # Set up the chat flow: pressing Enter and clicking Submit run the same
    # two steps -- `user` queues the message, then `bot` produces the reply.
    bot_inputs = [chatbot, system_message, max_tokens, temperature, top_p, frequency_penalty, seed, model, custom_model]
    txt.submit(user, [txt, chatbot], [txt, chatbot], queue=False).then(
        bot, bot_inputs, chatbot
    )
    submit_btn.click(user, [txt, chatbot], [txt, chatbot], queue=False).then(
        bot, bot_inputs, chatbot
    )

# Logged after the `with gr.Blocks(...)` block above has finished executing.
print("Gradio interface initialized.")

# Launch the app only when this file is run as a script, not when it is imported.
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()