# Hugging Face Space: Nymbo/Serverless-TextGen-Hub (status: Running)
# File size: 10,078 bytes

import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    # Report the real state: claiming the token was "loaded" when the variable
    # is missing hides the most common cause of authentication failures.
    print("Warning: HF_TOKEN is not set; requests to the Inference API may be rejected.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    custom_model,
    selected_featured_model
):
    """
    Stream a chat completion for the user's latest message.

    Parameters:
    - message: the user's new message
    - history: the list of previous messages, each as a pair (user_msg, assistant_msg);
      None or an empty list means "no prior turns"
    - system_message: the system prompt
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
    - custom_model: the user-provided custom model name (if any); overrides the
      featured model when it contains non-whitespace text
    - selected_featured_model: the model selected from featured models

    Yields:
    - the response text accumulated so far, once per received token; if the API
      call raises, a single "An error occurred: ..." string is yielded instead.
    """

    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Custom model: {custom_model}")
    print(f"Selected featured model: {selected_featured_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Determine which model to use: either custom_model or selected featured model.
    # `or ""` keeps a missing (None) custom model from crashing on .strip().
    custom_name = (custom_model or "").strip()
    if custom_name:
        model_to_use = custom_name
        print(f"Using Custom Model: {model_to_use}")
    else:
        model_to_use = selected_featured_model
        print(f"Using Featured Model: {model_to_use}")

    # Construct the messages array required by the API
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history to the context; either half of a turn may be
    # empty/None (e.g. a user message that has no reply yet) and is skipped.
    for user_part, assistant_part in history or []:
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})

    # Start with an empty string to build the response as tokens stream in
    response = ""
    print("Sending request to OpenAI API.")

    try:
        # Make the streaming request to the HF Inference API via openai-like client
        stream = client.chat.completions.create(
            model=model_to_use,              # Use either the user-provided custom model or selected featured model
            max_tokens=max_tokens,
            stream=True,                     # Stream the response
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            seed=seed,
            messages=messages,
        )
        for message_chunk in stream:
            # Some chunks carry no choices at all; there is nothing to show for them.
            if not message_chunk.choices:
                continue
            # Extract the token text from the response chunk. Role-only and
            # final chunks have content None; concatenating that would raise
            # TypeError and replace a good answer with an error message.
            token_text = message_chunk.choices[0].delta.content
            if not token_text:
                continue
            print(f"Received token: {token_text}")
            response += token_text
            # Yield the partial response to Gradio so it can display in real-time
            yield response
    except Exception as e:
        # UI boundary: surface the failure in the chat instead of crashing the handler.
        print(f"Error during API call: {e}")
        yield f"An error occurred: {e}"

    print("Completed response generation.")

# Create a Chatbot component with a specified height
# NOTE(review): nothing in the visible file reads `chatbot`; the Blocks layout
# below creates and wires its own `chatbot_component`. Confirm before removing.
chatbot = gr.Chatbot(height=600)
print("Chatbot interface created.")

# Placeholder featured models list
# These names are offered as the choices of the "Featured Models" radio group;
# the first entry is used as its default selection.
FEATURED_MODELS_LIST = [
    "meta-llama/Llama-3.1-8B-Instruct",
    "microsoft/Phi-3.5-mini-instruct",
    "mistralai/Mistral-7B-Instruct-v0.3",
    "Qwen/Qwen2.5-72B-Instruct",
]

# Define the Gradio Blocks interface
# Layout, top to bottom: title and welcome text, chat window, system prompt,
# message box with Send button, generation settings, featured-model picker.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown("# Serverless-TextGen-Hub 📝🤖")
    gr.Markdown(
        """
        Welcome to the **Serverless-TextGen-Hub**! Chat with your favorite models seamlessly.
        """
    )
    
    with gr.Row():
        # Chatbot component (serves as both the history input and the output of the chat handler)
        chatbot_component = gr.Chatbot(height=600)

    with gr.Row():
        # System message input (sent as the "system" role message on every request)
        system_message = gr.Textbox(
            value="You are a helpful assistant.",
            label="System Message",
            placeholder="Enter system message here...",
            lines=2,
        )

    with gr.Row():
        # User message input
        user_message = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here...",
            lines=2,
        )
        # Run button
        run_button = gr.Button("Send", variant="primary")

    with gr.Row():
        # Additional settings (each value is passed straight through to respond())
        with gr.Column(scale=1):
            max_tokens = gr.Slider(
                minimum=1,
                maximum=4096,
                value=512,
                step=1,
                label="Max New Tokens",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=4.0,
                value=0.7,
                step=0.1,
                label="Temperature",
            )
            top_p = gr.Slider(
                minimum=0.1,
                maximum=1.0,
                value=0.95,
                step=0.05,
                label="Top-P",
            )
            frequency_penalty = gr.Slider(
                minimum=-2.0,
                maximum=2.0,
                value=0.0,
                step=0.1,
                label="Frequency Penalty",
            )
            # -1 is the sentinel that respond() converts to seed=None (random)
            seed = gr.Slider(
                minimum=-1,
                maximum=65535,  # Arbitrary upper limit for demonstration
                value=-1,
                step=1,
                label="Seed (-1 for random)",
            )
            # Non-blank text here takes precedence over the radio selection in respond()
            custom_model = gr.Textbox(
                value="",
                label="Custom Model",
                info="(Optional) Provide a custom Hugging Face model path. This will override the selected featured model if not empty.",
                placeholder="e.g., meta-llama/Llama-3.3-70B-Instruct",
            )

    with gr.Accordion("Featured Models", open=True):
        with gr.Column():
            # Free-text filter; its change event narrows the radio choices below
            model_search = gr.Textbox(
                label="Filter Models",
                placeholder="Search for a featured model...",
                lines=1,
            )
            # Model picker; defaults to the first entry of FEATURED_MODELS_LIST
            featured_model = gr.Radio(
                label="Select a model below",
                value=FEATURED_MODELS_LIST[0],
                choices=FEATURED_MODELS_LIST,
                interactive=True,
            )
    # Narrow the featured-model radio group to entries matching the search box
    def filter_featured_models(search_term):
        """Build a radio update listing the featured models that contain *search_term*.

        Matching is a case-insensitive substring test. A blank search restores
        the full list. The selection is always reset to the first listed model,
        or to None when nothing matches.
        """
        if search_term:
            needle = search_term.lower()
            matches = [name for name in FEATURED_MODELS_LIST if needle in name.lower()]
            first_match = matches[0] if matches else None
            return gr.update(choices=matches, value=first_match)
        # Blank search: show everything and fall back to the first entry.
        return gr.update(choices=FEATURED_MODELS_LIST, value=FEATURED_MODELS_LIST[0])

    # Re-run the filter every time the search text changes
    model_search.change(fn=filter_featured_models, inputs=model_search, outputs=featured_model)

    # Function to handle the chatbot response
    def handle_response(message, history, system_msg, max_tok, temp, tp, freq_pen, sd, custom_mod, selected_feat_mod):
        """Stream the assistant's reply into the chat window.

        Parameters:
        - message: the text the user just submitted
        - history: the chatbot's current value, a list of (user, assistant) pairs, or None
        - system_msg, max_tok, temp, tp, freq_pen, sd, custom_mod, selected_feat_mod:
          forwarded unchanged to respond() as system_message, max_tokens,
          temperature, top_p, frequency_penalty, seed, custom_model and
          selected_featured_model

        Yields:
        - the full chat history to display: the prior turns plus the new turn,
          whose assistant half grows as respond() streams partial text.

        NOTE(review): generator handlers rely on Gradio's request queue; this is
        presumably on by default in Gradio 4+, while 3.x would need
        `demo.queue()` before launch — verify against the pinned Gradio version.
        """
        # Copy so the component's own value is never mutated in place.
        prior_turns = list(history or [])

        # Show the user's message immediately, before the first token arrives.
        yield prior_turns + [(message, None)]

        # respond() is a generator of partial replies. It appends `message` to the
        # context itself, so it must receive only the prior turns — passing the
        # new turn as well would send the user message to the model twice.
        for partial_reply in respond(
            message=message,
            history=prior_turns,
            system_message=system_msg,
            max_tokens=max_tok,
            temperature=temp,
            top_p=tp,
            frequency_penalty=freq_pen,
            seed=sd,
            custom_model=custom_mod,
            selected_featured_model=selected_feat_mod,
        ):
            yield prior_turns + [(message, partial_reply)]

    # Inputs shared by the Send button and the Enter-key submit, in the
    # positional order handle_response expects.
    chat_inputs = [
        user_message,
        chatbot_component,        # history
        system_message,
        max_tokens,
        temperature,
        top_p,
        frequency_penalty,
        seed,
        custom_model,
        featured_model,
    ]

    # Handle button click
    run_button.click(
        fn=handle_response,
        inputs=chat_inputs,
        outputs=chatbot_component,    # the handler yields one value: the updated history
    )

    # Allow pressing Enter to send the message
    user_message.submit(
        fn=handle_response,
        inputs=chat_inputs,
        outputs=chatbot_component,    # the handler yields one value: the updated history
    )

    # On page load, run a browser-side snippet that appends a <style> element:
    # it hides the footer and sets the container background colour.
    # NOTE(review): the `_js` keyword looks version-specific (newer Gradio
    # releases appear to name it `js`) — confirm against the installed version.
    inject_style_js = """
    () => {
        const style = document.createElement('style');
        style.innerHTML = `
            footer {visibility: hidden !important;}
            .gradio-container {background-color: #f9f9f9;}
        `;
        document.head.appendChild(style);
    }
    """
    # The Python callback is a no-op with no inputs or outputs; only the JS matters.
    demo.load(fn=lambda: None, inputs=None, outputs=None, _js=inject_style_js)

print("Launching Gradio interface...")  # Debug log

# Launch the Gradio interface without showing the API or sharing externally
# NOTE(review): this runs at module level with no `if __name__ == "__main__":`
# guard, so importing this file also starts the server.
demo.launch(show_api=False, share=False)