import gradio as gr
from openai import OpenAI
import os
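# A minimal Gradio chat frontend for the Hugging Face serverless Inference
# API, accessed through the OpenAI-compatible client. The UI exposes the
# common sampling parameters (temperature, top-p, frequency penalty, seed)
# plus a featured-model dropdown with a free-text override.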

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    model,
    custom_model
):
    """
    Handles the chatbot response with given parameters.
    """
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Model: {model}, Custom Model: {custom_model}")

    # Use custom model if provided, else use selected model
    selected_model = custom_model.strip() if custom_model.strip() else model
    print(f"Selected model: {selected_model}")

    # Construct the messages array required by the API
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history to the context
    for val in history:
        user_part = val[0]
        assistant_part = val[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})

    # Build the assistant reply incrementally as tokens stream in
    response = ""
    print("Sending request to the Inference API.")

    # Make the streaming request to the HF Inference API via the OpenAI client
    for message_chunk in client.chat.completions.create(
        model=selected_model,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=int(seed) if seed != -1 else None,
        messages=messages,
    ):
        # Extract the token text from the chunk; some chunks (such as the
        # initial role header or the final chunk) carry no content, so
        # guard against None before appending
        token_text = message_chunk.choices[0].delta.content
        if token_text:
            print(f"Received token: {token_text}")
            response += token_text
            # Yield the full history plus the in-progress reply so the
            # Chatbot component can render the stream as it arrives
            yield history + [(message, response)]

    print("Completed response generation.")


# Define the featured models for the dropdown
models_list = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "bigscience/bloom",
    "EleutherAI/gpt-j-6b",
    "facebook/opt-30b",
    "google/flan-t5-xxl",
]

# Function to filter models based on user input
def filter_models(search_term):
    return [m for m in models_list if search_term.lower() in m.lower()]
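
# filter_models is not wired into the UI below; one way to hook it up (a
# sketch, assuming a `model_search` Textbox placed next to the dropdown)
# would be:
#
#   model_search.change(
#       fn=lambda term: gr.update(choices=filter_models(term)),
#       inputs=model_search,
#       outputs=model,
#   )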

# Gradio interface
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    with gr.Row():
        chatbot = gr.Chatbot(height=600)

    with gr.Tab("Chat Interface"):
        with gr.Row():
            user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
        with gr.Row():
            system_message = gr.Textbox(value="", label="System Message")
        with gr.Row():
            max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max Tokens")
            temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
        with gr.Row():
            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-P")
            frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
            seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")
        with gr.Row():
            model = gr.Dropdown(label="Select a Model", choices=models_list, value="meta-llama/Llama-3.3-70B-Instruct")
            custom_model = gr.Textbox(label="Custom Model", placeholder="Enter custom model path")
        with gr.Row():
            run_button = gr.Button("Generate Response")

    with gr.Tab("Information"):
        with gr.Accordion("Featured Models", open=False):
            gr.HTML(
                """
                <table>
                    <tr><th>Model Name</th><th>Description</th></tr>
                    <tr><td>meta-llama/Llama-3.3-70B-Instruct</td><td>Instruction-tuned LLaMA model</td></tr>
                    <tr><td>bigscience/bloom-176b</td><td>Multilingual large language model</td></tr>
                    <tr><td>gpt-j-6b</td><td>Open-source GPT model</td></tr>
                    <tr><td>opt-30b</td><td>Meta's OPT model</td></tr>
                    <tr><td>flan-t5-xxl</td><td>Google's Flan-tuned T5 XXL</td></tr>
                </table>
                """
            )
        with gr.Accordion("Parameters Overview", open=False):
            gr.Markdown(
                """
                ### Parameters Overview
                - **Max Tokens**: Maximum number of tokens in the response.
                - **Temperature**: Controls the randomness of responses. Lower values make the output more deterministic.
                - **Top-P**: Controls the diversity of responses by limiting the token selection to a probability mass.
                - **Frequency Penalty**: Penalizes repeated tokens in the output.
                - **Seed**: Fixes randomness for reproducibility. Use -1 for a random seed.
                """
            )

    run_button.click(
        respond,
        inputs=[
            user_input,
            chatbot,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
            model,
            custom_model
        ],
        outputs=chatbot
    )

print("Launching the demo application.")
demo.launch()
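
# When running locally, Gradio can also expose a temporary public URL via
# demo.launch(share=True).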