# NOTE(review): the text that preceded the code in this copy was not part of the program.
# It appears to be hosting-page residue: Space "Nymbo / Serverless-TextGen-Hub",
# status "Running", "File size: 7,780 Bytes", followed by a per-line commit gutter.
# Distinct commit ids seen in that gutter, in order of first appearance:
#   038f313 fab24df c5a20a4 880ced6 e13eb1b 27c8b8d 3a64d68 98674ca 7255410
#   be3f346 f7c4208 52ad57a 77298b9 542c2ac e7683ca 769901b 20be021 4f6c64c

import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    # Report the real state instead of claiming success; the token value itself
    # is never printed.
    print("Warning: HF_TOKEN is not set; requests to the Inference API may be rejected.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    custom_model
):
    """
    Stream a chat completion for the user's latest message.

    Parameters:
    - message: the user's new message
    - history: the list of previous messages, each as a pair (user_msg, assistant_msg);
      None is treated as an empty history
    - system_message: the system prompt; omitted from the request when empty or whitespace-only
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random' (sent as None)
    - custom_model: the model name to use; when empty or None the default
      "meta-llama/Llama-3.3-70B-Instruct" is used

    Yields:
    - the response text accumulated so far, once per streamed chunk that carries content
    """

    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Selected model (custom_model): {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Construct the messages array required by the API.
    # An empty system prompt carries no instruction, so it is left out rather
    # than sent as an empty system turn.
    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})

    # Add conversation history to the context; either half of a turn may be empty
    for turn in history or []:
        user_part = turn[0]
        assistant_part = turn[1]
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})

    # If user provided a model, use that; otherwise, fall back to a default.
    # `or ""` guards against the textbox delivering None.
    requested_model = (custom_model or "").strip()
    model_to_use = requested_model or "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Start with an empty string to build the response as tokens stream in
    response = ""
    print("Sending request to OpenAI API.")

    # Make the streaming request to the HF Inference API via openai-like client
    stream = client.chat.completions.create(
        model=model_to_use,              # Use either the user-provided or default model
        max_tokens=max_tokens,
        stream=True,                     # Stream the response
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    )
    for message_chunk in stream:
        # A chunk may arrive with no choices at all; there is nothing to append then
        if not message_chunk.choices:
            continue

        # Extract the token text from the response chunk
        token_text = message_chunk.choices[0].delta.content
        if token_text is None:
            # Content-less delta (e.g. role-only or finish chunk): appending None
            # to the response string would raise TypeError
            continue

        print(f"Received token: {token_text}")
        response += token_text
        # Yield the partial response to Gradio so it can display in real-time
        yield response

    print("Completed response generation.")

# -------------------------
# GRADIO UI CONFIGURATION
# -------------------------

# Chat display component (height=600)
chatbot = gr.Chatbot(height=600)
print("Chatbot interface created.")

# Extra controls: system prompt textbox plus one slider per sampling parameter
system_message_box = gr.Textbox(label="System message", value="")

max_tokens_slider = gr.Slider(label="Max new tokens", minimum=1, maximum=4096, step=1, value=512)
temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.7)
top_p_slider = gr.Slider(label="Top-P", minimum=0.1, maximum=1.0, step=0.05, value=0.95)
frequency_penalty_slider = gr.Slider(
    label="Frequency Penalty", minimum=-2.0, maximum=2.0, step=0.1, value=0.0
)
seed_slider = gr.Slider(label="Seed (-1 for random)", minimum=-1, maximum=65535, step=1, value=-1)

# This textbox supplies the "custom_model" argument of the respond function
custom_model_box = gr.Textbox(
    label="Custom Model",
    info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model.",
    value="",
)

# Callback for the 'Featured Models' radio: whatever it returns is written into `custom_model_box`
def set_custom_model_from_radio(selected):
    """
    Pass the selected featured-model name through unchanged.

    Parameters:
    - selected: the value currently chosen in the 'Featured Models' radio

    Returns:
    - the same value, to be shown in the Custom Model text box
    """
    chosen_model = selected
    return chosen_model

# Top-level chat application object
demo = gr.ChatInterface(
    fn=respond,
    chatbot=chatbot,
    theme="Nymbo/Nymbo_Theme",
    fill_height=True,
    # Extra inputs are listed in the same order as respond's parameters
    # that follow `message` and `history`
    additional_inputs=[
        system_message_box,
        max_tokens_slider,
        temperature_slider,
        top_p_slider,
        frequency_penalty_slider,
        seed_slider,
        custom_model_box,
    ],
)

# -----------
# ADDING THE "FEATURED MODELS" ACCORDION
# -----------
with demo:
    with gr.Accordion("Featured Models", open=False):
        model_search_box = gr.Textbox(
            label="Filter Models",
            placeholder="Search for a featured model...",
            lines=1
        )

        # Sample list of popular text models
        models_list = [
            "meta-llama/Llama-3.3-70B-Instruct",
            "meta-llama/Llama-3.2-3B-Instruct",
            "meta-llama/Llama-3.2-1B-Instruct",
            "meta-llama/Llama-3.1-8B-Instruct",
            "NousResearch/Hermes-3-Llama-3.1-8B",
            "google/gemma-2-27b-it",
            "google/gemma-2-9b-it",
            "google/gemma-2-2b-it",
            "mistralai/Mistral-Nemo-Instruct-2407",
            "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "mistralai/Mistral-7B-Instruct-v0.3",
            "Qwen/Qwen2.5-72B-Instruct",
            "Qwen/QwQ-32B-Preview",
            "PowerInfer/SmallThinker-3B-Preview",
            "HuggingFaceTB/SmolLM2-1.7B-Instruct",
            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            "microsoft/Phi-3.5-mini-instruct",
        ]

        featured_model_radio = gr.Radio(
            label="Select a model below",
            choices=models_list,
            value="meta-llama/Llama-3.3-70B-Instruct",
            interactive=True
        )

        # Filter function for the radio
        def filter_models(search_term):
            """
            Return a radio update listing the featured models that contain the search term.

            Matching is a case-insensitive substring test. A None, empty, or
            whitespace-only term matches every model, and surrounding whitespace
            in the term is ignored.
            """
            # Normalise once, outside the comprehension; `or ""` guards against None
            needle = (search_term or "").strip().lower()
            filtered = [m for m in models_list if needle in m.lower()]
            return gr.update(choices=filtered)

        # Whenever we type in the search box, update the radio with the filtered list
        model_search_box.change(
            fn=filter_models,
            inputs=model_search_box,
            outputs=featured_model_radio
        )

        # Whenever we select a featured model, populate the 'Custom Model' textbox
        featured_model_radio.change(
            fn=set_custom_model_from_radio,
            inputs=featured_model_radio,
            outputs=custom_model_box
        )

# Reached once all module-level UI setup above has run (on import as well as on direct execution)
print("Gradio interface initialized.")

# Start the app only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch()