import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    model,
    custom_model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed
):
    """
    This function handles the chatbot response.
    """
    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"Model: {model}")
    print(f"Custom model: {custom_model}")
    print(f"System message: {system_message}")
    print(f"Parameters - Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")

    # A seed of -1 in the UI means "random": pass None so the API picks one
    if seed == -1:
        seed = None

    # Use the custom model path if one was entered, otherwise the radio selection
    selected_model = custom_model.strip() or model

    # Construct messages array
    messages = [{"role": "system", "content": system_message}]

    # Add conversation history (each entry is a (user, assistant) pair)
    for user_part, assistant_part in history:
        if user_part:
            messages.append({"role": "user", "content": user_part})
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})

    # Append latest message
    messages.append({"role": "user", "content": message})

    # Start with empty response
    response = ""
    print("Sending request to API.")

    # Make the streaming request
    for message_chunk in client.chat.completions.create(
        model=selected_model,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # A streamed chunk's delta may carry no content (e.g. the final chunk),
        # so skip empty deltas instead of concatenating None
        token_text = message_chunk.choices[0].delta.content
        if token_text:
            print(f"Received token: {token_text}")
            response += token_text
            yield response

    print("Completed response generation.")

# Create Chatbot component
chatbot = gr.Chatbot(height=600)
print("Chatbot interface created.")

# Define available models
models_list = [
    "meta-llama/Llama-2-70b-chat-hf",
    "meta-llama/Llama-2-13b-chat-hf",
    "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "mistralai/Mistral-7B-Instruct-v0.2",
    "HuggingFaceH4/zephyr-7b-beta",
]
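
# Note: availability of specific models on the serverless Inference API changes
# over time; any text-generation model that supports chat completions can be
# substituted here or entered via the "Custom Model" box.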

# Create the Gradio interface with tabs
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    with gr.Tab("Chat"):
        with gr.Row():
            with gr.Column():
                # Model selection accordion
                with gr.Accordion("Featured Models", open=True):
                    model_search = gr.Textbox(
                        label="Filter Models",
                        placeholder="Search for a model...",
                        lines=1
                    )
                    model = gr.Radio(
                        label="Select a model",
                        choices=models_list,
                        value="meta-llama/Llama-2-70b-chat-hf"
                    )
                
                # Custom model input
                custom_model = gr.Textbox(
                    label="Custom Model",
                    info="Enter Hugging Face model path (optional)",
                    placeholder="organization/model-name"
                )

                # System message and parameters
                system_message = gr.Textbox(label="System message")
                max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
                temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
                frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
                seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")

    with gr.Tab("Information"):
        with gr.Accordion("Featured Models", open=False):
            gr.HTML("""
            <p><a href="https://huggingface.co/models?pipeline_tag=text-generation&sort=trending">See all available models</a></p>
            <table style="width:100%; text-align:center; margin:auto;">
                <tr>
                    <th>Model Name</th>
                    <th>Parameters</th>
                    <th>Notes</th>
                </tr>
                <tr>
                    <td>Llama-2-70b-chat</td>
                    <td>70B</td>
                    <td>Meta's largest chat model</td>
                </tr>
                <tr>
                    <td>Mixtral-8x7B</td>
                    <td>47B</td>
                    <td>Mixture of Experts architecture</td>
                </tr>
                <tr>
                    <td>Mistral-7B</td>
                    <td>7B</td>
                    <td>Efficient base model</td>
                </tr>
            </table>
            """)

        with gr.Accordion("Parameters Overview", open=False):
            gr.Markdown("""
            ## System Message
            The system message sets the context and behavior for the AI assistant. It's like giving it a role or specific instructions.

            ## Max New Tokens
            Controls the maximum length of the generated response. Higher values allow for longer responses but take more time.

            ## Temperature
            Controls randomness in the response:
            - Lower (0.1-0.5): More focused and deterministic
            - Higher (0.7-1.0): More creative and varied

            ## Top-P
            Nucleus sampling parameter:
            - Lower values: More focused on likely tokens
            - Higher values: More diverse vocabulary usage

            ## Frequency Penalty
            Discourages repetition:
            - Negative: May allow more repetition
            - Positive: Encourages more diverse word choice

            ## Seed
            Controls randomness initialization:
            - -1: Random seed each time
            - Fixed value: Reproducible outputs
            """)

    # Function to filter models based on search
    def filter_models(search_term):
        filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
        return gr.update(choices=filtered_models)
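
    # Example: filter_models("mistral") narrows the radio choices to
    # ["mistralai/Mixtral-8x7B-Instruct-v0.1", "mistralai/Mistral-7B-Instruct-v0.2"]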

    # Connect the search box to the model filter function
    model_search.change(filter_models, inputs=model_search, outputs=model)

    # Create the chat interface
    chat_interface = gr.ChatInterface(
        respond,
        additional_inputs=[
            model,
            custom_model,
            system_message,
            max_tokens,
            temperature,
            top_p,
            frequency_penalty,
            seed,
        ],
        chatbot=chatbot,
    )

print("Gradio interface initialized.")

if __name__ == "__main__":
    print("Launching the demo application.")
    demo.launch(show_api=False, share=False)