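"""Gradio chat app that streams text-generation responses from the Hugging Face
Inference API through the OpenAI-compatible client.

Offers a set of featured models plus a custom Hugging Face model path, with the
usual sampling controls (temperature, top-p, frequency penalty, seed).
"""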
import gradio as gr
from openai import OpenAI
import os
import time
# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")
# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
base_url="https://api-inference.huggingface.co/v1/",
api_key=ACCESS_TOKEN,
)
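# Note: the OpenAI SDK only requires an OpenAI-compatible base_url, so the same
# client code works against other compatible endpoints by swapping base_url and api_key.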
print("OpenAI client initialized.")
def respond(
message,
history: list[tuple[str, str]],
system_message,
max_tokens,
temperature,
top_p,
frequency_penalty,
seed,
model_filter,
model,
custom_model
):
"""
This function handles the chatbot response. It takes in:
- message: the user's new message
- history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
- system_message: the system prompt
- max_tokens: the maximum number of tokens to generate in the response
- temperature: sampling temperature
- top_p: top-p (nucleus) sampling
- frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
- model_filter: search term to filter available models
- model: the selected model from the radio choices
- custom_model: manually entered HF model path
"""
print(f"Received message: {message}")
print(f"History: {history}")
print(f"System message: {system_message}")
print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
print(f"Model Filter: {model_filter}, Selected Model: {model}, Custom Model: {custom_model}")
# Convert seed to None if -1 (meaning random)
if seed == -1:
seed = None
# Construct the messages array required by the API
messages = [{"role": "system", "content": system_message}]
# Add conversation history to the context
for val in history:
user_part = val[0]
assistant_part = val[1]
if user_part:
messages.append({"role": "user", "content": user_part})
print(f"Added user message to context: {user_part}")
if assistant_part:
messages.append({"role": "assistant", "content": assistant_part})
print(f"Added assistant message to context: {assistant_part}")
# Append the latest user message
messages.append({"role": "user", "content": message})
    # Determine which model to use: a non-empty custom path overrides the featured selection
if custom_model.strip() != "":
api_model = custom_model.strip()
    else:
        # Map the featured-model labels to their Hugging Face model IDs
        featured_model_ids = {
            "Llama-3-70B-Instruct": "meta-llama/Llama-3.3-70B-Instruct",
            "Mistral-7B-Instruct-v0.2": "mistralai/Mistral-7B-Instruct-v0.2",
            "OpenHermes-2.5-Mistral-7B": "teknium/OpenHermes-2.5-Mistral-7B",
            "Phi-2": "microsoft/Phi-2",
        }
        api_model = featured_model_ids.get(model, "meta-llama/Llama-3.3-70B-Instruct")
print(f"Using model: {api_model}")
# Start with an empty string to build the response as tokens stream in
response = ""
print(f"Sending request to OpenAI API, using model {api_model}.")
    # Make the streaming request to the HF Inference API via the OpenAI-compatible client
for message_chunk in client.chat.completions.create(
model=api_model,
max_tokens=max_tokens,
stream=True, # Stream the response
temperature=temperature,
top_p=top_p,
frequency_penalty=frequency_penalty,
seed=seed,
messages=messages,
):
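        # Some backends emit keep-alive chunks with an empty `choices` list; skipping
        # them here is a defensive assumption rather than documented API behavior.
        if not message_chunk.choices:
            continue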
# Extract the token text from the response chunk
token_text = message_chunk.choices[0].delta.content
print(f"Received token: {token_text}")
# Check if token_text is None before appending
if token_text is not None:
response += token_text
yield response
print("Completed response generation.")
# Featured models offered in the radio selector (filterable via the search box)
models_list = [
"Llama-3-70B-Instruct",
"Mistral-7B-Instruct-v0.2",
"OpenHermes-2.5-Mistral-7B",
"Phi-2",
]
# Create a Chatbot component with a specified height
chatbot = gr.Chatbot(height=600)
print("Chatbot interface created.")
# Create the Gradio ChatInterface
demo = gr.ChatInterface(
respond,
additional_inputs=[
gr.Textbox(value="", label="System message"),
gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
gr.Slider(
minimum=-2.0,
maximum=2.0,
value=0.0,
step=0.1,
label="Frequency Penalty"
),
gr.Slider(
minimum=-1,
maximum=65535,
value=-1,
step=1,
label="Seed (-1 for random)"
),
gr.Textbox(label="Filter Featured Models", placeholder="Search...", lines=1),
gr.Radio(label="Select a Featured Model", choices=models_list, value="Llama-3-70B-Instruct"),
gr.Textbox(label="Custom Model", placeholder="Enter Hugging Face model path", lines=1),
],
additional_inputs_accordion=gr.Accordion("Advanced Parameters", open=False),
fill_height=True,
chatbot=chatbot,
theme="Nymbo/Nymbo_Theme",
)
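# The order of additional_inputs above is significant: respond() receives them
# positionally, and the event wiring further below refers to them by index
# (6 = filter textbox, 7 = featured-model radio).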
# Add the "Information" tab to the demo
with gr.Tab("Information", parent=demo):
with gr.Accordion("Featured Models", open=True):
gr.HTML(
"""
<table style="width:100%; text-align:center; margin:auto;">
<tr>
<th>Model Name</th>
<th>Provider</th>
<th>Notes</th>
</tr>
<tr>
<td>Llama-3-70B-Instruct</td>
<td>Meta</td>
<td>Powerful large language model.</td>
</tr>
<tr>
<td>Mistral-7B-Instruct-v0.2</td>
<td>Mistral AI</td>
<td>Efficient and versatile model.</td>
</tr>
<tr>
<td>OpenHermes-2.5-Mistral-7B</td>
<td>Teknium</td>
<td>Community-driven, fine-tuned model.</td>
</tr>
<tr>
<td>Phi-2</td>
<td>Microsoft</td>
<td>Compact yet powerful model.</td>
</tr>
</table>
"""
)
with gr.Accordion("Parameters Overview", open=False):
gr.Markdown(
"""
## System Message
###### The system message sets the behavior and persona of the chatbot. It's a way to provide context and instructions to the AI. For example, you can tell it to act as a helpful assistant, a storyteller, or any other role.
## Max New Tokens
###### This setting limits the length of the response generated by the AI. A higher number allows for longer, more detailed responses, while a lower number keeps the responses concise.
## Temperature
###### Temperature controls the randomness of the AI's output. A higher temperature makes the responses more creative and varied, while a lower temperature makes them more predictable and focused.
## Top-P (Nucleus Sampling)
###### Top-P sampling is a way to control the diversity of the AI's responses. It sets a threshold for the cumulative probability of the most likely next words. The AI then randomly selects from the words whose probabilities add up to this threshold. A lower Top-P value means less diversity.
## Frequency Penalty
###### Frequency penalty discourages the AI from repeating the same words or phrases too often in its responses. A higher penalty means the AI is less likely to repeat itself.
## Seed
###### The seed is a starting point for the random number generator that influences the AI's responses. If you set a specific seed, you'll get the same response every time you use that seed with the same prompt and settings. If you set it to -1, the AI will generate a new seed each time, leading to different responses.
## Featured Models
###### This section lists pre-selected models that are known to perform well. You can filter the list by typing in the search box.
## Custom Model
###### If you want to use a model that's not in the featured list, you can enter its Hugging Face model path here.
### Feel free to experiment with these settings to see how they affect the AI's responses. Happy chatting!
"""
)
# Filter the featured-model list as the user types in the search box
def filter_models(search_term):
    filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
    if not filtered_models:
        filtered_models = ["No matching models"]  # Feedback instead of an empty list
    return gr.update(choices=filtered_models)

# Wire the search box (additional_inputs[6]) to the model radio (additional_inputs[7]);
# event listeners must be registered inside the Blocks context
with demo:
    demo.additional_inputs[6].change(
        filter_models,
        inputs=demo.additional_inputs[6],
        outputs=demo.additional_inputs[7],
    )
print("Gradio interface initialized.")
if __name__ == "__main__":
print("Launching the demo application.")
    demo.queue().launch()