import os

import gradio as gr
from openai import OpenAI

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI client against the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")


def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    model_selection,
    custom_model,
):
    """Stream the chatbot's reply, yielding the partial response as it grows."""
    # A non-empty custom model overrides the featured-model selection
    selected_model = custom_model.strip() if custom_model.strip() else model_selection
    print(f"Selected model: {selected_model}")

    # A seed of -1 means "random", so omit it from the request
    if seed == -1:
        seed = None

    # Convert the Gradio history into the OpenAI chat-message format
    messages = [{"role": "system", "content": system_message}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if assistant_turn:
            messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    # Stream tokens back to the UI as they arrive
    response = ""
    for message_chunk in client.chat.completions.create(
        model=selected_model,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    ):
        # delta.content can be None on some chunks (e.g. the final one)
        token_text = message_chunk.choices[0].delta.content or ""
        response += token_text
        yield response


# Create a Chatbot component with a specified height
chatbot = gr.Chatbot(height=600)

# List of featured models offered in the UI
featured_models = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "gpt2",
    "bert-base-uncased",
    "facebook/bart-base",
    "google/flan-t5-base",
]

# Build the Gradio interface
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.Markdown("# Serverless Text Generation Hub")

    with gr.Tab("Basic Settings"):
        with gr.Row():
            with gr.Column():
                # Textbox for the system message
                system_message = gr.Textbox(value="", label="System message")
        with gr.Row():
            with gr.Column():
                # Featured-model selection with a search filter
                with gr.Accordion("Featured Models", open=True):
                    model_search = gr.Textbox(
                        label="Filter Models",
                        placeholder="Search for a featured model...",
                    )
                    model = gr.Radio(
                        label="Select a model",
                        choices=featured_models,
                        value="meta-llama/Llama-3.3-70B-Instruct",
                    )

                    def filter_models(search_term):
                        # Keep only the models whose names contain the search term
                        filtered_models = [
                            m for m in featured_models
                            if search_term.lower() in m.lower()
                        ]
                        return gr.update(choices=filtered_models)

                    model_search.change(filter_models, inputs=model_search, outputs=model)
        with gr.Row():
            with gr.Column():
                # Free-form override for any model name
                custom_model = gr.Textbox(
                    label="Custom Model",
                    placeholder="Enter a custom model name",
                )

    with gr.Tab("Advanced Settings"):
        with gr.Row():
            max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
        with gr.Row():
            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
            frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
        with gr.Row():
            seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")

    with gr.Tab("Information"):
        with gr.Accordion("Featured Models", open=False):
            gr.Markdown(
                """
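The models below are featured in this Space; any other model name can be entered in the Custom Model box under Basic Settings.
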
Model Name | Description |
---|---|
meta-llama/Llama-3.3-70B-Instruct | Highly capable Llama model |
gpt2 | Generative Pre-trained Transformer 2 |
bert-base-uncased | Bidirectional Encoder Representations from Transformers |
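facebook/bart-base | Denoising sequence-to-sequence model |
google/flan-t5-base | Instruction-tuned T5 model |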