import gradio as gr
from openai import OpenAI
import os

# -------------------
# SERVERLESS-TEXTGEN-HUB
# -------------------
#
# This version has been updated to include an "Information" tab, declared before the Chat tab.
# The Information tab has two accordions:
#   - "Featured Models" which displays a simple table
#   - "Parameters Overview" which contains markdown describing the settings
#
# The Chat tab contains the existing chatbot UI.

# -------------------
# SETUP AND CONFIG
# -------------------

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
if ACCESS_TOKEN:
    print("Access token loaded.")
else:
    # os.getenv returns None when HF_TOKEN is unset; report that instead of
    # claiming success so a missing token is visible in the startup log.
    print("Warning: HF_TOKEN is not set; no access token was loaded.")

# Initialize the OpenAI-like client (Hugging Face Inference API) with your token
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    custom_model
):
    """
    Stream a chat completion for the latest user message.

    The function builds the messages array (optional system prompt, previous
    turns, new user message), sends it through the module-level ``client``
    with streaming enabled, and yields the reply accumulated so far each time
    a non-empty piece of text arrives.

    Parameters:
    - message: the user's new message
    - history: the list of previous messages, each as a pair (user_msg, assistant_msg)
    - system_message: the system prompt; a blank or missing value means no system turn is sent
    - max_tokens: the maximum number of tokens to generate in the response
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 is converted to None (no seed)
    - custom_model: the model name to use; blank or None falls back to
      "meta-llama/Llama-3.3-70B-Instruct"

    Yields:
    - the response text accumulated so far (always at least once, possibly "")
    """

    print(f"Received message: {message}")
    print(f"History: {history}")
    print(f"System message: {system_message}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
    print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Selected model (custom_model): {custom_model}")

    # Convert seed to None if -1 (meaning random)
    if seed == -1:
        seed = None

    # Construct the messages array required by the HF Inference API.
    # The UI documents an empty system box as "not needed", so a blank prompt
    # is omitted rather than sent as an empty system turn.
    messages = []
    if system_message and system_message.strip():
        messages.append({"role": "system", "content": system_message})
    print("Initial messages array constructed.")

    # Add conversation history to the context (tolerate a missing history)
    for user_part, assistant_part in history or []:
        if user_part:
            messages.append({"role": "user", "content": user_part})
            print(f"Added user message to context: {user_part}")
        if assistant_part:
            messages.append({"role": "assistant", "content": assistant_part})
            print(f"Added assistant message to context: {assistant_part}")

    # Append the latest user message
    messages.append({"role": "user", "content": message})
    print("Latest user message appended.")

    # If user provided a model, use that; otherwise, fall back to a default model.
    # `or ""` guards against the textbox delivering None instead of a string.
    model_to_use = (custom_model or "").strip() or "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Start with an empty string to build the streamed response
    response_text = ""
    print("Sending request to Hugging Face Inference API via OpenAI-like client...")

    # Make the streaming request to the HF Inference API
    stream = client.chat.completions.create(
        model=model_to_use,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        seed=seed,
        messages=messages,
    )

    for message_chunk in stream:
        # Some chunks carry no choices at all; there is nothing to display for them.
        if not message_chunk.choices:
            continue

        # Extract the token text from the response chunk
        token_text = message_chunk.choices[0].delta.content
        print(f"Received token: {token_text}")

        # delta.content may be None or "" (e.g. role-only or final chunks);
        # concatenating None would raise TypeError, so skip those chunks.
        if not token_text:
            continue

        response_text += token_text
        # Yield the partial response to Gradio so it can display in real-time
        yield response_text

    # Guarantee at least one yield so the caller always receives a reply,
    # even when the stream contained no text.
    if not response_text:
        yield response_text

    print("Completed response generation.")

# ----------------------
# BUILDING THE INTERFACE
# ----------------------

# We will use a "Blocks" layout with two tabs:
#   1) "Information" tab, which shows helpful info and a table of "Featured Models"
#   2) "Chat" tab, which holds our ChatInterface and associated controls

# Build the whole UI inside a single Blocks context. Components appear in the
# order they are created here, so statement order in this block matters.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:

    # -----------------
    # TAB: INFORMATION
    # -----------------
    with gr.Tab("Information"):
        # You can add instructions, disclaimers, or helpful text here
        gr.Markdown("## Welcome to Serverless-TextGen-Hub - Information")

        # Accordion for Featured Models (static HTML table, collapsed by default)
        with gr.Accordion("Featured Models (WiP)", open=False):
            gr.HTML(
                """
                <p><a href="https://huggingface.co/models?inference=warm&pipeline_tag=chat&sort=trending" target="_blank">See all available text models on Hugging Face</a></p>
                <table style="width:100%; text-align:center; margin:auto;">
                    <tr>
                        <th>Model Name</th>
                        <th>Supported</th>
                        <th>Notes</th>
                    </tr>
                    <tr>
                        <td>meta-llama/Llama-3.3-70B-Instruct</td>
                        <td>✅</td>
                        <td>Default model, if none is provided in the 'Custom Model' box.</td>
                    </tr>
                    <tr>
                        <td>meta-llama/Llama-3.2-3B-Instruct</td>
                        <td>✅</td>
                        <td>Smaller Llama-based instruct model for faster responses.</td>
                    </tr>
                    <tr>
                        <td>microsoft/Phi-3.5-mini-instruct</td>
                        <td>✅</td>
                        <td>A smaller instruct model from Microsoft.</td>
                    </tr>
                    <tr>
                        <td>Qwen/Qwen2.5-72B-Instruct</td>
                        <td>✅</td>
                        <td>Large-scale Qwen-based model.</td>
                    </tr>
                </table>
                """
            )

        # Accordion for Parameters Overview (static Markdown, collapsed by default)
        with gr.Accordion("Parameters Overview", open=False):
            gr.Markdown(
                """
                **Here is a brief overview of the main parameters for text generation:**

                - **Max Tokens**: The maximum number of tokens (think of these as word-pieces) the model will generate in its response.
                - **Temperature**: Controls how "creative" or random the output is. Lower values = more deterministic, higher values = more varied.
                - **Top-P**: Similar to temperature, but uses nucleus sampling. Top-P defines the probability mass of the tokens to sample from. For example, `top_p=0.9` means "use the top 90% probable tokens."
                - **Frequency Penalty**: A higher penalty discourages repeated tokens, helping reduce repetitive answers.
                - **Seed**: You can set a seed for deterministic results. `-1` means random each time.

                **Featured Models** can also be selected. If you want to override the model, you may specify a custom Hugging Face model path in the "Custom Model" text box.

                ---
                If you are new to text-generation parameters, the defaults are a great place to start!
                """
            )

    # -----------
    # TAB: CHAT
    # -----------
    with gr.Tab("Chat"):
        gr.Markdown("## Chat with the TextGen Model")

        # Create a Chatbot component with a specified height
        chatbot = gr.Chatbot(height=600)
        print("Chatbot interface created.")

        # Create textboxes and sliders for system prompt, tokens, and other parameters.
        # These are passed to ChatInterface below as additional inputs, in the
        # same order as the extra parameters of `respond`.
        system_message_box = gr.Textbox(
            value="",
            label="System message",
            info="You can use this to provide instructions or context to the assistant. Leave empty if not needed."
        )

        max_tokens_slider = gr.Slider(
            minimum=1,
            maximum=4096,
            value=512,
            step=1,
            label="Max new tokens",
            info="Controls the maximum length of the output. Keep an eye on your usage!"
        )

        temperature_slider = gr.Slider(
            minimum=0.1,
            maximum=4.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Controls creativity. Higher values = more random replies, lower = more deterministic."
        )

        top_p_slider = gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-P",
            info="Use nucleus sampling with probability mass cutoff. 1.0 includes all tokens."
        )

        frequency_penalty_slider = gr.Slider(
            minimum=-2.0,
            maximum=2.0,
            value=0.0,
            step=0.1,
            label="Frequency Penalty",
            info="Penalize repeated tokens to avoid repetition in output."
        )

        seed_slider = gr.Slider(
            minimum=-1,
            maximum=65535,
            value=-1,
            step=1,
            label="Seed (-1 for random)",
            info="Fixing a seed (0 to 65535) can make results reproducible. -1 picks a random seed each time."
        )

        # The custom_model_box is what the respond function sees as "custom_model"
        custom_model_box = gr.Textbox(
            value="",
            label="Custom Model",
            info="(Optional) Provide a custom Hugging Face model path. Overrides any selected featured model."
        )

        # Function to update the custom model box when a featured model is selected
        def set_custom_model_from_radio(selected):
            """Return the selected featured model name for the Custom Model box.

            An empty string is returned when nothing is selected so the
            textbox never receives None.
            """
            print(f"Featured model selected: {selected}")
            return selected or ""

        # The main ChatInterface call
        chat_interface = gr.ChatInterface(
            fn=respond,  # The function to handle responses
            additional_inputs=[
                system_message_box,
                max_tokens_slider,
                temperature_slider,
                top_p_slider,
                frequency_penalty_slider,
                seed_slider,
                custom_model_box
            ],
            fill_height=True,  # Let the chatbot fill the container height
            chatbot=chatbot,   # The Chatbot UI component
            # NOTE(review): the enclosing gr.Blocks already sets this same theme;
            # presumably it is redundant on a nested interface - confirm against
            # the Gradio docs before removing.
            theme="Nymbo/Nymbo_Theme",
        )

        # Logged only after the object exists (this message used to be printed
        # before the ChatInterface was constructed).
        print("ChatInterface object created.")

        print("Gradio interface for Chat created.")

        # -----------
        # ADDING THE "FEATURED MODELS" ACCORDION (Same logic as before)
        # -----------
        with gr.Accordion("Featured Models", open=False):
            model_search_box = gr.Textbox(
                label="Filter Models",
                placeholder="Search for a featured model...",
                lines=1
            )
            print("Model search box created.")

            # Sample list of popular text models
            models_list = [
                "meta-llama/Llama-3.3-70B-Instruct",
                "meta-llama/Llama-3.2-3B-Instruct",
                "meta-llama/Llama-3.2-1B-Instruct",
                "meta-llama/Llama-3.1-8B-Instruct",
                "NousResearch/Hermes-3-Llama-3.1-8B",
                "google/gemma-2-27b-it",
                "google/gemma-2-9b-it",
                "google/gemma-2-2b-it",
                "mistralai/Mistral-Nemo-Instruct-2407",
                "mistralai/Mixtral-8x7B-Instruct-v0.1",
                "mistralai/Mistral-7B-Instruct-v0.3",
                "Qwen/Qwen2.5-72B-Instruct",
                "Qwen/QwQ-32B-Preview",
                "PowerInfer/SmallThinker-3B-Preview",
                "HuggingFaceTB/SmolLM2-1.7B-Instruct",
                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                "microsoft/Phi-3.5-mini-instruct",
            ]
            print("Models list initialized.")

            featured_model_radio = gr.Radio(
                label="Select a model below",
                choices=models_list,
                value="meta-llama/Llama-3.3-70B-Instruct",
                interactive=True
            )
            print("Featured models radio button created.")

            def filter_models(search_term):
                """Return a Radio update listing the models that match the search term.

                Matching is a case-insensitive substring test against
                `models_list`. A missing term is treated as empty (matching
                every model) and surrounding whitespace is ignored.
                """
                print(f"Filtering models with search term: {search_term}")
                # Normalize once instead of lower-casing the term for every model.
                needle = (search_term or "").strip().lower()
                filtered = [m for m in models_list if needle in m.lower()]
                print(f"Filtered models: {filtered}")
                return gr.update(choices=filtered)

            # Re-filter the radio choices whenever the search text changes
            model_search_box.change(
                fn=filter_models,
                inputs=model_search_box,
                outputs=featured_model_radio
            )
            print("Model search box change event linked.")

            # Copy the chosen featured model into the Custom Model textbox
            featured_model_radio.change(
                fn=set_custom_model_from_radio,
                inputs=featured_model_radio,
                outputs=custom_model_box
            )
            print("Featured model radio button change event linked.")

print("Gradio interface initialized.")

# ------------------------
# MAIN ENTRY POINT
# ------------------------
if __name__ == "__main__":
    # Runs only when this file is executed as a script, not when it is imported.
    print("Launching the demo application.")
    # Start the Gradio app built in the `demo` Blocks object.
    demo.launch()