import json
import os
import time

import torch
import gradio as gr
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

# Environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "0"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"


# Load model and tokenizer
def load_model_and_tokenizer(model_name, dtype):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Llama 3 tokenizers do not define a pad token, so register one explicitly
    special_tokens = {"pad_token": "<PAD>"}
    tokenizer.add_special_tokens(special_tokens)
    config = AutoConfig.from_pretrained(model_name)

    # Map the dtype string to the corresponding torch dtype
    if dtype == "bf16":
        dtype = torch.bfloat16
    elif dtype == "fp16":
        dtype = torch.float16
    elif dtype == "fp32":
        dtype = torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        model_name, config=config, torch_dtype=dtype, device_map="auto"
    )

    # Grow the embedding matrix if the added special token enlarged the vocabulary
    if len(tokenizer) > model.get_input_embeddings().weight.shape[0]:
        model.resize_token_embeddings(len(tokenizer))

    # Decoder-only models are padded on the left for generation
    tokenizer.padding_side = "left"
    model.config.pad_token_id = tokenizer.pad_token_id

    return model, tokenizer


# Format response
def format_response(dialog, response):
    formatted_dialog = dialog.copy()
    formatted_dialog.append({"role": "assistant", "content": response})
    return formatted_dialog


# Load questions
def load_questions(prompts_path, num_questions, custom_question):
    with open(prompts_path, "r") as file:
        dialogs = json.load(file)

    if custom_question and custom_question.strip():
        custom_dialog = [{"role": "user", "content": custom_question}]
        dialogs.insert(0, custom_dialog)

    dialogs = dialogs[:num_questions]
    return dialogs


# Inference
def infer(model_name, dialogs, num_new_tokens, temperature, dtype):
    model, tokenizer = load_model_and_tokenizer(model_name, dtype)

    batch_inputs = [
        tokenizer.apply_chat_template(dialog, tokenize=False, add_generation_prompt=True)
        for dialog in dialogs
    ]

    responses = []
    # Process one prompt at a time
    for i in range(len(dialogs)):
        batch = batch_inputs[i:i + 1]
        encoded_inputs = tokenizer(batch, padding=True, truncation=False, return_tensors="pt")
        input_ids = encoded_inputs["input_ids"].to(model.device)
        attention_mask = encoded_inputs["attention_mask"].to(model.device)

        with torch.no_grad():
            output_tokens = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=num_new_tokens,
                do_sample=True,
                temperature=temperature,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        decoded_outputs = tokenizer.batch_decode(output_tokens, skip_special_tokens=True)
        for j, response in enumerate(decoded_outputs):
            original_dialog = dialogs[i + j]
            formatted_response = format_response(original_dialog, response)
            responses.append(formatted_response)

        # Release cached GPU memory between prompts
        torch.cuda.empty_cache()

    results = {"Responses": responses}
    return results


# Demo function
def demo(num_new_tokens, temperature, num_questions, custom_question):
    dialogs = load_questions("chats_sys_none.json", num_questions, custom_question)
    results = infer(
        "NousResearch/Meta-Llama-3-8B-Instruct", dialogs, num_new_tokens, temperature, "fp16"
    )
    return results
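# Note: the structure of "chats_sys_none.json" is not spelled out in the script;
# the description below is inferred from load_questions() and infer() above.
# The file is expected to hold a JSON list of dialogs, each dialog being a list
# of chat messages in the Hugging Face chat-template format, with no "system"
# entry (as the filename suggests). An illustrative example (made-up questions):
#
# [
#     [{"role": "user", "content": "What is the capital of France?"}],
#     [{"role": "user", "content": "Explain beam search in one sentence."}]
# ]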
# Load JSON data
with open("chats_sys_none.json", "r") as file:
    json_data = json.load(file)
json_data_str = json.dumps(json_data, indent=2)


# Show JSON function
def show_json():
    return json_data_str


# Gradio interface
interface = gr.Interface(
    fn=demo,
    inputs=[
        gr.Slider(label="Number of New Tokens", minimum=1, maximum=1024, step=1, value=512),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.4),
        gr.Slider(label="Number of Questions", minimum=20, maximum=100, step=1, value=20),
        gr.Textbox(label="Custom Question", placeholder="Type your custom question here..."),
    ],
    outputs=[
        gr.JSON(label="Responses"),
    ],
    title="LLM Inference Demo",
    description="A demo for running LLM inference using Gradio and Hugging Face.",
    live=False,
)

json_interface = gr.Interface(
    fn=show_json,
    inputs=[],
    outputs=[
        gr.HTML("<pre>{}</pre>".format(json_data_str)),
    ],
    live=False,
)

app = gr.Blocks()

with app:
    with gr.Tab("LLM Inference Demo"):
        interface.render()
    with gr.Tab("Show JSON"):
        json_interface.render()

if __name__ == "__main__":
    app.launch()