import gradio as gr
import torch
import spaces
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Hugging Face Hub identifier of the checkpoint served by this app
model_name = "CreitinGameplays/ConvAI-9b"

# bitsandbytes settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Tokenizer and quantized model are both loaded once, at import time
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
)

# Device that encoded prompts are moved to before generation
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# NOTE(review): the model is deliberately not moved with model.to(device);
# placement is left to from_pretrained (presumably because the weights are
# bitsandbytes-quantized — confirm).

# Conversation turns ("User: ..." / "Assistant: ...") shared by the whole process
chat_history = []

@spaces.GPU(duration=120)
def generate_text(user_prompt, top_p, top_k, temperature):
    """Generate the assistant's next reply and return the whole conversation.

    The prompt sent to the model is the system introduction followed by every
    turn stored in the module-level ``chat_history`` plus the new user turn.
    That history is a single list shared by every caller in this process.

    Args:
        user_prompt: Text of the user's new message.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.
        top_k: Top-k sampling cutoff forwarded to ``model.generate``.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        Every recorded turn joined with newlines, ending with the new
        ``Assistant: ...`` turn.
    """
    # System introduction
    system = "You are a helpful AI language model called ChatGPT, your goal is helping users with their questions."

    # Work on a copy of the history: the shared list is only updated once
    # generation has succeeded, so a failed call leaves no dangling user turn.
    user_turn = f"User: {user_prompt}"
    turns = chat_history + [user_turn]

    # Construct the full prompt with system introduction and all turns
    prompt = f"{system} </s> {' '.join(turns)} </s>"

    # Encode the entire prompt into tokens
    prompt_encoded = tokenizer.encode(prompt, return_tensors="pt").to(device)

    # max_length caps the total sequence (prompt + completion) at 1550 tokens.
    # UI sliders may deliver ints where floats are expected (and vice versa),
    # so the sampling parameters are normalised before being forwarded.
    output = model.generate(
        input_ids=prompt_encoded,
        max_length=1550,
        num_beams=1,
        num_return_sequences=1,
        do_sample=True,
        top_k=int(top_k),
        top_p=float(top_p),
        temperature=float(temperature),
        repetition_penalty=1.2,
    )

    # A causal LM returns the prompt followed by the completion. Decode only
    # the newly generated tokens so the reply does not echo the user's message
    # (splitting the full decoded text on "User:" kept the prompt text).
    prompt_length = prompt_encoded.shape[-1]
    assistant_response = tokenizer.decode(
        output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Commit both turns together now that generation succeeded
    chat_history.extend([user_turn, f"Assistant: {assistant_response}"])

    return "\n".join(chat_history)

def reset_history():
    """Replace the module-level chat history with a new empty list.

    Returns:
        A short confirmation message suitable for display in the UI.
    """
    global chat_history
    confirmation = "Chat history reset."
    # Rebind (rather than clear in place) so the global points at a fresh list
    chat_history = list()
    return confirmation

# Define the Gradio interface.
# gr.Blocks is used instead of gr.Interface so that a second button can be
# wired to reset_history. Generation runs only when "Submit" is clicked: every
# call appends to the shared chat history, so it must not fire on each input
# change.
with gr.Blocks() as interface:
    gr.Markdown("Interact with ConvAI (Loaded with Hugging Face Transformers)")

    with gr.Row():
        with gr.Column():
            prompt_box = gr.Textbox(label="Text Prompt", value="What's an AI?")
            top_p_slider = gr.Slider(0, 1, value=0.9, label="Top-p")
            top_k_slider = gr.Slider(1, 100, value=50, step=1, label="Top-k")
            temperature_slider = gr.Slider(0.01, 1, value=0.2, label="Temperature")
            submit_button = gr.Button("Submit")
            reset_button = gr.Button("Reset Chat History")
        with gr.Column():
            output_box = gr.Textbox(label="Conversation")

    # Run the model on the current inputs and show the full conversation
    submit_button.click(
        fn=generate_text,
        inputs=[prompt_box, top_p_slider, top_k_slider, temperature_slider],
        outputs=output_box,
    )
    # Clear the shared history and show the confirmation message
    reset_button.click(fn=reset_history, inputs=None, outputs=output_box)

# Launch the Gradio interface
interface.launch()