import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = None
tokenizer = None
device = None


def init():
    """
    The init function is called once at startup to load the model into memory.
    """
    global model, tokenizer, device

    # Replace this with your model repository ID
    model_name_or_path = "0xroyce/NazareAI-Senior-Marketing-Strategist"

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

    # Load the model, using half precision on GPU to reduce memory usage
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True,
    )

    # Move the model to the available device and switch to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()


def inference(model_inputs: dict) -> dict:
    """
    This function is called for every request. The input is a dictionary with a
    'prompt' key; the output is a dictionary with a 'generated_text' key.
    """
    global model, tokenizer, device

    # Get the prompt from the input
    prompt = model_inputs.get("prompt", "")
    if not prompt:
        return {"error": "No prompt provided."}

    # Tokenize the prompt and move the tensors to the model's device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Run generation without tracking gradients
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=200,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
        )

    # Decode the generated tokens back into text
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return {"generated_text": output_text}
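

# --- Minimal local smoke test (illustrative only) ---
# A sketch of how the two handlers fit together, assuming the deployed service's
# hosting framework calls init() once at startup and inference() once per request.
# Running this file directly simply exercises that same flow on your machine;
# the example prompt below is hypothetical.
if __name__ == "__main__":
    init()
    result = inference({"prompt": "Write a one-line tagline for a coffee brand."})
    print(result)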