import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import json
import warnings
from pydantic import BaseModel
from typing import Dict
import spaces

device = "cuda"

# Ignore warnings
warnings.filterwarnings(action='ignore')

# Set random seed
torch.random.manual_seed(0)

# Define model path and generation arguments
model_path = "microsoft/Phi-3-mini-4k-instruct"
generation_args = {
    "max_new_tokens": 50,
    "return_full_text": False,
    "temperature": 0.1,
    "do_sample": True,
}

# Load the model and pipeline once and keep it in memory
def load_model_pipeline(model_path: str):
    if not hasattr(load_model_pipeline, "pipe"):
        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map=device,
            torch_dtype="auto",
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        load_model_pipeline.pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    return load_model_pipeline.pipe

# Initialize the pipeline and keep it in memory
pipe = load_model_pipeline(model_path)

# Generate logic from LLM output
@spaces.GPU(duration=50)
def generate_logic(llm_output: str) -> str:
    prompt = f"""
    Provide the response in json string for the below keys and context based on the description: '{llm_output}'.

    Screen.interaction_yes: This field indicates whether there was an interaction of the person with a screen during the activity. A value of 1 means there was screen interaction (Yes), and a value of 0 means there was no screen interaction (No).
    Hands.free: This field indicates whether the person's hands were free during the activity. A value of 1 means the person was not holding anything (Yes), indicating free hands. A value of 0 means the person was holding something (No), indicating the hands were not free.
    Indoors: This field indicates whether the activity took place indoors. A value of 1 means the activity occurred inside a building or enclosed space (Yes), and a value of 0 means the activity took place outside (No).
    Standing: This field indicates whether the person was standing during the activity. A value of 1 means the person was standing (Yes), and a value of 0 means the person was not standing (No).
    """

    messages = [
        {"role": "system", "content": "Please answer questions just based on this information: " + llm_output},
        {"role": "user", "content": prompt},
    ]

    response = pipe(messages, **generation_args)
    generated_text = response[0]['generated_text']

    # Extract JSON from the generated text
    start_index = generated_text.find('{')
    end_index = generated_text.rfind('}') + 1
    json_str = generated_text[start_index:end_index]

    # Log the generated JSON string for debugging
    print(f"Generated JSON: {json_str}")

    if not json_str.strip():
        raise ValueError("Generated logic is empty or invalid JSON")

    return json_str

# Pydantic model for structured output
class VideoAnalysis(BaseModel):
    screen_interaction_yes: int
    hands_free: int
    indoors: int
    standing: int

    @classmethod
    def from_llm_output(cls, generated_logic: str) -> 'VideoAnalysis':
        try:
            logic_dict = json.loads(generated_logic)
        except json.JSONDecodeError as e:
            raise ValueError(f"Error decoding JSON: {e}") from e

        return cls(
            screen_interaction_yes=logic_dict.get("Screen.interaction_yes", 0),
            hands_free=logic_dict.get("Hands.free", 0),
            indoors=logic_dict.get("Indoors", 0),
            standing=logic_dict.get("Standing", 0),
        )

# Main function to process LLM output
def process_description(description: str) -> Dict:
    generated_logic = generate_logic(description)
    structured_output = VideoAnalysis.from_llm_output(generated_logic)
    return structured_output.dict()