import os
from smolagents import CodeAgent, ToolCallingAgent
from smolagents import OpenAIServerModel
from tools.fetch import fetch_webpage
from tools.yttranscript import get_youtube_transcript, get_youtube_title_description
import myprompts
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

import torch
# --- Basic Agent Definition ---
class BasicAgent:
    """Answer a question by routing it to the code agent or the model agent.

    A reviewer agent is asked first which route to take; its reply is
    expected to be ``"code"`` or ``"model"``.
    """

    def __init__(self):
        print("BasicAgent initialized.")

    def __call__(self, question: str) -> str:
        """Return the answer for *question*, or an error message on failure.

        Args:
            question: The question text to answer.

        Returns:
            Whatever the selected agent's ``run`` returns. If any step raises,
            the exception is caught and a descriptive error string is
            returned instead.
        """
        print(f"Agent received question (first 50 chars): {question[:50]}...")

        try:
            # Use the reviewer agent to determine if the question can be
            # answered by a model or requires code.
            print("Calling reviewer agent...")
            reviewer_answer = reviewer_agent.run(
                myprompts.review_prompt + "\nThe question is:\n" + question
            )
            print(f"Reviewer agent answer: {reviewer_answer}")

            # Language models rarely reply with the bare keyword: tolerate
            # surrounding whitespace, quotes, a trailing period and any casing
            # so that e.g. ' "Code".\n' is still routed correctly.
            verdict = str(reviewer_answer).strip().strip("\"'`.").lower()

            question = question + '\n' + myprompts.output_format

            if verdict == "model":
                # The reviewer suggests the model can answer directly.
                print("Using model agent to answer the question.")
                fixed_answer = model_agent.run(
                    myprompts.model_prompt + "\nThe question is:\n" + question
                )
                print(f"Model agent answer: {fixed_answer}")
            else:
                if verdict != "code":
                    # An unrecognized verdict used to produce a silent empty
                    # answer; fall back to the code agent, which has the
                    # widest tool set, and say so.
                    print(
                        f"Unrecognized reviewer verdict {reviewer_answer!r}; "
                        "falling back to the code agent."
                    )
                fixed_answer = gaia_agent.run(question)
                print(f"Code agent answer: {fixed_answer}")

            return fixed_answer
        except Exception as e:
            # Top-level boundary: report the failure as the answer instead of
            # crashing the caller.
            error = f"An error occurred while processing the question: {e}"
            print(error)
            return error

        
# Checkpoint shared by the tokenizer and the model weights loaded below.
model_id = "LiquidAI/LFM2-1.2B"

# Tokenizer for the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Model weights, requested in bfloat16 with automatic device placement.
# NOTE(review): trust_remote_code=True lets the checkpoint's repository run
# its own Python code at load time - confirm this is intended for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="bfloat16",
    device_map="auto",
    # attn_implementation="flash_attention_2"  # <- uncomment on compatible GPU
)

# Create a wrapper class that matches the expected interface
class LocalLlamaModel:
    """Callable text-generation wrapper around a causal LM and its tokenizer.

    Instances can be called like a function (``wrapper(prompt)``) or through
    :meth:`generate`; both return the generated text as a string.

    NOTE(review): the agents in this file receive this object as their
    ``model``. Whether they call it with a plain string or with a list of
    chat messages is not visible here - confirm against the agent library.
    """

    def __init__(self, model, tokenizer):
        """Store the already-loaded *model* and *tokenizer*."""
        self.model = model
        self.tokenizer = tokenizer
        # Not read by this class: inputs are moved to ``self.model.device``.
        self.device = 'cpu'

    def generate(self, prompt: str, max_new_tokens=512*5, **kwargs):
        """Generate a completion for *prompt*.

        Args:
            prompt: Text to tokenize and feed to the model.
            max_new_tokens: Upper bound on the number of generated tokens.
            **kwargs: Accepted for call compatibility and ignored.

        Returns:
            The text after the last ``<|im_start|>assistant`` marker, with
            ``<|im_end|>`` removed and whitespace stripped, when that marker
            occurs in the decoded output; otherwise the full decoded output.
            If anything raises, an ``"Error generating response: ..."``
            string is returned instead of propagating the exception.
        """
        try:
            print("Prompt: ", prompt)
            print("Prompt type: ", type(prompt))

            # NOTE(review): the prompt is tokenized as-is, without the chat
            # template, so the assistant marker checked for below is only
            # present if the prompt or the model output contains it.
            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

            # Forward the tokenized inputs. The previous code passed an
            # undefined ``input_ids`` name here, so every call failed with a
            # NameError that the handler below turned into an error string.
            output = self.model.generate(
                **inputs,
                do_sample=True,
                temperature=0.3,
                min_p=0.15,
                repetition_penalty=1.05,
                max_new_tokens=max_new_tokens,
            )

            # Keep special tokens so the chat markers can be located below.
            decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=False)

            # Extract only the assistant's response (after the last
            # <|im_start|>assistant marker).
            if "<|im_start|>assistant" in decoded_output:
                assistant_response = decoded_output.split("<|im_start|>assistant")[-1]
                # Remove any trailing end-of-turn tokens.
                assistant_response = assistant_response.replace("<|im_end|>", "").strip()
                return assistant_response

            # Fallback: return the full decoded output.
            return decoded_output

        except Exception as e:
            # Best-effort boundary: callers get an error string, not a crash.
            print(f"Error in model generation: {e}")
            return f"Error generating response: {str(e)}"

    def __call__(self, prompt: str, max_new_tokens=512, **kwargs):
        """Make the model callable like a function; delegates to :meth:`generate`."""
        return self.generate(prompt, max_new_tokens=max_new_tokens, **kwargs)

# One wrapper instance, shared by every agent defined below.
wrapped_model = LocalLlamaModel(model, tokenizer)

# Reviewer: no tools; BasicAgent asks it which of the other two agents to use.
reviewer_agent = ToolCallingAgent(tools=[], model=wrapped_model)

# Model agent: tool-calling agent that can fetch web pages.
model_agent = ToolCallingAgent(tools=[fetch_webpage], model=wrapped_model)

# Code agent: web fetching plus the YouTube title/description and transcript tools.
gaia_agent = CodeAgent(
    model=wrapped_model,
    tools=[fetch_webpage, get_youtube_title_description, get_youtube_transcript],
)


if __name__ == "__main__":
    # Manual smoke run: send one sample question through the full pipeline.
    sample_question = "What was the actual enrollment of the Malko competition in 2023?"
    basic_agent = BasicAgent()
    print(f"Answer: {basic_agent(sample_question)}")