import os

import torch
from smolagents import CodeAgent, ToolCallingAgent
from smolagents import OpenAIServerModel
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

import myprompts
from tools.fetch import fetch_webpage
from tools.yttranscript import get_youtube_transcript, get_youtube_title_description


# --- Basic Agent Definition ---
class BasicAgent:
    """Answer a question by routing it to the code agent or the model agent.

    The module-level ``reviewer_agent`` is asked first; its verdict
    (``"code"`` or ``"model"``) selects which of the module-level agents
    (``gaia_agent`` or ``model_agent``) produces the final answer.
    """

    def __init__(self):
        print("BasicAgent initialized.")

    def __call__(self, question: str) -> str:
        """Return the answer for *question*.

        Args:
            question: The question text to answer.

        Returns:
            The selected agent's answer, an empty string when the reviewer
            verdict is neither ``"code"`` nor ``"model"``, or an error
            message string if any step raises.
        """
        print(f"Agent received question (first 50 chars): {question[:50]}...")

        try:
            # Ask the reviewer agent whether the question can be answered by
            # a model alone or requires running code.
            print("Calling reviewer agent...")
            reviewer_answer = reviewer_agent.run(
                myprompts.review_prompt + "\nThe question is:\n" + question
            )
            print(f"Reviewer agent answer: {reviewer_answer}")

            question = question + '\n' + myprompts.output_format

            # Normalise the verdict so stray whitespace or capitalisation in
            # the reviewer's reply does not silently produce an empty answer.
            route = str(reviewer_answer).strip().lower()

            fixed_answer = ""
            if route == "code":
                fixed_answer = gaia_agent.run(question)
                print(f"Code agent answer: {fixed_answer}")
            elif route == "model":
                print("Using model agent to answer the question.")
                fixed_answer = model_agent.run(
                    myprompts.model_prompt + "\nThe question is:\n" + question
                )
                print(f"Model agent answer: {fixed_answer}")
            else:
                print(
                    f"Unrecognized reviewer verdict {reviewer_answer!r}; "
                    "returning an empty answer."
                )

            return fixed_answer
        except Exception as e:
            # Top-level boundary: report the failure as the answer instead of
            # crashing the caller.
            error = f"An error occurred while processing the question: {e}"
            print(error)
            return error


# Load model and tokenizer
model_id = "LiquidAI/LFM2-1.2B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="bfloat16",
    trust_remote_code=True,
    # attn_implementation="flash_attention_2"  # <- uncomment on compatible GPU
)
tokenizer = AutoTokenizer.from_pretrained(model_id)


# Create a wrapper class that matches the expected interface
class LocalLlamaModel:
    """Callable wrapper around a causal LM and its tokenizer.

    NOTE(review): smolagents agents may pass a list of chat messages and
    expect a message object back rather than a plain ``str`` -- confirm
    against the installed smolagents version.
    """

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = 'cpu'

    def generate(self, prompt: str, max_new_tokens=512*5, **kwargs):
        """Generate a completion for *prompt*.

        Args:
            prompt: Text passed directly to the tokenizer (no chat template
                is applied here).
            max_new_tokens: Upper bound on newly generated tokens.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            The text after the last ``<|im_start|>assistant`` marker with
            ``<|im_end|>`` removed when that marker is present in the decoded
            output; otherwise the full decoded output. On any exception an
            ``"Error generating response: ..."`` string is returned instead.
        """
        try:
            print("Prompt: ", prompt)
            print("Prompt type: ", type(prompt))

            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

            # The previous code passed an undefined ``input_ids`` name here,
            # so every call failed with NameError. Use the encoded tensors.
            generation_inputs = {"attention_mask": inputs["attention_mask"]} \
                if "attention_mask" in inputs else {}

            output = self.model.generate(
                inputs["input_ids"],
                do_sample=True,
                temperature=0.3,
                min_p=0.15,
                repetition_penalty=1.05,
                max_new_tokens=max_new_tokens,
                **generation_inputs,
            )

            # Keep special tokens so the assistant marker can be located.
            decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=False)

            # Extract only the assistant's response (after the last
            # <|im_start|>assistant) and drop trailing end markers.
            if "<|im_start|>assistant" in decoded_output:
                assistant_response = decoded_output.split("<|im_start|>assistant")[-1]
                return assistant_response.replace("<|im_end|>", "").strip()

            # Fallback: return the full decoded output
            return decoded_output

        except Exception as e:
            print(f"Error in model generation: {e}")
            return f"Error generating response: {str(e)}"

    def __call__(self, prompt: str, max_new_tokens=512, **kwargs):
        """Make the model callable like a function"""
        return self.generate(prompt, max_new_tokens, **kwargs)


# Create the model instance
wrapped_model = LocalLlamaModel(model, tokenizer)

# Now create your agents - these should work with the wrapped model
reviewer_agent = ToolCallingAgent(model=wrapped_model, tools=[])
model_agent = ToolCallingAgent(model=wrapped_model, tools=[fetch_webpage])
gaia_agent = CodeAgent(
    tools=[fetch_webpage, get_youtube_title_description, get_youtube_transcript],
    model=wrapped_model
)

if __name__ == "__main__":
    # Example usage
    question = "What was the actual enrollment of the Malko competition in 2023?"
    agent = BasicAgent()
    answer = agent(question)
    print(f"Answer: {answer}")