import torch
from smolagents import CodeAgent, ToolCallingAgent
from transformers import AutoModelForCausalLM, AutoTokenizer

import myprompts
from tools.fetch import fetch_webpage
from tools.yttranscript import get_youtube_title_description, get_youtube_transcript

# --- Basic Agent Definition ---
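# BasicAgent routes each question through a lightweight reviewer agent first:
# the reviewer decides whether the question needs tool/code execution ("code")
# or can be answered directly by the language model ("model"), and the answer
# is then produced by gaia_agent or model_agent accordingly (both defined below).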
class BasicAgent:
    def __init__(self):
        print("BasicAgent initialized.")
    def __call__(self, question: str) -> str:

        print(f"Agent received question (first 50 chars): {question[:50]}...")

        try:
            # Use the reviewer agent to determine if the question can be answered by a model or requires code
            print("Calling reviewer agent...")
            reviewer_answer = reviewer_agent.run(myprompts.review_prompt + "\nThe question is:\n" + question)
            print(f"Reviewer agent answer: {reviewer_answer}")

            question = question + '\n' + myprompts.output_format
            fixed_answer = ""

            # Normalize the reviewer's verdict so minor formatting differences
            # (whitespace, capitalization) don't silently skip both branches.
            reviewer_answer = str(reviewer_answer).strip().lower()

            if reviewer_answer == "code":
                fixed_answer = gaia_agent.run(question)
                print(f"Code agent answer: {fixed_answer}")

            else:
                # Anything other than an explicit "code" verdict falls back to the
                # model agent, so an unexpected reviewer reply never returns "".
                print("Using model agent to answer the question.")
                fixed_answer = model_agent.run(myprompts.model_prompt + "\nThe question is:\n" + question)
                print(f"Model agent answer: {fixed_answer}")

            return fixed_answer
        except Exception as e:
            error = f"An error occurred while processing the question: {e}"
            print(error)
            return error

        

# Load model and tokenizer
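# LFM2-1.2B is loaded locally via transformers; device_map="auto" places it on a
# GPU when one is available, otherwise on CPU. Swapping in another causal LM
# should work as long as its chat template emits the <|im_start|>/<|im_end|>
# markers that the response extraction in LocalLlamaModel.generate relies on.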
model_id = "LiquidAI/LFM2-1.2B"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    # attn_implementation="flash_attention_2"  # <- uncomment on compatible GPU
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Create a wrapper class that matches the expected interface
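# Note: this wrapper is a minimal adapter sketch. It assumes the agents hand the
# model a plain string prompt and expect a plain string back; newer smolagents
# releases pass a list of chat messages and expect a ChatMessage-like object, in
# which case this class would need a thin conversion layer on top.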
class LocalLlamaModel:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        # Report where the model actually landed (device_map="auto" may pick a GPU)
        self.device = getattr(model, "device", "cpu")
    
    def generate(self, prompt: str, max_new_tokens=512*5, **kwargs):
        try:
            print(f"Generating for prompt (first 100 chars): {str(prompt)[:100]}...")

            # Build the input with the tokenizer's chat template so the decoded
            # output contains the <|im_start|>assistant marker used below.
            input_ids = self.tokenizer.apply_chat_template(
                [{"role": "user", "content": str(prompt)}],
                add_generation_prompt=True,
                return_tensors="pt",
                tokenize=True,
            ).to(self.model.device)

            # Generate the completion with light sampling
            output = self.model.generate(
                input_ids,
                do_sample=True,
                temperature=0.3,
                min_p=0.15,
                repetition_penalty=1.05,
                max_new_tokens=max_new_tokens,
            )
            
            # Decode the full output - as in recommended code
            decoded_output = self.tokenizer.decode(output[0], skip_special_tokens=False)
            
            # Extract only the assistant's response (after the last <|im_start|>assistant)
            if "<|im_start|>assistant" in decoded_output:
                assistant_response = decoded_output.split("<|im_start|>assistant")[-1]
                # Remove any trailing special tokens
                assistant_response = assistant_response.replace("<|im_end|>", "").strip()
                return assistant_response
            else:
                # Fallback: return the full decoded output
                return decoded_output
            
        except Exception as e:
            print(f"Error in model generation: {e}")
            return f"Error generating response: {str(e)}"

       
    def __call__(self, prompt: str, max_new_tokens=512, **kwargs):
        """Make the model callable like a function"""
        return self.generate(prompt, max_new_tokens, **kwargs)

# Create the model instance
wrapped_model = LocalLlamaModel(model, tokenizer)

# Now create your agents - these should work with the wrapped model
reviewer_agent = ToolCallingAgent(model=wrapped_model, tools=[])
model_agent = ToolCallingAgent(model=wrapped_model, tools=[fetch_webpage])
gaia_agent = CodeAgent(
    tools=[fetch_webpage, get_youtube_title_description, get_youtube_transcript],
    model=wrapped_model
)
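# reviewer_agent: no tools, only classifies the question as "code" vs "model".
# model_agent:    may fetch web pages while answering directly from the model.
# gaia_agent:     CodeAgent with web and YouTube tools for questions that need
#                 tool use or computation.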



if __name__ == "__main__":
    # Example usage
    question = "What was the actual enrollment of the Malko competition in 2023?"
    agent = BasicAgent()
    answer = agent(question)
    print(f"Answer: {answer}")