laverdes commited on
Commit
84ac217
·
1 Parent(s): be9a1ee

feat: browser_automation and gpt4o manager

Browse files
Files changed (4) hide show
  1. app.py +128 -26
  2. multiagent_sandbox.py +294 -0
  3. tools/rag_transformers.py +0 -0
  4. vision_web_browser.py +211 -0
app.py CHANGED
@@ -1,19 +1,26 @@
1
- from datetime import datetime
 
 
2
  import pytz
3
  import yaml
4
  import pycountry
 
 
5
 
6
  from tools.final_answer import FinalAnswerTool
7
  from tools.visit_webpage import VisitWebpageTool
8
  from tools.translation import TranslationTool
9
  from tools.best_model_for_task import HFModelDownloadsTool
 
10
 
11
  from transformers import pipeline
12
  from Gradio_UI import GradioUI
13
-
14
- import os
15
- import base64
16
  from dotenv import load_dotenv
 
 
 
 
17
 
18
  from opentelemetry.sdk.trace import TracerProvider
19
  from openinference.instrumentation.smolagents import SmolagentsInstrumentor
@@ -26,25 +33,28 @@ from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
26
  from langchain_core.prompts import PromptTemplate
27
  from langchain_openai import OpenAI
28
 
29
- from skimage import io
30
- from PIL import Image
31
 
 
 
32
  from smolagents import (
33
  CodeAgent,
34
  DuckDuckGoSearchTool,
35
  GoogleSearchTool,
36
  HfApiModel,
37
  TransformersModel,
 
38
  load_tool,
39
  Tool,
40
  tool,
41
- ToolCollection
 
42
  )
43
 
44
  # load .env vars
45
  load_dotenv()
46
 
47
-
48
  # fast prototyping tools
49
  @tool
50
  def get_current_time_in_timezone(timezone: str) -> str:
@@ -107,6 +117,79 @@ def advanced_image_generation(description:str)->Image.Image:
107
  return pil_image
108
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  # telemetry
111
  def initialize_langfuse_opentelemetry_instrumentation():
112
  LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
@@ -128,9 +211,10 @@ final_answer = FinalAnswerTool()
128
  visit_webpage = VisitWebpageTool()
129
  translation = TranslationTool()
130
  best_model_for_task = HFModelDownloadsTool()
 
131
 
132
  # load tools from smoloagents library
133
- google_web_search = GoogleSearchTool()
134
  google_web_search.name = "google_web_search"
135
  duckduckgo_web_search = DuckDuckGoSearchTool()
136
  duckduckgo_web_search.name = "duckduckgo_web_search"
@@ -148,13 +232,7 @@ image_generation_tool_fast = Tool.from_space(
148
  )
149
 
150
 
151
- # alternative hf inference endpoint
152
- model = HfApiModel(
153
- max_tokens=2096,
154
- temperature=0.5,
155
- model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',
156
- custom_role_conversions=None,
157
- )
158
 
159
  with open("prompts.yaml", 'r') as stream:
160
  prompt_templates = yaml.safe_load(stream)
@@ -165,26 +243,50 @@ tools = [
165
  advanced_search_tool,
166
  google_web_search,
167
  duckduckgo_web_search,
168
- visit_webpage,
 
169
  get_current_time_in_timezone,
170
  advanced_image_generation,
171
  image_generation_tool,
 
172
  language_detection,
173
- translation
 
174
  ]
175
 
176
  agent = CodeAgent(
177
- model=model,
178
  tools=tools,
179
- max_steps=10,
180
- verbosity_level=1,
181
  grammar=None,
182
- planning_interval=None,
183
- name=None,
184
- description=None,
185
- prompt_templates=prompt_templates
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  )
187
 
 
 
188
  # agent.push_to_hub('laverdes/Alfredo')
 
 
 
 
189
 
190
- GradioUI(agent).launch()
 
 
1
+ import os
2
+ import base64
3
+ import math
4
  import pytz
5
  import yaml
6
  import pycountry
7
+ import subprocess
8
+ import sys
9
 
10
  from tools.final_answer import FinalAnswerTool
11
  from tools.visit_webpage import VisitWebpageTool
12
  from tools.translation import TranslationTool
13
  from tools.best_model_for_task import HFModelDownloadsTool
14
+ from tools.rag_transformers import retriever_tool
15
 
16
  from transformers import pipeline
17
  from Gradio_UI import GradioUI
18
+ from Gradio_UI_with_image import GradioUIImage
 
 
19
  from dotenv import load_dotenv
20
+ from datetime import datetime
21
+ from skimage import io
22
+ from PIL import Image
23
+ from typing import Optional, Tuple
24
 
25
  from opentelemetry.sdk.trace import TracerProvider
26
  from openinference.instrumentation.smolagents import SmolagentsInstrumentor
 
33
  from langchain_core.prompts import PromptTemplate
34
  from langchain_openai import OpenAI
35
 
36
+ from io import BytesIO
37
+ from time import sleep
38
 
39
+ from smolagents.agents import ActionStep
40
+ from smolagents.cli import load_model
41
  from smolagents import (
42
  CodeAgent,
43
  DuckDuckGoSearchTool,
44
  GoogleSearchTool,
45
  HfApiModel,
46
  TransformersModel,
47
+ OpenAIServerModel,
48
  load_tool,
49
  Tool,
50
  tool,
51
+ ToolCollection,
52
+ E2BExecutor
53
  )
54
 
55
  # load .env vars
56
  load_dotenv()
57
 
 
58
  # fast prototyping tools
59
  @tool
60
  def get_current_time_in_timezone(timezone: str) -> str:
 
117
  return pil_image
118
 
119
 
120
@tool
def calculate_cargo_travel_time(
    origin_coords: Tuple[float, float],
    destination_coords: Tuple[float, float],
    cruising_speed_kmh: Optional[float] = 750.0,  # Average speed for cargo planes
) -> float:
    """
    Calculate the travel time for a cargo plane between two points on Earth using great-circle distance.

    Args:
        origin_coords: Tuple of (latitude, longitude) for the starting point
        destination_coords: Tuple of (latitude, longitude) for the destination
        cruising_speed_kmh: Optional cruising speed in km/h (defaults to 750 km/h for typical cargo planes)

    Returns:
        float: The estimated travel time in hours

    Example:
        >>> # Chicago (41.8781° N, 87.6298° W) to Sydney (33.8688° S, 151.2093° E)
        >>> result = calculate_cargo_travel_time((41.8781, -87.6298), (-33.8688, 151.2093))
    """
    # The parameter is annotated Optional, so an explicit None means
    # "use the default speed" instead of failing on the division below.
    if cruising_speed_kmh is None:
        cruising_speed_kmh = 750.0
    # A zero or negative speed has no physical meaning here; fail with a
    # clear ValueError rather than ZeroDivisionError / a negative duration.
    if cruising_speed_kmh <= 0:
        raise ValueError("cruising_speed_kmh must be a positive number")

    # Coordinates arrive in degrees; the trigonometry below needs radians.
    lat1, lon1 = map(math.radians, origin_coords)
    lat2, lon2 = map(math.radians, destination_coords)

    # Earth's mean radius in kilometers
    EARTH_RADIUS_KM = 6371.0

    # Great-circle distance using the haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would make sqrt/asin raise a domain error.
    a = min(1.0, max(0.0, a))
    c = 2 * math.asin(math.sqrt(a))
    distance = EARTH_RADIUS_KM * c

    # Add 10% to account for non-direct routes and air traffic controls
    actual_distance = distance * 1.1

    # Cruise time plus a flat 1 hour for takeoff and landing procedures
    flight_time = (actual_distance / cruising_speed_kmh) + 1.0

    return round(flight_time, 2)
172
+
173
+
174
@tool
def browser_automation(original_user_query:str)->str:
    """
    Browser automation is like “simulating a real user” and works for interactive,
    dynamic sites and when visual navigation is required to show the process to the user.
    Navigates the web using helium to answer a user query by appending helium_instructions to the original query
    by searching for text matches through the navigation.
    Args:
        original_user_query: The original user query to answer by navigating the web.
    """
    # Use sys.executable to ensure the same Python interpreter is used.
    # The "--" separator tells the child's argparse that everything after it
    # is positional, so a query that starts with "-" is not mistaken for an
    # option such as --model-id.
    result = subprocess.run(
        [sys.executable, "vision_web_browser.py", "--", original_user_query],
        capture_output=True,  # Captures both stdout and stderr
        text=True,  # Returns output as a string instead of bytes
    )
    print("vision_web_browser.py: ", result.stderr)
    if result.returncode != 0:
        # Surface the failure to the calling agent instead of handing back a
        # possibly empty stdout that looks like a successful run.
        return (
            f"{result.stdout}\n"
            f"vision_web_browser.py exited with code {result.returncode}:\n"
            f"{result.stderr}"
        )
    return result.stdout
192
+
193
  # telemetry
194
  def initialize_langfuse_opentelemetry_instrumentation():
195
  LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
 
211
  visit_webpage = VisitWebpageTool()
212
  translation = TranslationTool()
213
  best_model_for_task = HFModelDownloadsTool()
214
+ transformers_retriever = retriever_tool
215
 
216
  # load tools from smolagents library
217
+ google_web_search = GoogleSearchTool() # provider="serper" (SERPER_API_KEY) or "serpapi" (default)
218
  google_web_search.name = "google_web_search"
219
  duckduckgo_web_search = DuckDuckGoSearchTool()
220
  duckduckgo_web_search.name = "duckduckgo_web_search"
 
232
  )
233
 
234
 
235
+ ceo_model = load_model("LiteLLMModel", "gpt-4o") # or anthropic/claude-3-sonnet
 
 
 
 
 
 
236
 
237
  with open("prompts.yaml", 'r') as stream:
238
  prompt_templates = yaml.safe_load(stream)
 
243
  advanced_search_tool,
244
  google_web_search,
245
  duckduckgo_web_search,
246
+ visit_webpage,
247
+ browser_automation,
248
  get_current_time_in_timezone,
249
  advanced_image_generation,
250
  image_generation_tool,
251
+ transformers_retriever,
252
  language_detection,
253
+ translation,
254
+ calculate_cargo_travel_time
255
  ]
256
 
257
  agent = CodeAgent(
258
+ model=ceo_model,
259
  tools=tools,
260
+ max_steps=20, # 15 is good for a light manager, too much when there is no need of a manager
261
+ verbosity_level=2,
262
  grammar=None,
263
+ planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager)
264
+ name="Alfredo",
265
+ description="CEO",
266
+ prompt_templates=prompt_templates,
267
+ # executor_type="e2b", # security, could also be "docker" (set keys)
268
+ # sandbox=E2BSandbox() (or E2BExecutor?),
269
+ # step_callbacks=[save_screenshot], # todo: configure the web_navigation agent as a separate agent and manage it with alfred
270
+ additional_authorized_imports=[
271
+ "geopandas",
272
+ "plotly",
273
+ "shapely",
274
+ "json",
275
+ "pandas",
276
+ "numpy",
277
+ "requests",
278
+ "helium",
279
+ ],
280
+ # I could also add the authorized_imports from a LIST_SAFE_MODULES
281
  )
282
 
283
+ agent.python_executor("from helium import *") # agent.state
284
+
285
  # agent.push_to_hub('laverdes/Alfredo')
286
+ agent.visualize()
287
+
288
+ # prompt = ("navigate to a random wikipedia page and give me a summary of the content, then make a single image representing all the content")
289
+ # agent.run(prompt)
290
 
291
+ GradioUI(agent).launch()
292
+ #GradioUIImage(agent).launch()
multiagent_sandbox.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from e2b_code_interpreter import Sandbox
2
+
3
+ secure_sandbox = Sandbox()
4
+
5
+ secure_sandbox.commands.run("pip install smolagents")
6
+
7
def run_code_raise_errors(secure_sandbox, code: str, verbose: bool = False) -> str:
    """Run ``code`` in the sandbox and return its captured stdout.

    Args:
        secure_sandbox: Object exposing ``run_code(code, envs=...)`` whose
            result has ``error`` and ``logs.stdout`` attributes.
        code: Source code to execute inside the sandbox.
        verbose: When True, also print the captured stdout locally.

    Returns:
        The stdout log entries joined with newlines.

    Raises:
        ValueError: If the execution reports an error; the message holds the
            captured stdout followed by the remote traceback.
    """
    # Local import: this module's top level never imports ``os``, so the
    # original ``os.getenv`` call failed with NameError.
    import os

    hf_token = os.getenv("HF_TOKEN")
    # Forward the token only when it is set.
    # NOTE(review): assumes the sandbox expects string env values — confirm.
    envs = {"HF_TOKEN": hf_token} if hf_token is not None else {}

    execution = secure_sandbox.run_code(code, envs=envs)
    stdout_logs = "\n".join(str(log) for log in execution.logs.stdout)
    if verbose:
        print(stdout_logs)
    if execution.error:
        # Keep the traceback on its own line instead of gluing it to the
        # last stdout entry.
        raise ValueError(f"{stdout_logs}\n{execution.error.traceback}")
    return stdout_logs
18
+
19
+ alfredo_code = """
20
+ import os
21
+ import base64
22
+ import math
23
+ import pytz
24
+ import yaml
25
+ import pycountry
26
+
27
+ from tools.final_answer import FinalAnswerTool
28
+ from tools.visit_webpage import VisitWebpageTool
29
+ from tools.translation import TranslationTool
30
+ from tools.best_model_for_task import HFModelDownloadsTool
31
+ from tools.rag_transformers import retriever_tool
32
+
33
+ from transformers import pipeline
34
+ from Gradio_UI import GradioUI
35
+ from Gradio_UI_with_image import GradioUIImage
36
+ from dotenv import load_dotenv
37
+ from datetime import datetime
38
+ from skimage import io
39
+ from PIL import Image
40
+ from typing import Optional, Tuple
41
+
42
+ from opentelemetry.sdk.trace import TracerProvider
43
+ from openinference.instrumentation.smolagents import SmolagentsInstrumentor
44
+ from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
45
+ from opentelemetry.sdk.trace.export import SimpleSpanProcessor
46
+
47
+ from langchain_community.agent_toolkits.load_tools import load_tools
48
+ from langchain.chains import LLMChain
49
+ from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
50
+ from langchain_core.prompts import PromptTemplate
51
+ from langchain_openai import OpenAI
52
+ from smolagents import (
53
+ CodeAgent,
54
+ DuckDuckGoSearchTool,
55
+ GoogleSearchTool,
56
+ HfApiModel,
57
+ TransformersModel,
58
+ OpenAIServerModel,
59
+ load_tool,
60
+ Tool,
61
+ tool,
62
+ ToolCollection
63
+ )
64
+
65
+ # load .env vars
66
+ load_dotenv()
67
+
68
+
69
+
70
+ # fast prototyping tools
71
+ @tool
72
+ def get_current_time_in_timezone(timezone: str) -> str:
73
+ """A tool that fetches the current local time in a specified timezone formatted as '%m/%d/%y %H:%M:%S'
74
+ Args:
75
+ timezone (str): A string representing a valid timezone (e.g., 'America/New_York').
76
+ """
77
+ try:
78
+ tz = pytz.timezone(timezone)
79
+ local_time = datetime.now(tz).strftime('%m/%d/%y %H:%M:%S')
80
+ return f"The current local time in {timezone} is: {local_time}"
81
+ except Exception as e:
82
+ return f"Error fetching time for timezone '{timezone}': {str(e)}"
83
+
84
+
85
+ @tool
86
+ def language_detection(text:str)-> str:
87
+ """Detects the language of the input text using basic xlm-roberta-base-language-detection.
88
+ Args:
89
+ text: the input message or wording to detect language from.
90
+ """
91
+ model_ckpt = "papluca/xlm-roberta-base-language-detection"
92
+ pipe = pipeline("text-classification", model=model_ckpt)
93
+ preds = pipe(text, return_all_scores=True, truncation=True, max_length=128)
94
+ if preds:
95
+ pred = preds[0]
96
+ language_probabilities_dict = {p["label"]: float(p["score"]) for p in pred}
97
+ predicted_language_code = max(language_probabilities_dict, key=language_probabilities_dict.get)
98
+ tool_prediction_confidence = language_probabilities_dict[predicted_language_code]
99
+ confidence_str = f"Tool Confidence: {tool_prediction_confidence}"
100
+ predicted_language_code_str = f"Predicted language code (ISO 639): {predicted_language_code}/n{confidence_str}"
101
+ try:
102
+ predicted_language = pycountry.languages.get(alpha_2=predicted_language_code)
103
+ if predicted_language:
104
+ predicted_language_str = f"Predicted language: {predicted_language.name}/n{confidence_str}"
105
+ return predicted_language_str
106
+ return predicted_language_code_str
107
+
108
+ except Exception as e:
109
+ return f"Error mapping country code to name (pycountry): {str(e)}/n{predicted_language_code_str}"
110
+ else:
111
+ return "None"
112
+
113
+
114
+ @tool
115
+ def advanced_image_generation(description:str)->Image.Image:
116
+ """Generates an image using a textual description.
117
+ Args:
118
+ description: the textual description provided by the user to prompt a text-to-image model
119
+ """
120
+ llm = OpenAI(temperature=0.9)
121
+ prompt = PromptTemplate(
122
+ input_variables=["image_desc"],
123
+ template="Generate a detailed but short prompt (must be less than 900 characters) to generate an image based on the following description: {image_desc}",
124
+ )
125
+ chain = LLMChain(llm=llm, prompt=prompt)
126
+ image_url = DallEAPIWrapper().run(chain.run(description))
127
+ image_array = io.imread(image_url)
128
+ pil_image = Image.fromarray(image_array)
129
+ return pil_image
130
+
131
+
132
+ @tool
133
+ def calculate_cargo_travel_time(
134
+ origin_coords: Tuple[float, float],
135
+ destination_coords: Tuple[float, float],
136
+ cruising_speed_kmh: Optional[float] = 750.0, # Average speed for cargo planes
137
+ ) -> float:
138
+ """
139
+ Calculate the travel time for a cargo plane between two points on Earth using great-circle distance.
140
+
141
+ Args:
142
+ origin_coords: Tuple of (latitude, longitude) for the starting point
143
+ destination_coords: Tuple of (latitude, longitude) for the destination
144
+ cruising_speed_kmh: Optional cruising speed in km/h (defaults to 750 km/h for typical cargo planes)
145
+
146
+ Returns:
147
+ float: The estimated travel time in hours
148
+
149
+ Example:
150
+ >>> # Chicago (41.8781° N, 87.6298° W) to Sydney (33.8688° S, 151.2093° E)
151
+ >>> result = calculate_cargo_travel_time((41.8781, -87.6298), (-33.8688, 151.2093))
152
+ """
153
+
154
+ def to_radians(degrees: float) -> float:
155
+ return degrees * (math.pi / 180)
156
+
157
+ # Extract coordinates
158
+ lat1, lon1 = map(to_radians, origin_coords)
159
+ lat2, lon2 = map(to_radians, destination_coords)
160
+
161
+ # Earth's radius in kilometers
162
+ EARTH_RADIUS_KM = 6371.0
163
+
164
+ # Calculate great-circle distance using the haversine formula
165
+ dlon = lon2 - lon1
166
+ dlat = lat2 - lat1
167
+
168
+ a = (
169
+ math.sin(dlat / 2) ** 2
170
+ + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
171
+ )
172
+ c = 2 * math.asin(math.sqrt(a))
173
+ distance = EARTH_RADIUS_KM * c
174
+
175
+ # Add 10% to account for non-direct routes and air traffic controls
176
+ actual_distance = distance * 1.1
177
+
178
+ # Calculate flight time
179
+ # Add 1 hour for takeoff and landing procedures
180
+ flight_time = (actual_distance / cruising_speed_kmh) + 1.0
181
+
182
+ # Format the results
183
+ return round(flight_time, 2)
184
+
185
+
186
+ # telemetry
187
+ def initialize_langfuse_opentelemetry_instrumentation():
188
+ LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
189
+ LANGFUSE_SECRET_KEY=os.environ.get("LANGFUSE_SECRET_KEY")
190
+ LANGFUSE_AUTH=base64.b64encode(f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()).decode()
191
+
192
+ os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://cloud.langfuse.com/api/public/otel" # EU data region
193
+ os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}"
194
+
195
+ trace_provider = TracerProvider()
196
+ trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
197
+
198
+ SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
199
+
200
+ initialize_langfuse_opentelemetry_instrumentation()
201
+
202
+ # load tools from /tools/
203
+ final_answer = FinalAnswerTool()
204
+ visit_webpage = VisitWebpageTool()
205
+ translation = TranslationTool()
206
+ best_model_for_task = HFModelDownloadsTool()
207
+ transformers_retriever = retriever_tool
208
+
209
+ # load tools from smoloagents library
210
+ google_web_search = GoogleSearchTool() # provider="serper" (SERPER_API_KEY) or "serpapi" (default)
211
+ google_web_search.name = "google_web_search"
212
+ duckduckgo_web_search = DuckDuckGoSearchTool()
213
+ duckduckgo_web_search.name = "duckduckgo_web_search"
214
+
215
+ # load tools from hub and langchain
216
+ # image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
217
+ image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True) # Tool.from_space("black-forest-labs/FLUX.1-schnell", name="image_generator", description="Generate an image from a prompt")
218
+ advanced_search_tool = Tool.from_langchain(load_tools(["searchapi"], allow_dangerous_tools=True)[0]) # serpapi is not real time scrapping
219
+ advanced_search_tool.name = "advanced_search_tool"
220
+
221
+ image_generation_tool_fast = Tool.from_space(
222
+ "black-forest-labs/FLUX.1-schnell",
223
+ name="image_generator",
224
+ description="Generate an image from a prompt"
225
+ )
226
+
227
+
228
+ # alternative hf inference endpoint
229
+ """
230
+ model = HfApiModel(
231
+ max_tokens=2096, # 8096 for manager
232
+ temperature=0.5,
233
+ model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud', # same as Qwen/Qwen2.5-Coder-32B-Instruct
234
+ custom_role_conversions=None,
235
+ )
236
+ """
237
+ # also "deepseek-ai/DeepSeek-R1", # and provider="together" (get API key)
238
+ ceo_model = OpenAIServerModel(
239
+ max_tokens=8096, # 2096 or 5000 for other ligher agents (depending on the task)
240
+ temperature=0.5,
241
+ model_id="gpt-4o"
242
+ )
243
+
244
+ with open("prompts.yaml", 'r') as stream:
245
+ prompt_templates = yaml.safe_load(stream)
246
+
247
+ tools = [
248
+ final_answer,
249
+ best_model_for_task,
250
+ advanced_search_tool,
251
+ google_web_search,
252
+ duckduckgo_web_search,
253
+ visit_webpage,
254
+ get_current_time_in_timezone,
255
+ advanced_image_generation,
256
+ image_generation_tool,
257
+ transformers_retriever,
258
+ language_detection,
259
+ translation,
260
+ calculate_cargo_travel_time
261
+ ]
262
+
263
+ agent = CodeAgent(
264
+ model=ceo_model,
265
+ tools=tools,
266
+ max_steps=15, # 15 is good for a light manager, too much when there is no need of a manager
267
+ verbosity_level=2,
268
+ grammar=None,
269
+ planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager)
270
+ name="Alfredo",
271
+ description="CEO",
272
+ prompt_templates=prompt_templates,
273
+ additional_authorized_imports=[
274
+ "geopandas",
275
+ "plotly",
276
+ "shapely",
277
+ "json",
278
+ "pandas",
279
+ "numpy",
280
+ "requests"
281
+ ],
282
+ )
283
+
284
+ # agent.push_to_hub('laverdes/Alfredo')
285
+ agent.visualize()
286
+
287
+ GradioUI(agent).launch()
288
+ #GradioUIImage(agent).launch()
289
+ """
290
# Run the agent script stored in ``alfredo_code`` inside the sandbox.
# The original call passed ``agent_code``, a name that is never defined.
# NOTE(review): the ``alfredo_code`` literal is delimited with triple double
# quotes but also contains triple-double-quoted docstrings, which end the
# outer string early — switch the outer delimiters to ''' before running.
execution_logs = run_code_raise_errors(secure_sandbox, alfredo_code)
print(execution_logs)
292
+
293
+ # todo: clean errors
294
+ # todo: the sandbox is meant for a single execution; it cannot host gradio or receive real-time user input()
tools/rag_transformers.py ADDED
File without changes
vision_web_browser.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from io import BytesIO
3
+ from time import sleep
4
+ import time
5
+
6
+ import helium
7
+ import PIL.Image
8
+ from dotenv import load_dotenv
9
+ from selenium import webdriver
10
+ from selenium.webdriver.common.by import By
11
+ from selenium.webdriver.common.keys import Keys
12
+
13
+ from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
14
+ from smolagents.agents import ActionStep
15
+ from smolagents.cli import load_model
16
+
17
+
18
+ github_request = """
19
+ I'm trying to find how hard I have to work to get a repo in github.com/trending.
20
+ Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
21
+ """ # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.
22
+
23
+ search_request = """
24
+ Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
25
+ """
26
+
27
+
28
def parse_arguments():
    """Parse the command line for the browser-automation script.

    Accepts an optional positional ``prompt`` (falling back to the module's
    ``search_request``) plus the ``--model-type`` and ``--model-id`` options.

    Returns:
        The ``argparse.Namespace`` with ``prompt``, ``model_type`` and
        ``model_id`` attributes.
    """
    cli = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")

    # nargs="?" makes the positional prompt optional.
    cli.add_argument("prompt", type=str, nargs="?", default=search_request, help="The prompt to run with the agent")

    # Both model options are plain strings with a default value.
    string_options = (
        (
            "--model-type",
            "LiteLLMModel",
            "The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)",
        ),
        (
            "--model-id",
            "gpt-4o",
            "The model ID to use for the specified model type",
        ),
    )
    for flag, fallback, help_text in string_options:
        cli.add_argument(flag, type=str, default=fallback, help=help_text)

    return cli.parse_args()
50
+
51
+
52
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Step callback that attaches a browser screenshot and the current URL to a step.

    Args:
        memory_step: The step that just finished; its ``observations_images``
            and ``observations`` are updated in place.
        agent: The running agent; screenshots on steps two or more behind the
            current one are cleared from its memory.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # No browser running: nothing to capture, and reading
        # driver.current_url below would fail on None.
        return

    current_step = memory_step.step_number
    # Remove previous screenshots from logs for lean processing
    for previous_memory_step in agent.memory.steps:
        if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
            previous_memory_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = PIL.Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
71
+
72
+
73
def _xpath_string_literal(value: str) -> str:
    """Return ``value`` as an XPath 1.0 string expression, whatever quotes it contains."""
    if "'" not in value:
        return f"'{value}'"
    if '"' not in value:
        return f'"{value}"'
    # XPath 1.0 has no escape character, so a value containing both quote
    # kinds is rebuilt with concat() from single-quoted pieces joined by "'".
    pieces = (f"'{piece}'" for piece in value.split("'"))
    return "concat(" + ", \"'\", ".join(pieces) + ")"


@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    # Occurrences are 1-based; 0 or a negative value would otherwise index
    # from the end of the match list and focus the wrong element.
    if nth_result < 1:
        raise ValueError(f"nth_result must be >= 1 (got {nth_result})")

    # Quote the search text safely so an apostrophe or double quote in it
    # cannot break the XPath expression.
    xpath = f"//*[contains(text(), {_xpath_string_literal(text)})]"
    elements = driver.find_elements(By.XPATH, xpath)
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")

    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    # The two sentences were previously concatenated without a separator.
    return (
        f"Found {len(elements)} matches for '{text}'. "
        f"Focused on element {nth_result} of {len(elements)}"
    )
89
+
90
+
91
@tool
def go_back() -> None:
    """Goes back to previous page."""
    # `driver` is the module-level WebDriver that main() assigns before the agent runs.
    driver.back()
95
+
96
+
97
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    # Pressing ESCAPE at page level is how the pop-up is dismissed.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # The tool is declared to return a string; the original fell through and
    # returned None, giving the agent no observation.
    return "Sent ESCAPE to the page to close any open pop-up."
103
+
104
+
105
def initialize_driver():
    """Start a non-headless Chrome through helium with fixed window settings.

    Returns:
        Whatever ``helium.start_chrome`` returns for the started browser.
    """
    options = webdriver.ChromeOptions()
    # Command-line switches applied to the Chrome instance, in order.
    chrome_flags = (
        "--force-device-scale-factor=1",
        "--window-size=1000,1350",
        "--disable-pdf-viewer",
        "--window-position=0,0",
    )
    for flag in chrome_flags:
        options.add_argument(flag)
    return helium.start_chrome(headless=False, options=options)
113
+
114
+
115
def initialize_agent(model):
    """Build the browsing ``CodeAgent`` around ``model``.

    The agent gets DuckDuckGo search plus this module's browser tools, may
    import ``helium``, and runs ``save_screenshot`` as a step callback.
    """
    browsing_tools = [DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f]
    agent_settings = {
        "additional_authorized_imports": ["helium"],
        "step_callbacks": [save_screenshot],
        "max_steps": 20,
        "verbosity_level": 2,
    }
    return CodeAgent(tools=browsing_tools, model=model, **agent_settings)
125
+
126
+
127
+ helium_instructions = """
128
+ Use your web_search tool when you want to get Google search results.
129
+ Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites!
130
+ Don't bother about the helium driver, it's already managed.
131
+ We've already ran "from helium import *"
132
+ Then you can go to pages!
133
+ Code:
134
+ ```py
135
+ go_to('github.com/trending')
136
+ ```<end_code>
137
+
138
+ You can directly click clickable elements by inputting the text that appears on them.
139
+ Code:
140
+ ```py
141
+ click("Top products")
142
+ ```<end_code>
143
+
144
+ If it's a link:
145
+ Code:
146
+ ```py
147
+ click(Link("Top products"))
148
+ ```<end_code>
149
+
150
+ If you try to interact with an element and it's not found, you'll get a LookupError.
151
+ In general stop your action after each button click to see what happens on your screenshot.
152
+ Never try to login in a page.
153
+
154
+ To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
155
+ Code:
156
+ ```py
157
+ scroll_down(num_pixels=1200) # This will scroll one viewport down
158
+ ```<end_code>
159
+
160
+ When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
161
+ Just use your built-in tool `close_popups` to close them:
162
+ Code:
163
+ ```py
164
+ close_popups()
165
+ ```<end_code>
166
+
167
+ You can use .exists() to check for the existence of an element. For example:
168
+ Code:
169
+ ```py
170
+ if Text('Accept cookies?').exists():
171
+ click('I accept')
172
+ ```<end_code>
173
+
174
+ Proceed in several steps rather than trying to solve the task in one shot.
175
+ And at the end, only when you have your answer, return your final answer.
176
+ Code:
177
+ ```py
178
+ final_answer("YOUR_ANSWER_HERE")
179
+ ```<end_code>
180
+
181
+ If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
182
+ To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
183
+ Of course, you can act on buttons like a user would do when navigating.
184
+ After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
185
+ But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
186
+ Don't kill the browser.
187
+ When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
188
+ """
189
+
190
+
191
def main(prompt: str, model_type: str, model_id: str) -> None:
    """Load the model, start the browser and run the agent on ``prompt``.

    Args:
        prompt: The user task; ``helium_instructions`` is appended to it.
        model_type: Model class name passed to ``load_model``.
        model_id: Model identifier passed to ``load_model``.
    """
    # The browser tools in this module read the module-level ``driver``.
    global driver

    # Environment variables must be loaded before the model is created.
    load_dotenv()
    llm = load_model(model_type, model_id)

    driver = initialize_driver()
    browser_agent = initialize_agent(llm)

    # Pre-import helium in the agent's executor, then run the full task.
    browser_agent.python_executor("from helium import *")
    task = prompt + helium_instructions
    browser_agent.run(task)
205
+
206
+
207
if __name__ == "__main__":
    # Script entry point: read the CLI options and hand them to main().
    cli_args = parse_arguments()
    main(
        prompt=cli_args.prompt,
        model_type=cli_args.model_type,
        model_id=cli_args.model_id,
    )