feat: browser_automation and gpt4o manager
- app.py (+128 -26)
- multiagent_sandbox.py (+294 -0)
- tools/rag_transformers.py (+0 -0)
- vision_web_browser.py (+211 -0)
app.py
CHANGED
@@ -1,19 +1,26 @@
-
+import os
+import base64
+import math
 import pytz
 import yaml
 import pycountry
+import subprocess
+import sys
 
 from tools.final_answer import FinalAnswerTool
 from tools.visit_webpage import VisitWebpageTool
 from tools.translation import TranslationTool
 from tools.best_model_for_task import HFModelDownloadsTool
+from tools.rag_transformers import retriever_tool
 
 from transformers import pipeline
 from Gradio_UI import GradioUI
-
-import os
-import base64
+from Gradio_UI_with_image import GradioUIImage
 from dotenv import load_dotenv
+from datetime import datetime
+from skimage import io
+from PIL import Image
+from typing import Optional, Tuple
 
 from opentelemetry.sdk.trace import TracerProvider
 from openinference.instrumentation.smolagents import SmolagentsInstrumentor
@@ -26,25 +33,28 @@ from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
 from langchain_core.prompts import PromptTemplate
 from langchain_openai import OpenAI
 
-from
-from
+from io import BytesIO
+from time import sleep
 
+from smolagents.agents import ActionStep
+from smolagents.cli import load_model
 from smolagents import (
     CodeAgent,
     DuckDuckGoSearchTool,
     GoogleSearchTool,
     HfApiModel,
     TransformersModel,
+    OpenAIServerModel,
     load_tool,
     Tool,
     tool,
-    ToolCollection
+    ToolCollection,
+    E2BExecutor
 )
 
 # load .env vars
 load_dotenv()
 
-
 # fast prototyping tools
 @tool
 def get_current_time_in_timezone(timezone: str) -> str:
@@ -107,6 +117,79 @@ def advanced_image_generation(description:str)->Image.Image:
     return pil_image
 
 
+@tool
+def calculate_cargo_travel_time(
+    origin_coords: Tuple[float, float],
+    destination_coords: Tuple[float, float],
+    cruising_speed_kmh: Optional[float] = 750.0,  # Average speed for cargo planes
+) -> float:
+    """
+    Calculate the travel time for a cargo plane between two points on Earth using great-circle distance.
+
+    Args:
+        origin_coords: Tuple of (latitude, longitude) for the starting point
+        destination_coords: Tuple of (latitude, longitude) for the destination
+        cruising_speed_kmh: Optional cruising speed in km/h (defaults to 750 km/h for typical cargo planes)
+
+    Returns:
+        float: The estimated travel time in hours
+
+    Example:
+        >>> # Chicago (41.8781° N, 87.6298° W) to Sydney (33.8688° S, 151.2093° E)
+        >>> result = calculate_cargo_travel_time((41.8781, -87.6298), (-33.8688, 151.2093))
+    """
+
+    def to_radians(degrees: float) -> float:
+        return degrees * (math.pi / 180)
+
+    # Extract coordinates
+    lat1, lon1 = map(to_radians, origin_coords)
+    lat2, lon2 = map(to_radians, destination_coords)
+
+    # Earth's radius in kilometers
+    EARTH_RADIUS_KM = 6371.0
+
+    # Calculate great-circle distance using the haversine formula
+    dlon = lon2 - lon1
+    dlat = lat2 - lat1
+
+    a = (
+        math.sin(dlat / 2) ** 2
+        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
+    )
+    c = 2 * math.asin(math.sqrt(a))
+    distance = EARTH_RADIUS_KM * c
+
+    # Add 10% to account for non-direct routes and air traffic controls
+    actual_distance = distance * 1.1
+
+    # Calculate flight time
+    # Add 1 hour for takeoff and landing procedures
+    flight_time = (actual_distance / cruising_speed_kmh) + 1.0
+
+    # Format the result
+    return round(flight_time, 2)
+
+
+@tool
+def browser_automation(original_user_query: str) -> str:
+    """
+    Browser automation simulates a real user and suits interactive, dynamic sites,
+    or cases where visual navigation should be shown to the user.
+    Navigates the web with helium to answer a user query: vision_web_browser.py appends
+    helium_instructions to the original query and searches for text matches while navigating.
+    Args:
+        original_user_query: The original user query to answer through browser navigation.
+    """
+    # Use sys.executable to ensure the same Python interpreter is used.
+    result = subprocess.run(
+        [sys.executable, "vision_web_browser.py", original_user_query],
+        capture_output=True,  # Captures both stdout and stderr
+        text=True  # Returns output as a string instead of bytes
+    )
+    print("vision_web_browser.py: ", result.stderr)
+    return result.stdout
+
 # telemetry
 def initialize_langfuse_opentelemetry_instrumentation():
     LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
@@ -128,9 +211,10 @@ final_answer = FinalAnswerTool()
 visit_webpage = VisitWebpageTool()
 translation = TranslationTool()
 best_model_for_task = HFModelDownloadsTool()
+transformers_retriever = retriever_tool
 
 # load tools from the smolagents library
-google_web_search = GoogleSearchTool()
+google_web_search = GoogleSearchTool()  # provider="serper" (SERPER_API_KEY) or "serpapi" (default)
 google_web_search.name = "google_web_search"
 duckduckgo_web_search = DuckDuckGoSearchTool()
 duckduckgo_web_search.name = "duckduckgo_web_search"
@@ -148,13 +232,7 @@ image_generation_tool_fast = Tool.from_space(
 )
 
 
-
-model = HfApiModel(
-    max_tokens=2096,
-    temperature=0.5,
-    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',
-    custom_role_conversions=None,
-)
+ceo_model = load_model("LiteLLMModel", "gpt-4o")  # or anthropic/claude-3-sonnet
 
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
@@ -165,26 +243,50 @@ tools = [
     advanced_search_tool,
     google_web_search,
     duckduckgo_web_search,
-    visit_webpage,
+    visit_webpage,
+    browser_automation,
     get_current_time_in_timezone,
     advanced_image_generation,
     image_generation_tool,
+    transformers_retriever,
     language_detection,
-    translation
+    translation,
+    calculate_cargo_travel_time
 ]
 
 agent = CodeAgent(
-    model=
+    model=ceo_model,
     tools=tools,
-    max_steps=
-    verbosity_level=
+    max_steps=20,  # 15 is good for a light manager; too many steps when no manager is needed
+    verbosity_level=2,
     grammar=None,
-    planning_interval=
-    name=
-    description=
-    prompt_templates=prompt_templates
+    planning_interval=5,  # add more steps for heavier reasoning, leave the default if not a manager
+    name="Alfredo",
+    description="CEO",
+    prompt_templates=prompt_templates,
+    # executor_type="e2b",  # security; could also be "docker" (set keys)
+    # sandbox=E2BSandbox() (or E2BExecutor?),
+    # step_callbacks=[save_screenshot],  # todo: configure the web_navigation agent as a separate agent and manage it with Alfredo
+    additional_authorized_imports=[
+        "geopandas",
+        "plotly",
+        "shapely",
+        "json",
+        "pandas",
+        "numpy",
+        "requests",
+        "helium",
+    ],
+    # I could also add the authorized_imports from a LIST_SAFE_MODULES
 )
 
+agent.python_executor("from helium import *")  # agent.state
+
 # agent.push_to_hub('laverdes/Alfredo')
+agent.visualize()
+
+# prompt = ("navigate to a random wikipedia page and give me a summary of the content, then make a single image representing all the content")
+# agent.run(prompt)
 
-GradioUI(agent).launch()
+GradioUI(agent).launch()
+#GradioUIImage(agent).launch()
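Taken together, the app.py changes wire browser automation into a gpt-4o manager: the "Alfredo" CodeAgent (model=ceo_model, i.e. gpt-4o via LiteLLM) now exposes browser_automation, which shells out to vision_web_browser.py and returns that script's stdout as the tool observation. A minimal sketch of driving the agent directly instead of through GradioUI; the prompt is borrowed from vision_web_browser.py's search_request and is illustrative only, and the same .env variables app.py relies on (OpenAI key, Langfuse keys, search API keys) are assumed to be set:

# Hypothetical direct invocation, bypassing the Gradio UI:
answer = agent.run(
    "Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence "
    'containing the word "1992" that mentions a construction accident.'
)
print(answer)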
multiagent_sandbox.py
ADDED
@@ -0,0 +1,294 @@
import os
from e2b_code_interpreter import Sandbox

secure_sandbox = Sandbox()

secure_sandbox.commands.run("pip install smolagents")

def run_code_raise_errors(secure_sandbox, code: str, verbose: bool = False) -> str:
    execution = secure_sandbox.run_code(
        code,
        envs={'HF_TOKEN': os.getenv('HF_TOKEN')}
    )
    if execution.error:
        execution_logs = "\n".join([str(log) for log in execution.logs.stdout])
        logs = execution_logs
        logs += execution.error.traceback
        raise ValueError(logs)
    return "\n".join([str(log) for log in execution.logs.stdout])

alfredo_code = '''  # single-quoted delimiters so the embedded """ docstrings do not close this string
import os
import base64
import math
import pytz
import yaml
import pycountry

from tools.final_answer import FinalAnswerTool
from tools.visit_webpage import VisitWebpageTool
from tools.translation import TranslationTool
from tools.best_model_for_task import HFModelDownloadsTool
from tools.rag_transformers import retriever_tool

from transformers import pipeline
from Gradio_UI import GradioUI
from Gradio_UI_with_image import GradioUIImage
from dotenv import load_dotenv
from datetime import datetime
from skimage import io
from PIL import Image
from typing import Optional, Tuple

from opentelemetry.sdk.trace import TracerProvider
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace.export import SimpleSpanProcessor

from langchain_community.agent_toolkits.load_tools import load_tools
from langchain.chains import LLMChain
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    GoogleSearchTool,
    HfApiModel,
    TransformersModel,
    OpenAIServerModel,
    load_tool,
    Tool,
    tool,
    ToolCollection
)

# load .env vars
load_dotenv()


# fast prototyping tools
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone formatted as '%m/%d/%y %H:%M:%S'
    Args:
        timezone (str): A string representing a valid timezone (e.g., 'America/New_York').
    """
    try:
        tz = pytz.timezone(timezone)
        local_time = datetime.now(tz).strftime('%m/%d/%y %H:%M:%S')
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"


@tool
def language_detection(text: str) -> str:
    """Detects the language of the input text using basic xlm-roberta-base-language-detection.
    Args:
        text: the input message or wording to detect language from.
    """
    model_ckpt = "papluca/xlm-roberta-base-language-detection"
    pipe = pipeline("text-classification", model=model_ckpt)
    preds = pipe(text, return_all_scores=True, truncation=True, max_length=128)
    if preds:
        pred = preds[0]
        language_probabilities_dict = {p["label"]: float(p["score"]) for p in pred}
        predicted_language_code = max(language_probabilities_dict, key=language_probabilities_dict.get)
        tool_prediction_confidence = language_probabilities_dict[predicted_language_code]
        confidence_str = f"Tool Confidence: {tool_prediction_confidence}"
        predicted_language_code_str = f"Predicted language code (ISO 639): {predicted_language_code}/n{confidence_str}"
        try:
            predicted_language = pycountry.languages.get(alpha_2=predicted_language_code)
            if predicted_language:
                predicted_language_str = f"Predicted language: {predicted_language.name}/n{confidence_str}"
                return predicted_language_str
            return predicted_language_code_str

        except Exception as e:
            return f"Error mapping country code to name (pycountry): {str(e)}/n{predicted_language_code_str}"
    else:
        return "None"


@tool
def advanced_image_generation(description: str) -> Image.Image:
    """Generates an image using a textual description.
    Args:
        description: the textual description provided by the user to prompt a text-to-image model
    """
    llm = OpenAI(temperature=0.9)
    prompt = PromptTemplate(
        input_variables=["image_desc"],
        template="Generate a detailed but short prompt (must be less than 900 characters) to generate an image based on the following description: {image_desc}",
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    image_url = DallEAPIWrapper().run(chain.run(description))
    image_array = io.imread(image_url)
    pil_image = Image.fromarray(image_array)
    return pil_image


@tool
def calculate_cargo_travel_time(
    origin_coords: Tuple[float, float],
    destination_coords: Tuple[float, float],
    cruising_speed_kmh: Optional[float] = 750.0,  # Average speed for cargo planes
) -> float:
    """
    Calculate the travel time for a cargo plane between two points on Earth using great-circle distance.

    Args:
        origin_coords: Tuple of (latitude, longitude) for the starting point
        destination_coords: Tuple of (latitude, longitude) for the destination
        cruising_speed_kmh: Optional cruising speed in km/h (defaults to 750 km/h for typical cargo planes)

    Returns:
        float: The estimated travel time in hours

    Example:
        >>> # Chicago (41.8781° N, 87.6298° W) to Sydney (33.8688° S, 151.2093° E)
        >>> result = calculate_cargo_travel_time((41.8781, -87.6298), (-33.8688, 151.2093))
    """

    def to_radians(degrees: float) -> float:
        return degrees * (math.pi / 180)

    # Extract coordinates
    lat1, lon1 = map(to_radians, origin_coords)
    lat2, lon2 = map(to_radians, destination_coords)

    # Earth's radius in kilometers
    EARTH_RADIUS_KM = 6371.0

    # Calculate great-circle distance using the haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    c = 2 * math.asin(math.sqrt(a))
    distance = EARTH_RADIUS_KM * c

    # Add 10% to account for non-direct routes and air traffic controls
    actual_distance = distance * 1.1

    # Calculate flight time
    # Add 1 hour for takeoff and landing procedures
    flight_time = (actual_distance / cruising_speed_kmh) + 1.0

    # Format the result
    return round(flight_time, 2)


# telemetry
def initialize_langfuse_opentelemetry_instrumentation():
    LANGFUSE_PUBLIC_KEY = os.environ.get("LANGFUSE_PUBLIC_KEY")
    LANGFUSE_SECRET_KEY = os.environ.get("LANGFUSE_SECRET_KEY")
    LANGFUSE_AUTH = base64.b64encode(f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()).decode()

    os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://cloud.langfuse.com/api/public/otel"  # EU data region
    os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}"

    trace_provider = TracerProvider()
    trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))

    SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)

initialize_langfuse_opentelemetry_instrumentation()

# load tools from /tools/
final_answer = FinalAnswerTool()
visit_webpage = VisitWebpageTool()
translation = TranslationTool()
best_model_for_task = HFModelDownloadsTool()
transformers_retriever = retriever_tool

# load tools from the smolagents library
google_web_search = GoogleSearchTool()  # provider="serper" (SERPER_API_KEY) or "serpapi" (default)
google_web_search.name = "google_web_search"
duckduckgo_web_search = DuckDuckGoSearchTool()
duckduckgo_web_search.name = "duckduckgo_web_search"

# load tools from hub and langchain
# image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True)  # Tool.from_space("black-forest-labs/FLUX.1-schnell", name="image_generator", description="Generate an image from a prompt")
advanced_search_tool = Tool.from_langchain(load_tools(["searchapi"], allow_dangerous_tools=True)[0])  # serpapi is not real-time scraping
advanced_search_tool.name = "advanced_search_tool"

image_generation_tool_fast = Tool.from_space(
    "black-forest-labs/FLUX.1-schnell",
    name="image_generator",
    description="Generate an image from a prompt"
)


# alternative hf inference endpoint
"""
model = HfApiModel(
    max_tokens=2096,  # 8096 for manager
    temperature=0.5,
    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',  # same as Qwen/Qwen2.5-Coder-32B-Instruct
    custom_role_conversions=None,
)
"""
# also "deepseek-ai/DeepSeek-R1", # and provider="together" (get API key)
ceo_model = OpenAIServerModel(
    max_tokens=8096,  # 2096 or 5000 for other lighter agents (depending on the task)
    temperature=0.5,
    model_id="gpt-4o"
)

with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)

tools = [
    final_answer,
    best_model_for_task,
    advanced_search_tool,
    google_web_search,
    duckduckgo_web_search,
    visit_webpage,
    get_current_time_in_timezone,
    advanced_image_generation,
    image_generation_tool,
    transformers_retriever,
    language_detection,
    translation,
    calculate_cargo_travel_time
]

agent = CodeAgent(
    model=ceo_model,
    tools=tools,
    max_steps=15,  # 15 is good for a light manager; too many steps when no manager is needed
    verbosity_level=2,
    grammar=None,
    planning_interval=5,  # add more steps for heavier reasoning, leave the default if not a manager
    name="Alfredo",
    description="CEO",
    prompt_templates=prompt_templates,
    additional_authorized_imports=[
        "geopandas",
        "plotly",
        "shapely",
        "json",
        "pandas",
        "numpy",
        "requests"
    ],
)

# agent.push_to_hub('laverdes/Alfredo')
agent.visualize()

GradioUI(agent).launch()
#GradioUIImage(agent).launch()
'''

execution_logs = run_code_raise_errors(secure_sandbox, alfredo_code)
print(execution_logs)

# todo: clean errors
# todo: the sandbox is meant for a single execution, not Gradio and not real-time user input()
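For quick verification of the sandbox helper on its own, a minimal smoke test can be run before shipping the full alfredo_code payload. This sketch reuses the secure_sandbox and run_code_raise_errors defined above and assumes an E2B_API_KEY (and optionally HF_TOKEN) is present in the environment; the snippet string is illustrative:

# Hypothetical smoke test for run_code_raise_errors; prints the sandboxed stdout.
smoke_test = "print('hello from the e2b sandbox')"
print(run_code_raise_errors(secure_sandbox, smoke_test))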
tools/rag_transformers.py
ADDED
File without changes
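tools/rag_transformers.py is added empty here, yet app.py already imports retriever_tool from it, so that import will fail until the module defines the name. For orientation only, a hypothetical sketch of a minimal retriever tool in the shape smolagents expects; the class name, corpus, and BM25 parameters are placeholders and not part of this commit:

# Hypothetical content for tools/rag_transformers.py (not part of this commit).
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever
from smolagents import Tool


class RetrieverTool(Tool):
    name = "transformers_retriever"
    description = "Retrieves documentation snippets relevant to the query."
    inputs = {"query": {"type": "string", "description": "The search query."}}
    output_type = "string"

    def __init__(self, docs, **kwargs):
        super().__init__(**kwargs)
        # BM25 keyword retriever over the provided documents.
        self.retriever = BM25Retriever.from_documents(docs, k=5)

    def forward(self, query: str) -> str:
        docs = self.retriever.invoke(query)
        return "\n\n".join(doc.page_content for doc in docs)


# Placeholder corpus so the module exposes the name app.py imports.
retriever_tool = RetrieverTool(docs=[
    Document(page_content="Transformers documentation placeholder."),
    Document(page_content="Second placeholder document."),
])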
vision_web_browser.py
ADDED
@@ -0,0 +1,211 @@
import argparse
from io import BytesIO
from time import sleep
import time

import helium
import PIL.Image
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
from smolagents.agents import ActionStep
from smolagents.cli import load_model


github_request = """
I'm trying to find how hard I have to work to get a repo in github.com/trending.
Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
"""  # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.

search_request = """
Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
"""


def parse_arguments():
    parser = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")
    parser.add_argument(
        "prompt",
        type=str,
        nargs="?",  # Makes it optional
        default=search_request,
        help="The prompt to run with the agent",
    )
    parser.add_argument(
        "--model-type",
        type=str,
        default="LiteLLMModel",
        help="The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)",
    )
    parser.add_argument(
        "--model-id",
        type=str,
        default="gpt-4o",
        help="The model ID to use for the specified model type",
    )
    return parser.parse_args()


def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    current_step = memory_step.step_number
    if driver is not None:
        for previous_memory_step in agent.memory.steps:  # Remove previous screenshots from logs for lean processing
            if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
                previous_memory_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = PIL.Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
    return


@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.
    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f"Focused on element {nth_result} of {len(elements)}"
    return result


@tool
def go_back() -> None:
    """Goes back to previous page."""
    driver.back()


@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()


def initialize_driver():
    """Initialize the Selenium WebDriver."""
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--force-device-scale-factor=1")
    chrome_options.add_argument("--window-size=1000,1350")
    chrome_options.add_argument("--disable-pdf-viewer")
    chrome_options.add_argument("--window-position=0,0")
    return helium.start_chrome(headless=False, options=chrome_options)


def initialize_agent(model):
    """Initialize the CodeAgent with the specified model."""
    return CodeAgent(
        tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
        model=model,
        additional_authorized_imports=["helium"],
        step_callbacks=[save_screenshot],
        max_steps=20,
        verbosity_level=2,
    )


helium_instructions = """
Use your web_search tool when you want to get Google search results.
Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites!
Don't bother about the helium driver, it's already managed.
We've already ran "from helium import *"
Then you can go to pages!
Code:
```py
go_to('github.com/trending')
```<end_code>

You can directly click clickable elements by inputting the text that appears on them.
Code:
```py
click("Top products")
```<end_code>

If it's a link:
Code:
```py
click(Link("Top products"))
```<end_code>

If you try to interact with an element and it's not found, you'll get a LookupError.
In general stop your action after each button click to see what happens on your screenshot.
Never try to login in a page.

To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
Code:
```py
scroll_down(num_pixels=1200)  # This will scroll one viewport down
```<end_code>

When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
Just use your built-in tool `close_popups` to close them:
Code:
```py
close_popups()
```<end_code>

You can use .exists() to check for the existence of an element. For example:
Code:
```py
if Text('Accept cookies?').exists():
    click('I accept')
```<end_code>

Proceed in several steps rather than trying to solve the task in one shot.
And at the end, only when you have your answer, return your final answer.
Code:
```py
final_answer("YOUR_ANSWER_HERE")
```<end_code>

If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
Of course, you can act on buttons like a user would do when navigating.
After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
Don't kill the browser.
When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
"""


def main(prompt: str, model_type: str, model_id: str) -> None:
    # Load environment variables
    load_dotenv()

    # Initialize the model based on the provided arguments
    model = load_model(model_type, model_id)

    global driver
    driver = initialize_driver()
    agent = initialize_agent(model)

    # Run the agent with the provided prompt
    agent.python_executor("from helium import *")
    agent.run(prompt + helium_instructions)


if __name__ == "__main__":
    # Parse command line arguments
    args = parse_arguments()

    main(args.prompt, args.model_type, args.model_id)
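Besides being launched as a subprocess by app.py's browser_automation tool, the script can also be driven programmatically. A small hedged sketch using only names defined in the file above; importing the module only defines functions and strings (the __main__ guard prevents any browser from starting on import), and the model type and id shown are the script's own defaults:

# Hypothetical programmatic use, equivalent to the CLI defaults:
from vision_web_browser import main, search_request

main(search_request, "LiteLLMModel", "gpt-4o")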