Alfredo

Runtime error

App Files Files Community

laverdes committed on Mar 27

Commit

84ac217

1 Parent(s): be9a1ee

feat: browser_automation and gpt4o manager

Browse files

Files changed (4) hide show

app.py +128 -26
multiagent_sandbox.py +294 -0
tools/rag_transformers.py +0 -0
vision_web_browser.py +211 -0

app.py CHANGED Viewed

@@ -1,19 +1,26 @@
-from datetime import datetime
 import pytz
 import yaml
 import pycountry
 from tools.final_answer import FinalAnswerTool
 from tools.visit_webpage import VisitWebpageTool
 from tools.translation import TranslationTool
 from tools.best_model_for_task import HFModelDownloadsTool
 from transformers import pipeline
 from Gradio_UI import GradioUI
-import os
-import base64
 from dotenv import load_dotenv
 from opentelemetry.sdk.trace import TracerProvider
 from openinference.instrumentation.smolagents import SmolagentsInstrumentor
@@ -26,25 +33,28 @@ from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
 from langchain_core.prompts import PromptTemplate
 from langchain_openai import OpenAI
-from skimage import io
-from PIL import Image
 from smolagents import (
     CodeAgent,
     DuckDuckGoSearchTool,
     GoogleSearchTool,
     HfApiModel,
     TransformersModel,
     load_tool,
     Tool,
     tool,
-    ToolCollection
 )
 # load .env vars
 load_dotenv()
 # fast prototyping tools
 @tool
 def get_current_time_in_timezone(timezone: str) -> str:
@@ -107,6 +117,79 @@ def advanced_image_generation(description:str)->Image.Image:
     return pil_image
 # telemetry
 def initialize_langfuse_opentelemetry_instrumentation():
     LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
@@ -128,9 +211,10 @@ final_answer = FinalAnswerTool()
 visit_webpage = VisitWebpageTool()
 translation = TranslationTool()
 best_model_for_task = HFModelDownloadsTool()
 # load tools from smolagents library
-google_web_search = GoogleSearchTool()
 google_web_search.name = "google_web_search"
 duckduckgo_web_search = DuckDuckGoSearchTool()
 duckduckgo_web_search.name = "duckduckgo_web_search"
@@ -148,13 +232,7 @@ image_generation_tool_fast = Tool.from_space(
 )
-# alternative hf inference endpoint
-model = HfApiModel(
-max_tokens=2096,
-temperature=0.5,
-model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',
-custom_role_conversions=None,
-)
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
@@ -165,26 +243,50 @@ tools = [
         advanced_search_tool,
         google_web_search,
         duckduckgo_web_search,
-        visit_webpage,
         get_current_time_in_timezone,
         advanced_image_generation,
         image_generation_tool,
         language_detection,
-        translation
     ]
 agent = CodeAgent(
-    model=model,
     tools=tools,
-    max_steps=10,
-    verbosity_level=1,
     grammar=None,
-    planning_interval=None,
-    name=None,
-    description=None,
-    prompt_templates=prompt_templates
 )
 # agent.push_to_hub('laverdes/Alfredo')
-GradioUI(agent).launch()

+import os
+import base64
+import math
 import pytz
 import yaml
 import pycountry
+import subprocess
+import sys
 from tools.final_answer import FinalAnswerTool
 from tools.visit_webpage import VisitWebpageTool
 from tools.translation import TranslationTool
 from tools.best_model_for_task import HFModelDownloadsTool
+from tools.rag_transformers import retriever_tool
 from transformers import pipeline
 from Gradio_UI import GradioUI
+from Gradio_UI_with_image import GradioUIImage
 from dotenv import load_dotenv
+from datetime import datetime
+from skimage import io
+from PIL import Image
+from typing import Optional, Tuple
 from opentelemetry.sdk.trace import TracerProvider
 from openinference.instrumentation.smolagents import SmolagentsInstrumentor
 from langchain_core.prompts import PromptTemplate
 from langchain_openai import OpenAI
+from io import BytesIO
+from time import sleep
+from smolagents.agents import ActionStep
+from smolagents.cli import load_model
 from smolagents import (
     CodeAgent,
     DuckDuckGoSearchTool,
     GoogleSearchTool,
     HfApiModel,
     TransformersModel,
+    OpenAIServerModel,
     load_tool,
     Tool,
     tool,
+    ToolCollection,
+    E2BExecutor
 )
 # load .env vars
 load_dotenv()
 # fast prototyping tools
 @tool
 def get_current_time_in_timezone(timezone: str) -> str:
     return pil_image
+@tool
def calculate_cargo_travel_time(
    origin_coords: Tuple[float, float],
    destination_coords: Tuple[float, float],
    cruising_speed_kmh: Optional[float] = 750.0,  # Average speed for cargo planes
) -> float:
    """
    Calculate the travel time for a cargo plane between two points on Earth using great-circle distance.

    Args:
        origin_coords: Tuple of (latitude, longitude) in decimal degrees for the starting point
        destination_coords: Tuple of (latitude, longitude) in decimal degrees for the destination
        cruising_speed_kmh: Optional cruising speed in km/h (defaults to 750 km/h for typical cargo planes, also used when None is passed)

    Returns:
        float: The estimated travel time in hours, rounded to two decimals

    Raises:
        ValueError: If cruising_speed_kmh is zero or negative

    Example:
        >>> # Chicago (41.8781° N, 87.6298° W) to Sydney (33.8688° S, 151.2093° E)
        >>> result = calculate_cargo_travel_time((41.8781, -87.6298), (-33.8688, 151.2093))
    """
    default_speed_kmh = 750.0
    earth_radius_km = 6371.0
    route_overhead = 1.1  # +10% for non-direct routes and air traffic controls
    ground_procedures_h = 1.0  # takeoff and landing procedures

    # The parameter is Optional, so None must select the default instead of crashing.
    speed_kmh = default_speed_kmh if cruising_speed_kmh is None else cruising_speed_kmh
    if speed_kmh <= 0:
        raise ValueError(f"cruising_speed_kmh must be positive, got {speed_kmh!r}")

    lat1, lon1 = map(math.radians, origin_coords)
    lat2, lon2 = map(math.radians, destination_coords)

    # Haversine formula for the great-circle distance.
    half_dlat = (lat2 - lat1) / 2
    half_dlon = (lon2 - lon1) / 2
    a = math.sin(half_dlat) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(half_dlon) ** 2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal points,
    # which would make asin raise a math domain error.
    a = min(1.0, max(0.0, a))
    central_angle = 2 * math.asin(math.sqrt(a))

    actual_distance = earth_radius_km * central_angle * route_overhead
    flight_time = actual_distance / speed_kmh + ground_procedures_h
    return round(flight_time, 2)
+@tool
def browser_automation(original_user_query:str)->str:
    """
    Browser automation is like "simulating a real user" and works for interactive,
    dynamic sites and when visual navigation is required to show the process to the user.
    Navigates the web using helium to answer a user query by appending helium_instructions to the original query
    and by searching for text matches through the navigation.

    Args:
        original_user_query: The original user query, passed unchanged as the prompt of the browsing script

    Returns:
        str: The standard output of the browsing run, or an error report when the run exits with a non-zero status
    """
    # Use sys.executable to ensure the same Python interpreter is used.
    # The query travels as a single argv item (no shell), so it needs no quoting.
    result = subprocess.run(
        [sys.executable, "vision_web_browser.py", original_user_query],
        capture_output=True,  # Captures both stdout and stderr
        text=True  # Returns output as a string instead of bytes
    )
    print("vision_web_browser.py: ", result.stderr)
    if result.returncode != 0:
        # A crashed run would otherwise come back as an empty string, which the
        # calling agent cannot tell apart from "nothing found".
        return f"Browser automation failed (exit code {result.returncode}): {result.stderr.strip()}"
    return result.stdout
 # telemetry
 def initialize_langfuse_opentelemetry_instrumentation():
     LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
 visit_webpage = VisitWebpageTool()
 translation = TranslationTool()
 best_model_for_task = HFModelDownloadsTool()
+transformers_retriever = retriever_tool
 # load tools from smolagents library
+google_web_search = GoogleSearchTool()  # provider="serper" (SERPER_API_KEY) or "serpapi" (default)
 google_web_search.name = "google_web_search"
 duckduckgo_web_search = DuckDuckGoSearchTool()
 duckduckgo_web_search.name = "duckduckgo_web_search"
 )
+ceo_model = load_model("LiteLLMModel", "gpt-4o")   # or anthropic/claude-3-sonnet
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
         advanced_search_tool,
         google_web_search,
         duckduckgo_web_search,
+        visit_webpage,
+        browser_automation,
         get_current_time_in_timezone,
         advanced_image_generation,
         image_generation_tool,
+        transformers_retriever,
         language_detection,
+        translation,
+        calculate_cargo_travel_time
     ]
 agent = CodeAgent(
+    model=ceo_model,
     tools=tools,
+    max_steps=20,  # 15 is good for a light manager, too much when there is no need of a manager
+    verbosity_level=2,
     grammar=None,
+    planning_interval=5,  # (add more steps for heavier reasoning, leave default if not manager)
+    name="Alfredo",
+    description="CEO",
+    prompt_templates=prompt_templates,
+    # executor_type="e2b",  # security, could also be "docker" (set keys)
+    # sandbox=E2BSandbox()  (or E2BExecutor?),
+    # step_callbacks=[save_screenshot],  # todo: configure the web_navigation agent as a separate agent and mangage it with alfred
+    additional_authorized_imports=[
+        "geopandas",
+        "plotly",
+        "shapely",
+        "json",
+        "pandas",
+        "numpy",
+        "requests",
+        "helium",
+    ],
+    # I could also add the authorized_imports from a LIST_SAFE_MODULES
 )
+agent.python_executor("from helium import *")   # agent.state
 # agent.push_to_hub('laverdes/Alfredo')
+agent.visualize()
+# prompt = ("navigate to a random wikipedia page and give me a summary of the content, then make a single image representing all the content")
+# agent.run(prompt)
+GradioUI(agent).launch()
+#GradioUIImage(agent).launch()

multiagent_sandbox.py ADDED Viewed

	@@ -0,0 +1,294 @@

+from e2b_code_interpreter import Sandbox
+secure_sandbox = Sandbox()
+secure_sandbox.commands.run("pip install smolagents")
def run_code_raise_errors(secure_sandbox, code: str, verbose: bool = False) -> str:
    """Run *code* in the sandbox and return what it wrote to stdout.

    Args:
        secure_sandbox: Object exposing ``run_code(code, envs=...)`` whose result
            has ``error`` and ``logs.stdout`` attributes.
        code: Source code to execute inside the sandbox.
        verbose: When True, also print the captured stdout.

    Returns:
        The captured stdout entries joined with newlines.

    Raises:
        ValueError: If the execution reports an error; the message holds the
            captured stdout followed by the error traceback.
    """
    # Imported here because this module never imports `os` at top level.
    import os

    hf_token = os.getenv("HF_TOKEN")
    execution = secure_sandbox.run_code(
        code,
        # Only forward the token when it is actually set; a None value is not a valid env entry.
        envs={"HF_TOKEN": hf_token} if hf_token else None,
    )
    stdout_text = "\n".join(str(log) for log in execution.logs.stdout)
    if verbose:
        print(stdout_text)
    if execution.error:
        # Keep stdout and traceback on separate lines so the report stays readable.
        report = [stdout_text] if stdout_text else []
        report.append(str(execution.error.traceback))
        raise ValueError("\n".join(report))
    return stdout_text
# Source of the Alfredo agent script that gets executed inside the sandbox.
# The literal is delimited with ''' because the embedded script contains
# """docstrings"""; delimiting it with """ ends the string at the first
# embedded docstring and makes this module a SyntaxError.
alfredo_code = '''
import os
import base64
import math
import pytz
import yaml
import pycountry
from tools.final_answer import FinalAnswerTool
from tools.visit_webpage import VisitWebpageTool
from tools.translation import TranslationTool
from tools.best_model_for_task import HFModelDownloadsTool
from tools.rag_transformers import retriever_tool
from transformers import pipeline
from Gradio_UI import GradioUI
from Gradio_UI_with_image import GradioUIImage
from dotenv import load_dotenv
from datetime import datetime
from skimage import io
from PIL import Image
from typing import Optional, Tuple
from opentelemetry.sdk.trace import TracerProvider
from openinference.instrumentation.smolagents import SmolagentsInstrumentor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
from langchain_community.agent_toolkits.load_tools import load_tools
from langchain.chains import LLMChain
from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from smolagents import (
    CodeAgent,
    DuckDuckGoSearchTool,
    GoogleSearchTool,
    HfApiModel,
    TransformersModel,
    OpenAIServerModel,
    load_tool,
    Tool,
    tool,
    ToolCollection
)
# load .env vars
load_dotenv()
# fast prototyping tools
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone formatted as '%m/%d/%y %H:%M:%S'
    Args:
        timezone (str): A string representing a valid timezone (e.g., 'America/New_York').
    """
    try:
        tz = pytz.timezone(timezone)
        local_time = datetime.now(tz).strftime('%m/%d/%y %H:%M:%S')
        return f"The current local time in {timezone} is: {local_time}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"
@tool
def language_detection(text:str)-> str:
    """Detects the language of the input text using basic xlm-roberta-base-language-detection.
     Args:
        text: the input message or wording to detect language from.
    """
    model_ckpt = "papluca/xlm-roberta-base-language-detection"
    pipe = pipeline("text-classification", model=model_ckpt)
    preds = pipe(text, return_all_scores=True, truncation=True, max_length=128)
    if preds:
        pred = preds[0]
        language_probabilities_dict = {p["label"]: float(p["score"]) for p in pred}
        predicted_language_code = max(language_probabilities_dict, key=language_probabilities_dict.get)
        tool_prediction_confidence = language_probabilities_dict[predicted_language_code]
        confidence_str = f"Tool Confidence: {tool_prediction_confidence}"
        predicted_language_code_str = f"Predicted language code (ISO 639): {predicted_language_code}/n{confidence_str}"
        try:
            predicted_language = pycountry.languages.get(alpha_2=predicted_language_code)
            if predicted_language:
                predicted_language_str = f"Predicted language: {predicted_language.name}/n{confidence_str}"
                return predicted_language_str
            return predicted_language_code_str
        except Exception as e:
            return f"Error mapping country code to name (pycountry): {str(e)}/n{predicted_language_code_str}"
    else:
        return "None"
@tool
def advanced_image_generation(description:str)->Image.Image:
    """Generates an image using a textual description.
         Args:
            description: the textual description provided by the user to prompt a text-to-image model
        """
    llm = OpenAI(temperature=0.9)
    prompt = PromptTemplate(
        input_variables=["image_desc"],
        template="Generate a detailed but short prompt (must be less than 900 characters) to generate an image based on the following description: {image_desc}",
    )
    chain = LLMChain(llm=llm, prompt=prompt)
    image_url = DallEAPIWrapper().run(chain.run(description))
    image_array = io.imread(image_url)
    pil_image = Image.fromarray(image_array)
    return pil_image
@tool
def calculate_cargo_travel_time(
    origin_coords: Tuple[float, float],
    destination_coords: Tuple[float, float],
    cruising_speed_kmh: Optional[float] = 750.0,  # Average speed for cargo planes
) -> float:
    """
    Calculate the travel time for a cargo plane between two points on Earth using great-circle distance.
    Args:
        origin_coords: Tuple of (latitude, longitude) for the starting point
        destination_coords: Tuple of (latitude, longitude) for the destination
        cruising_speed_kmh: Optional cruising speed in km/h (defaults to 750 km/h for typical cargo planes)
    Returns:
        float: The estimated travel time in hours
    Example:
        >>> # Chicago (41.8781° N, 87.6298° W) to Sydney (33.8688° S, 151.2093° E)
        >>> result = calculate_cargo_travel_time((41.8781, -87.6298), (-33.8688, 151.2093))
    """
    def to_radians(degrees: float) -> float:
        return degrees * (math.pi / 180)
    # Extract coordinates
    lat1, lon1 = map(to_radians, origin_coords)
    lat2, lon2 = map(to_radians, destination_coords)
    # Earth's radius in kilometers
    EARTH_RADIUS_KM = 6371.0
    # Calculate great-circle distance using the haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = (
        math.sin(dlat / 2) ** 2
        + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    )
    c = 2 * math.asin(math.sqrt(a))
    distance = EARTH_RADIUS_KM * c
    # Add 10% to account for non-direct routes and air traffic controls
    actual_distance = distance * 1.1
    # Calculate flight time
    # Add 1 hour for takeoff and landing procedures
    flight_time = (actual_distance / cruising_speed_kmh) + 1.0
    # Format the results
    return round(flight_time, 2)
# telemetry
def initialize_langfuse_opentelemetry_instrumentation():
    LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY")
    LANGFUSE_SECRET_KEY=os.environ.get("LANGFUSE_SECRET_KEY")
    LANGFUSE_AUTH=base64.b64encode(f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()).decode()
    os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://cloud.langfuse.com/api/public/otel" # EU data region
    os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}"
    trace_provider = TracerProvider()
    trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter()))
    SmolagentsInstrumentor().instrument(tracer_provider=trace_provider)
initialize_langfuse_opentelemetry_instrumentation()
# load tools from /tools/
final_answer = FinalAnswerTool()
visit_webpage = VisitWebpageTool()
translation = TranslationTool()
best_model_for_task = HFModelDownloadsTool()
transformers_retriever = retriever_tool
# load tools from smoloagents library
google_web_search = GoogleSearchTool()  # provider="serper" (SERPER_API_KEY) or "serpapi" (default)
google_web_search.name = "google_web_search"
duckduckgo_web_search = DuckDuckGoSearchTool()
duckduckgo_web_search.name = "duckduckgo_web_search"
# load tools from hub and langchain
# image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True)  # Tool.from_space("black-forest-labs/FLUX.1-schnell", name="image_generator", description="Generate an image from a prompt")
advanced_search_tool = Tool.from_langchain(load_tools(["searchapi"], allow_dangerous_tools=True)[0])  # serpapi is not real time scrapping
advanced_search_tool.name = "advanced_search_tool"
image_generation_tool_fast = Tool.from_space(
    "black-forest-labs/FLUX.1-schnell",
    name="image_generator",
    description="Generate an image from a prompt"
)
# alternative hf inference endpoint
"""
model = HfApiModel(
max_tokens=2096,  # 8096 for manager
temperature=0.5,
model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',  # same as Qwen/Qwen2.5-Coder-32B-Instruct
custom_role_conversions=None,
)
"""
# also "deepseek-ai/DeepSeek-R1",  # and provider="together" (get API key)
ceo_model = OpenAIServerModel(
    max_tokens=8096,  # 2096 or 5000 for other ligher agents (depending on the task)
    temperature=0.5,
    model_id="gpt-4o"
)
with open("prompts.yaml", 'r') as stream:
    prompt_templates = yaml.safe_load(stream)
tools = [
        final_answer,
        best_model_for_task,
        advanced_search_tool,
        google_web_search,
        duckduckgo_web_search,
        visit_webpage,
        get_current_time_in_timezone,
        advanced_image_generation,
        image_generation_tool,
        transformers_retriever,
        language_detection,
        translation,
        calculate_cargo_travel_time
    ]
agent = CodeAgent(
    model=ceo_model,
    tools=tools,
    max_steps=15,  # 15 is good for a light manager, too much when there is no need of a manager
    verbosity_level=2,
    grammar=None,
    planning_interval=5,  # (add more steps for heavier reasoning, leave default if not manager)
    name="Alfredo",
    description="CEO",
    prompt_templates=prompt_templates,
    additional_authorized_imports=[
        "geopandas",
        "plotly",
        "shapely",
        "json",
        "pandas",
        "numpy",
        "requests"
    ],
)
# agent.push_to_hub('laverdes/Alfredo')
agent.visualize()
GradioUI(agent).launch()
#GradioUIImage(agent).launch()
'''
# Execute the agent script in the sandbox and show what it printed.
# The script lives in `alfredo_code`; `agent_code` was never defined (NameError).
execution_logs = run_code_raise_errors(secure_sandbox, alfredo_code)
print(execution_logs)
# todo: clean errors
# todo: the sandbox is to use in a single execution, not gradio and not receiving real-time user input()

tools/rag_transformers.py ADDED Viewed

File without changes

vision_web_browser.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import argparse
+from io import BytesIO
+from time import sleep
+import time
+import helium
+import PIL.Image
+from dotenv import load_dotenv
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.common.keys import Keys
+from smolagents import CodeAgent, DuckDuckGoSearchTool, tool
+from smolagents.agents import ActionStep
+from smolagents.cli import load_model
+github_request = """
+I'm trying to find how hard I have to work to get a repo in github.com/trending.
+Can you navigate to the profile for the top author of the top trending repo, and give me their total number of commits over the last year?
+"""  # The agent is able to achieve this request only when powered by GPT-4o or Claude-3.5-sonnet.
+search_request = """
+Please navigate to https://en.wikipedia.org/wiki/Chicago and give me a sentence containing the word "1992" that mentions a construction accident.
+"""
def parse_arguments():
    """Parse the command line of the browser automation script.

    Returns:
        The argparse namespace with ``prompt``, ``model_type`` and ``model_id``.
    """
    cli = argparse.ArgumentParser(description="Run a web browser automation script with a specified model.")
    # Positional prompt; nargs="?" makes it optional so the demo request is used when omitted.
    cli.add_argument(
        "prompt",
        type=str,
        nargs="?",
        default=search_request,
        help="The prompt to run with the agent",
    )
    # String options that differ only in flag, default and help text.
    string_options = (
        (
            "--model-type",
            "LiteLLMModel",
            "The model type to use (e.g., OpenAIServerModel, LiteLLMModel, TransformersModel, HfApiModel)",
        ),
        (
            "--model-id",
            "gpt-4o",
            "The model ID to use for the specified model type",
        ),
    )
    for flag, fallback, help_text in string_options:
        cli.add_argument(flag, type=str, default=fallback, help=help_text)
    return cli.parse_args()
def save_screenshot(memory_step: ActionStep, agent: CodeAgent) -> None:
    """Step callback that attaches a browser screenshot and the current URL to *memory_step*.

    Screenshots stored on action steps that are two or more steps behind the
    current one are cleared first to keep the agent memory lean. When helium
    reports no active driver the step is left untouched.

    Args:
        memory_step: The step that just finished; its ``observations_images``
            and ``observations`` are updated in place.
        agent: The agent whose ``memory.steps`` are pruned of old screenshots.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    if driver is None:
        # No browser session: nothing to capture, and reading driver.current_url
        # would raise AttributeError.
        return

    current_step = memory_step.step_number
    for previous_memory_step in agent.memory.steps:  # Remove previous screenshots from logs for lean processing
        if isinstance(previous_memory_step, ActionStep) and previous_memory_step.step_number <= current_step - 2:
            previous_memory_step.observations_images = None

    png_bytes = driver.get_screenshot_as_png()
    image = PIL.Image.open(BytesIO(png_bytes))
    print(f"Captured a browser screenshot: {image.size} pixels")
    memory_step.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!

    # Update observations with current URL
    url_info = f"Current url: {driver.current_url}"
    memory_step.observations = (
        url_info if memory_step.observations is None else memory_step.observations + "\n" + url_info
    )
+@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.

    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to, counted from 1 (default: 1)

    Raises:
        ValueError: If nth_result is smaller than 1
        Exception: If the page contains fewer than nth_result matches
    """
    def xpath_literal(value: str) -> str:
        # XPath 1.0 has no escape character: choose the quote style the value
        # does not contain, or stitch the pieces together with concat().
        if "'" not in value:
            return f"'{value}'"
        if '"' not in value:
            return f'"{value}"'
        pieces = ", \"'\", ".join(f"'{piece}'" for piece in value.split("'"))
        return f"concat({pieces})"

    if nth_result < 1:
        # 0 or a negative value would silently index from the end of the match list.
        raise ValueError(f"nth_result must be 1 or greater, got {nth_result}")

    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), {xpath_literal(text)})]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")

    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    return f"Found {len(elements)} matches for '{text}'. Focused on element {nth_result} of {len(elements)}"
+@tool
def go_back() -> None:
    """Goes back to previous page."""
    # `driver` is the module-level browser driver that main() assigns.
    browser = driver
    browser.back()
    return None
+@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    # Sending ESCAPE to the page is what dismisses the modal.
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # The signature promises a str; returning a confirmation gives the agent an
    # observation instead of None.
    return "Sent the ESCAPE key to dismiss any open pop-up."
def initialize_driver(headless: bool = False):
    """Start Chrome through helium with a fixed window geometry.

    Args:
        headless: Passed to ``helium.start_chrome``; keep the default ``False``
            for a visible window, set ``True`` on hosts without a display.

    Returns:
        Whatever ``helium.start_chrome`` returns (the browser driver).
    """
    chrome_options = webdriver.ChromeOptions()
    for flag in (
        "--force-device-scale-factor=1",
        "--window-size=1000,1350",
        "--disable-pdf-viewer",
        "--window-position=0,0",
    ):
        chrome_options.add_argument(flag)
    return helium.start_chrome(headless=headless, options=chrome_options)
def initialize_agent(model):
    """Initialize the CodeAgent with the specified model."""
    # Search tool plus the three browser helpers defined in this module.
    browsing_tools = [DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f]
    agent_settings = {
        "tools": browsing_tools,
        "model": model,
        "additional_authorized_imports": ["helium"],
        # save_screenshot is registered as a step callback.
        "step_callbacks": [save_screenshot],
        "max_steps": 20,
        "verbosity_level": 2,
    }
    return CodeAgent(**agent_settings)
+helium_instructions = """
+Use your web_search tool when you want to get Google search results.
+Then you can use helium to access websites. Don't use helium for Google search, only for navigating websites!
+Don't bother about the helium driver, it's already managed.
+We've already ran "from helium import *"
+Then you can go to pages!
+Code:
+```py
+go_to('github.com/trending')
+```<end_code>
+You can directly click clickable elements by inputting the text that appears on them.
+Code:
+```py
+click("Top products")
+```<end_code>
+If it's a link:
+Code:
+```py
+click(Link("Top products"))
+```<end_code>
+If you try to interact with an element and it's not found, you'll get a LookupError.
+In general stop your action after each button click to see what happens on your screenshot.
+Never try to login in a page.
+To scroll up or down, use scroll_down or scroll_up with as an argument the number of pixels to scroll from.
+Code:
+```py
+scroll_down(num_pixels=1200) # This will scroll one viewport down
+```<end_code>
+When you have pop-ups with a cross icon to close, don't try to click the close icon by finding its element or targeting an 'X' element (this most often fails).
+Just use your built-in tool `close_popups` to close them:
+Code:
+```py
+close_popups()
+```<end_code>
+You can use .exists() to check for the existence of an element. For example:
+Code:
+```py
+if Text('Accept cookies?').exists():
+    click('I accept')
+```<end_code>
+Proceed in several steps rather than trying to solve the task in one shot.
+And at the end, only when you have your answer, return your final answer.
+Code:
+```py
+final_answer("YOUR_ANSWER_HERE")
+```<end_code>
+If pages seem stuck on loading, you might have to wait, for instance `import time` and run `time.sleep(5.0)`. But don't overuse this!
+To list elements on page, DO NOT try code-based element searches like 'contributors = find_all(S("ol > li"))': just look at the latest screenshot you have and read it visually, or use your tool search_item_ctrl_f.
+Of course, you can act on buttons like a user would do when navigating.
+After each code blob you write, you will be automatically provided with an updated screenshot of the browser and the current browser url.
+But beware that the screenshot will only be taken at the end of the whole action, it won't see intermediate states.
+Don't kill the browser.
+When you have modals or cookie banners on screen, you should get rid of them before you can click anything else.
+"""
def main(prompt: str, model_type: str, model_id: str) -> None:
    """Run one browsing session for *prompt* with the requested model.

    Args:
        prompt: The user request; ``helium_instructions`` is appended to it.
        model_type: Model class name handed to ``load_model``.
        model_id: Model identifier handed to ``load_model``.
    """
    # The tools in this module read the browser through the module-level `driver`.
    global driver

    load_dotenv()
    llm = load_model(model_type, model_id)
    driver = initialize_driver()
    browsing_agent = initialize_agent(llm)

    # Pre-run the helium star import in the agent's executor, as the instructions tell the model.
    browsing_agent.python_executor("from helium import *")
    task = prompt + helium_instructions
    browsing_agent.run(task)
+if __name__ == "__main__":
+    # Parse command line arguments
+    args = parse_arguments()
+    main(args.prompt, args.model_type, args.model_id)