import os import base64 import math import pytz import torch import yaml import pycountry import subprocess import sys import numpy as np import sounddevice as sd from tools.final_answer import FinalAnswerTool from tools.visit_webpage import VisitWebpageTool from tools.translation import TranslationTool from tools.best_model_for_task import HFModelDownloadsTool from tools.rag_transformers import retriever_tool from transformers import pipeline from Gradio_UI import GradioUI from Gradio_UI_with_image import GradioUIImage from dotenv import load_dotenv from datetime import datetime from skimage import io from PIL import Image from typing import Optional, Tuple from opentelemetry.sdk.trace import TracerProvider from openinference.instrumentation.smolagents import SmolagentsInstrumentor from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter from opentelemetry.sdk.trace.export import SimpleSpanProcessor from langchain_community.agent_toolkits.load_tools import load_tools from langchain.chains import LLMChain from langchain_community.utilities.dalle_image_generator import DallEAPIWrapper from langchain_core.prompts import PromptTemplate from langchain_openai import OpenAI from transformers import AutoTokenizer from io import BytesIO from time import sleep from smolagents.utils import BASE_BUILTIN_MODULES from smolagents.agents import ActionStep from smolagents.cli import load_model from smolagents import ( CodeAgent, DuckDuckGoSearchTool, GoogleSearchTool, HfApiModel, TransformersModel, OpenAIServerModel, load_tool, Tool, tool, ToolCollection, E2BExecutor ) # load .env vars load_dotenv() BASE_BUILTIN_MODULES.remove("re") # fast prototyping tools @tool def get_current_time_in_timezone(timezone: str) -> str: """A tool that fetches the current local time in a specified timezone formatted as '%m/%d/%y %H:%M:%S' Args: timezone (str): A string representing a valid timezone (e.g., 'America/New_York'). """ try: tz = pytz.timezone(timezone) local_time = datetime.now(tz).strftime('%m/%d/%y %H:%M:%S') return f"The current local time in {timezone} is: {local_time}" except Exception as e: return f"Error fetching time for timezone '{timezone}': {str(e)}" @tool def language_detection(text:str)-> str: """Detects the language of the input text using basic xlm-roberta-base-language-detection. Args: text: the input message or wording to detect language from. """ model_ckpt = "papluca/xlm-roberta-base-language-detection" pipe = pipeline("text-classification", model=model_ckpt) preds = pipe(text, return_all_scores=True, truncation=True, max_length=128) if preds: pred = preds[0] language_probabilities_dict = {p["label"]: float(p["score"]) for p in pred} predicted_language_code = max(language_probabilities_dict, key=language_probabilities_dict.get) tool_prediction_confidence = language_probabilities_dict[predicted_language_code] confidence_str = f"Tool Confidence: {tool_prediction_confidence}" predicted_language_code_str = f"Predicted language code (ISO 639): {predicted_language_code}/n{confidence_str}" try: predicted_language = pycountry.languages.get(alpha_2=predicted_language_code) if predicted_language: predicted_language_str = f"Predicted language: {predicted_language.name}/n{confidence_str}" return predicted_language_str return predicted_language_code_str except Exception as e: return f"Error mapping country code to name (pycountry): {str(e)}/n{predicted_language_code_str}" else: return "None" @tool def advanced_image_generation(description:str)->Image.Image: """Generates an image using a textual description. Args: description: the textual description provided by the user to prompt a text-to-image model """ llm = OpenAI(temperature=0.9) prompt = PromptTemplate( input_variables=["image_desc"], template="Generate a detailed but short prompt (must be less than 900 characters) to generate an image based on the following description: {image_desc}", ) chain = LLMChain(llm=llm, prompt=prompt) image_url = DallEAPIWrapper().run(chain.run(description)) image_array = io.imread(image_url) pil_image = Image.fromarray(image_array) return pil_image @tool def calculate_cargo_travel_time( origin_coords: Tuple[float, float], destination_coords: Tuple[float, float], cruising_speed_kmh: Optional[float] = 750.0, # Average speed for cargo planes ) -> float: """ Calculate the travel time for a cargo plane between two points on Earth using great-circle distance. Args: origin_coords: Tuple of (latitude, longitude) for the starting point destination_coords: Tuple of (latitude, longitude) for the destination cruising_speed_kmh: Optional cruising speed in km/h (defaults to 750 km/h for typical cargo planes) Returns: float: The estimated travel time in hours Example: >>> # Chicago (41.8781° N, 87.6298° W) to Sydney (33.8688° S, 151.2093° E) >>> result = calculate_cargo_travel_time((41.8781, -87.6298), (-33.8688, 151.2093)) """ def to_radians(degrees: float) -> float: return degrees * (math.pi / 180) # Extract coordinates lat1, lon1 = map(to_radians, origin_coords) lat2, lon2 = map(to_radians, destination_coords) # Earth's radius in kilometers EARTH_RADIUS_KM = 6371.0 # Calculate great-circle distance using the haversine formula dlon = lon2 - lon1 dlat = lat2 - lat1 a = ( math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2 ) c = 2 * math.asin(math.sqrt(a)) distance = EARTH_RADIUS_KM * c # Add 10% to account for non-direct routes and air traffic controls actual_distance = distance * 1.1 # Calculate flight time # Add 1 hour for takeoff and landing procedures flight_time = (actual_distance / cruising_speed_kmh) + 1.0 # Format the results return round(flight_time, 2) @tool def browser_automation(original_user_query:str)->str: """ Browser automation is like “simulating a real user” and works for interactive, dynamic sites and when visual navigation is required to show the process to the user. Navigates the web using helium to answer a user query by appending helium_instructions to the original query by searching for text matches through the navigation. Args: original_user_query: The original """ # Use sys.executable to ensure the same Python interpreter is used. result = subprocess.run( [sys.executable, "vision_web_browser.py", original_user_query], capture_output=True, # Captures both stdout and stderr text=True # Returns output as a string instead of bytes ) print("vision_web_browser.py: ", result.stderr) return result.stdout text_to_speech_pipe = pipeline( task="text-to-speech", model="suno/bark-small", device = 0 if torch.cuda.is_available() else "cpu", torch_dtype=torch.float16, ) text_to_speech_pipe.model.enable_cpu_offload() text_to_speech_pipe.model.use_flash_attention_2=True text_to_speech_pipe.model.pad_token_id=0 # 50257 tokenizer = AutoTokenizer.from_pretrained("suno/bark-small") #print("suno/bark-small tokenizer pad_token_id: ", tokenizer.pad_token_id) # 0 #print("suno/bark-small tokenizer eos_token_id: ", tokenizer.eos_token_id) # none text_to_speech_pipe.model.pad_token_id = tokenizer.pad_token_id text_to_speech_pipe.model.eos_token_id = tokenizer.eos_token_id def speech_to_text(final_answer_text, agent_memory): text = f"[clears throat] Here is the final answer: {final_answer_text}" # attention_mask = [1] * len(text.split()) # Create an attention mask for your text # Run the pipeline with the attention mask output = text_to_speech_pipe(text) # display(Audio(output["audio"], rate=output["sampling_rate"])) # notebook audio = np.array(output["audio"], dtype=np.float32) print("Original audio shape:", audio.shape) # Adjust audio shape if necessary: if audio.ndim == 1: # Mono audio, should be fine. You can check if your device expects stereo. print("Mono audio... should be fine. You can check if your device expects stereo.") elif audio.ndim == 2: # Check if the number of channels is acceptable (e.g., 1 or 2) channels = audio.shape[1] if channels not in [1, 2]: # Try to squeeze extra dimensions audio = np.squeeze(audio) print("Squeezed audio shape:", audio.shape) else: # If audio has more dimensions than expected, flatten or reshape as needed audio = np.squeeze(audio) print("Squeezed audio shape:", audio.shape) # Play the audio using sounddevice try: sd.play(audio, output["sampling_rate"]) sd.wait() # Wait until audio playback is complete except Exception as e: print(f"Error playing audio: {e}") return True def initialize_langfuse_opentelemetry_instrumentation(): LANGFUSE_PUBLIC_KEY=os.environ.get("LANGFUSE_PUBLIC_KEY") LANGFUSE_SECRET_KEY=os.environ.get("LANGFUSE_SECRET_KEY") LANGFUSE_AUTH=base64.b64encode(f"{LANGFUSE_PUBLIC_KEY}:{LANGFUSE_SECRET_KEY}".encode()).decode() os.environ["OTEL_EXPORTER_OTLP_ENDPOINT"] = "https://cloud.langfuse.com/api/public/otel" # EU data region os.environ["OTEL_EXPORTER_OTLP_HEADERS"] = f"Authorization=Basic {LANGFUSE_AUTH}" trace_provider = TracerProvider() trace_provider.add_span_processor(SimpleSpanProcessor(OTLPSpanExporter())) SmolagentsInstrumentor().instrument(tracer_provider=trace_provider) # telemetry initialize_langfuse_opentelemetry_instrumentation() # load tools from /tools/ final_answer = FinalAnswerTool() visit_webpage = VisitWebpageTool() translation = TranslationTool() best_model_for_task = HFModelDownloadsTool() transformers_retriever = retriever_tool # load tools from smoloagents library google_web_search = GoogleSearchTool() # provider="serper" (SERPER_API_KEY) or "serpapi" (default) google_web_search.name = "google_web_search" duckduckgo_web_search = DuckDuckGoSearchTool() duckduckgo_web_search.name = "duckduckgo_web_search" # load tools from hub and langchain # image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True) image_generation_tool = load_tool("m-ric/text-to-image", trust_remote_code=True) # Tool.from_space("black-forest-labs/FLUX.1-schnell", name="image_generator", description="Generate an image from a prompt") advanced_search_tool = Tool.from_langchain(load_tools(["searchapi"], allow_dangerous_tools=True)[0]) # serpapi is not real time scrapping advanced_search_tool.name = "advanced_search_tool" image_generation_tool_fast = Tool.from_space( "black-forest-labs/FLUX.1-schnell", name="image_generator", description="Generate an image from a prompt" ) ceo_model = load_model("LiteLLMModel", "gpt-4o") # or anthropic/claude-3-sonnet """ ceo_model = HfApiModel( max_tokens=2096, # 8096 for manager temperature=0.5, model_id= 'https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud', # "meta-llama/Llama-3.3-70B-Instruct", # 'https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud', # same as Qwen/Qwen2.5-Coder-32B-Instruct custom_role_conversions=None, ) """ with open("prompts.yaml", 'r') as stream: prompt_templates = yaml.safe_load(stream) tools = [ final_answer, best_model_for_task, advanced_search_tool, google_web_search, duckduckgo_web_search, visit_webpage, browser_automation, get_current_time_in_timezone, advanced_image_generation, image_generation_tool, transformers_retriever, language_detection, translation, calculate_cargo_travel_time ] agent = CodeAgent( model=ceo_model, tools=tools, max_steps=20, # 15 is good for a light manager, too much when there is no need of a manager verbosity_level=2, grammar=None, # planning_interval=5, # (add more steps for heavier reasoning, leave default if not manager) # test for crashing issues. name="Alfredo", description="CEO", prompt_templates=prompt_templates, # executor_type="e2b", # security, could also be "docker" (set keys) # sandbox=E2BSandbox() (or E2BExecutor?), # step_callbacks=[save_screenshot], # todo: configure the web_navigation agent as a separate agent and manage it with alfred final_answer_checks=[speech_to_text], additional_authorized_imports=[ "geopandas", "plotly", "shapely", "json", "pandas", "numpy", "requests", "helium", "bs4" ], # I could also add the authorized_imports from a LIST_SAFE_MODULES ) agent.python_executor("from helium import *") # agent.state # agent.push_to_hub('laverdes/Alfredo') agent.visualize() # prompt = ("navigate to a random wikipedia page and give me a summary of the content, then make a single image representing all the content") # agent.run(prompt) GradioUI(agent).launch()