Which Agent is Best?

In [None]:
import os
from surf_spot_finder.utils.logging import get_logger
import pandas as pd

logger = get_logger()


def load_results(results_path: str = "./results.json") -> pd.DataFrame:
    """Load saved evaluation results into a DataFrame.

    Args:
        results_path: Path to a JSON Lines file (one JSON record per line).
            Defaults to ``./results.json`` so existing zero-argument calls
            keep reading the same file as before.

    Returns:
        A DataFrame with one row per record in the file, or an empty
        DataFrame (no rows, no columns) when the file does not exist.
    """
    if not os.path.exists(results_path):
        # A missing file is not an error: callers treat an empty frame as
        # "nothing evaluated yet".
        logger.info("No results found, skipping loading.")
        return pd.DataFrame()
    return pd.read_json(results_path, lines=True)

In [None]:
# Need nest_asyncio to run the evaluation in a notebook
from surf_spot_finder.evaluation.main import evaluate
import nest_asyncio

# Patch the running event loop so the evaluation can be run from a notebook.
nest_asyncio.apply()

test_case_path = "./test_cases/alpha.yaml"
configs = [
    "langchain-4o",
    "openai-4o",
    "smolagents-4o",
    "smolagents-4o-mini",
    "openai-4o-mini",
    "smolagents-o3-mini",
    "openai-o3-mini",
    "smolagents-o1",
    "openai-o1",
    "smolagents-ollama-llama3.1-8b-q4",
    "smolagents-ollama-llama3.1-8b-fp16",
]
results_df = load_results()

# Collect the already-evaluated config paths once, up front: results_df is
# not reloaded inside the loop, so re-filtering it on every iteration is
# redundant. The column check also covers the empty frame returned when no
# results file exists yet (it has no columns at all).
if "agent_config_path" in results_df.columns:
    evaluated_paths = set(results_df["agent_config_path"])
else:
    evaluated_paths = set()

for agent in configs:
    agent_config_path = f"./agent_configs/{agent}.yaml"
    # Skip configs that already have a row in the saved results.
    if agent_config_path in evaluated_paths:
        logger.info(f"Already evaluated {agent}")
        continue
    logger.info(f"Evaluating {agent}")
    # NOTE(review): presumably evaluate() persists its outcome to the file
    # read by load_results() — confirm against the evaluation module.
    evaluate(
        test_case_path=test_case_path,
        agent_config_path=agent_config_path,
        telemetry_path=None,
    )

In [9]:
df = load_results()

# Log a ranked summary table (config path and score), best score first.
logger.info("==========================")
logger.info("Summary:")
if df.empty:
    # load_results() returns a column-less frame when there is no results
    # file; sorting by "score" would raise KeyError, so report and move on.
    logger.info("No results to summarize.")
else:
    df = df.sort_values(by="score", ascending=False)
    summary = df[["agent_config_path", "score"]]
    logger.info(summary.to_string(index=False))
logger.info("==========================")

# For each result (in ranked order), log the config, the agent's output
# message and its score. An empty frame yields no iterations.
for index, row in df.iterrows():
    logger.info(f"Agent config: {row['agent_config_path']}")
    logger.info(row["output_message"])
    logger.info(f"Score: {row['score']}")



Summary:
                                    agent_config_path  score
                  ./agent_configs/openai-4o-mini.yaml  92.86
                       ./agent_configs/openai-o1.yaml  92.86
                       ./agent_configs/openai-4o.yaml  85.71
                   ./agent_configs/smolagents-4o.yaml  85.71
                  ./agent_configs/openai-o3-mini.yaml  85.71
                   ./agent_configs/smolagents-o1.yaml  85.71
                    ./agent_configs/langchain-4o.yaml  57.14
              ./agent_configs/smolagents-4o-mini.yaml  57.14
              ./agent_configs/smolagents-o3-mini.yaml  50.00
./agent_configs/smolagents-ollama-llama3.1-8b-q4.yaml   0.00
Agent config: ./agent_configs/openai-4o-mini.yaml
[33mHypothesis Final answer extracted: ### Surf Location: T Street Beach

- **Coordinates:** 33.416044, -117.617257  
- **Wave Height:** 1.34 meters  
- **Wave Direction:** 256 degrees  
- **Wave Period:** 10.25 seconds  
- **Wind Speed:** 13.6 m/s from the SW (212 deg