#!/bin/bash
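#
# Runs TheAgentCompany benchmark evaluation end to end: downloads the list of
# task images (tasks.md) for the requested release, pulls each task image,
# runs the evaluation through run_infer.py, and cleans up Docker resources
# between tasks.
#
# Optional flags (defaults are set below):
#   --agent-llm-config   LLM config name for the agent
#   --env-llm-config     LLM config name for the environment
#   --outputs-path       directory where evaluation outputs are written
#   --server-hostname    server hostname forwarded to run_infer.py
#   --version            TheAgentCompany release version used to download tasks.md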

# Exit immediately if any command fails; additionally trace commands when DEBUG is set
set -e
if [ -n "$DEBUG" ]; then
    set -x
fi

# Default values (each can be overridden via the flags parsed below)
AGENT_LLM_CONFIG="agent"
ENV_LLM_CONFIG="env"
OUTPUTS_PATH="outputs"
SERVER_HOSTNAME="localhost"
VERSION="1.0.0"

# Parse named command-line arguments
while [[ $# -gt 0 ]]; do
    case "$1" in
        --agent-llm-config)
            AGENT_LLM_CONFIG="$2"
            shift 2
            ;;
        --env-llm-config)
            ENV_LLM_CONFIG="$2"
            shift 2
            ;;
        --outputs-path)
            OUTPUTS_PATH="$2"
            shift 2
            ;;
        --server-hostname)
            SERVER_HOSTNAME="$2"
            shift 2
            ;;
        --version)
            VERSION="$2"
            shift 2
            ;;
        *)
            echo "Unknown argument: $1"
            exit 1
            ;;
    esac
done

# Convert OUTPUTS_PATH to an absolute path if a relative path was given
if [[ ! "$OUTPUTS_PATH" = /* ]]; then
    OUTPUTS_PATH="$(cd "$(dirname "$OUTPUTS_PATH")" 2>/dev/null && pwd)/$(basename "$OUTPUTS_PATH")"
fi

echo "Using agent LLM config: $AGENT_LLM_CONFIG"
echo "Using environment LLM config: $ENV_LLM_CONFIG"
echo "Outputs path: $OUTPUTS_PATH"
echo "Server hostname: $SERVER_HOSTNAME"
echo "Version: $VERSION"

# Download the list of task images for this release
echo "Downloading tasks.md..."
rm -f tasks.md
wget "https://github.com/TheAgentCompany/TheAgentCompany/releases/download/${VERSION}/tasks.md"
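
# Sanity check: stop early if the task list failed to download or is empty
if [ ! -s tasks.md ]; then
    echo "Failed to download tasks.md for version ${VERSION}"
    exit 1
fi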

# Iterate over every task image listed in tasks.md
while IFS= read -r task_image; do
    docker pull "$task_image"

    # Strip the registry prefix and the "-image:<tag>" suffix to get the task name
    task_name=${task_image##ghcr.io/theagentcompany/}
    task_name=${task_name%-image:*}
    echo "Using task image $task_image, task name $task_name..."

    # Skip tasks that already have an evaluation result
    if [ -f "$OUTPUTS_PATH/eval_${task_name}-image.json" ]; then
        echo "Skipping $task_name - evaluation file already exists"
        continue
    fi

    export PYTHONPATH=evaluation/benchmarks/the_agent_company:$PYTHONPATH && \
    poetry run python run_infer.py \
        --agent-llm-config "$AGENT_LLM_CONFIG" \
        --env-llm-config "$ENV_LLM_CONFIG" \
        --outputs-path "$OUTPUTS_PATH" \
        --server-hostname "$SERVER_HOSTNAME" \
        --task-image-name "$task_image"

    # Clean up images and volumes to keep disk usage bounded between tasks
    docker image rm "$task_image"
    docker images "ghcr.io/all-hands-ai/runtime" -q | xargs -r docker rmi -f
    docker volume prune -f
    docker system prune -f
done < tasks.md

# Remove the downloaded task list and report completion
rm tasks.md

echo "All evaluations completed successfully!"