github-actions[bot] committed
Commit 27f8cfc · 1 Parent(s): 980d57f
Sync with https://github.com/mozilla-ai/surf-spot-finder
Files changed:
- app.py +122 -12
- components/__init__.py +0 -0
- components/inputs.py +156 -0
- components/sidebar.py +9 -0
- constants.py +52 -0
- services/__init__.py +0 -0
- services/agent.py +168 -0
app.py
CHANGED
@@ -1,39 +1,149 @@
+from components.sidebar import ssf_sidebar
+from constants import DEFAULT_TOOLS
 import streamlit as st
 import asyncio
 import nest_asyncio
-from
-
-
+from services.agent import (
+    configure_agent,
+    display_evaluation_results,
+    display_output,
+    evaluate_agent,
+    run_agent,
+)
 
 nest_asyncio.apply()
 
 # Set page config
 st.set_page_config(page_title="Surf Spot Finder", page_icon="🏄", layout="wide")
 
-#
-st.title("🏄 Surf Spot Finder")
+# Allow a user to resize the sidebar to take up most of the screen to make editing eval cases easier
 st.markdown(
-    "
+    """
+    <style>
+    /* When sidebar is expanded, adjust main content */
+    section[data-testid="stSidebar"][aria-expanded="true"] {
+        max-width: 99% !important;
+    }
+    </style>
+    """,
+    unsafe_allow_html=True,
 )
 
-# Sidebar
 with st.sidebar:
-
-    st.markdown("Built using [Any-Agent](https://github.com/mozilla-ai/any-agent)")
-    user_inputs = get_user_inputs()
+    user_inputs = ssf_sidebar()
     is_valid = user_inputs is not None
-    run_button = st.button("Run", disabled=not is_valid, type="primary")
+    run_button = st.button("Run Agent 🤖", disabled=not is_valid, type="primary")
 
 
 # Main content
 async def main():
+    # Handle agent execution button click
     if run_button:
-        await
+        agent, agent_config = await configure_agent(user_inputs)
+        agent_trace, execution_time = await run_agent(agent, agent_config)
+
+        await display_output(agent_trace, execution_time)
+
+        evaluation_result = await evaluate_agent(agent_config, agent_trace)
+
+        await display_evaluation_results(evaluation_result)
     else:
+        st.title("🏄 Surf Spot Finder")
+        st.markdown(
+            "Find the best surfing spots based on your location and preferences! [Github Repo](https://github.com/mozilla-ai/surf-spot-finder)"
+        )
         st.info(
             "🏄 Configure your search parameters in the sidebar and click Run to start!"
         )
 
+        # Display tools in a more organized way
+        st.markdown("### 🛠️ Available Tools")
+
+        st.markdown("""
+        The AI Agent built for this project has a few tools available for use in order to find the perfect surf spot.
+        The agent is given the freedom to use (or not use) these tools in order to accomplish the task.
+        """)
+
+        weather_tools = [
+            tool
+            for tool in DEFAULT_TOOLS
+            if "forecast" in tool.__name__ or "weather" in tool.__name__
+        ]
+        for tool in weather_tools:
+            with st.expander(f"🌤️ {tool.__name__}"):
+                st.markdown(tool.__doc__ or "No description available")
+        location_tools = [
+            tool
+            for tool in DEFAULT_TOOLS
+            if "lat" in tool.__name__
+            or "lon" in tool.__name__
+            or "area" in tool.__name__
+        ]
+        for tool in location_tools:
+            with st.expander(f"📍 {tool.__name__}"):
+                st.markdown(tool.__doc__ or "No description available")
+
+        web_tools = [
+            tool
+            for tool in DEFAULT_TOOLS
+            if "web" in tool.__name__ or "search" in tool.__name__
+        ]
+        for tool in web_tools:
+            with st.expander(f"🌐 {tool.__name__}"):
+                st.markdown(tool.__doc__ or "No description available")
+
+        # add a check that all tools were listed
+        if len(weather_tools) + len(location_tools) + len(web_tools) != len(
+            DEFAULT_TOOLS
+        ):
+            st.warning(
+                "Some tools are not listed. Please check the code for more details."
+            )
+
+        # Add Custom Evaluation explanation section
+        st.markdown("### 📊 Custom Evaluation")
+        st.markdown("""
+        The Surf Spot Finder includes a powerful evaluation system that allows you to customize how the agent's performance is assessed.
+        You can find these settings in the sidebar under the "Custom Evaluation" expander.
+        """)
+
+        with st.expander("Learn more about Custom Evaluation"):
+            st.markdown("""
+            #### What is Custom Evaluation?
+            The Custom Evaluation feature uses an LLM-as-a-Judge approach to evaluate how well the agent performs its task.
+            An LLM will be given the complete agent trace (not just the final answer), and will assess the agent's performance based on the criteria you set.
+            You can customize:
+
+            - **Evaluation Model**: Choose which LLM should act as the judge
+            - **Evaluation Criteria**: Define specific checkpoints that the agent should meet
+            - **Scoring System**: Assign points to each criterion
+
+            #### How to Use Custom Evaluation
+
+            1. **Select an Evaluation Model**: Choose which LLM you want to use as the judge
+            2. **Edit Checkpoints**: Use the data editor to:
+               - Add new evaluation criteria
+               - Modify existing criteria
+               - Adjust point values
+               - Remove criteria you don't want to evaluate
+
+            #### Example Criteria
+            You can evaluate things like:
+            - Tool usage and success
+            - Order of operations
+            - Quality of final recommendations
+            - Response completeness
+            - Number of steps taken
+
+            #### Tips for Creating Good Evaluation Criteria
+            - Be specific about what you want to evaluate
+            - Use clear, unambiguous language
+            - Consider both process (how the agent works) and outcome (what it produces)
+            - Assign appropriate point values based on importance
+
+            The evaluation results will be displayed after each agent run, showing how well the agent met your custom criteria.
+            """)
+
 
 if __name__ == "__main__":
     loop = asyncio.new_event_loop()
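Aside: the new tool listing in app.py buckets DEFAULT_TOOLS into weather, location, and web groups by substring-matching each function's __name__, and warns if any tool falls through. A quick illustrative check, not part of the commit, using the five tool names from the constants.py diff further down:

    # Mirror the bucket checks from app.py on the default tool names.
    names = ["get_wind_forecast", "get_wave_forecast", "get_area_lat_lon", "search_web", "visit_webpage"]
    weather = [n for n in names if "forecast" in n or "weather" in n]         # wind + wave forecasts
    location = [n for n in names if "lat" in n or "lon" in n or "area" in n]  # get_area_lat_lon
    web = [n for n in names if "web" in n or "search" in n]                   # search_web, visit_webpage
    assert len(weather) + len(location) + len(web) == len(names)  # 2 + 1 + 2 == 5, so the warning never fires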
components/__init__.py
ADDED
File without changes
components/inputs.py
ADDED
@@ -0,0 +1,156 @@
from datetime import datetime, timedelta
import json
import requests
import streamlit as st
from any_agent import AgentFramework
from any_agent.tracing.trace import _is_tracing_supported
from any_agent.evaluation import EvaluationCase
from any_agent.evaluation.schemas import CheckpointCriteria
import pandas as pd
from constants import DEFAULT_EVALUATION_CASE, MODEL_OPTIONS

from pydantic import BaseModel, ConfigDict


class UserInputs(BaseModel):
    model_config = ConfigDict(extra="forbid")
    model_id: str
    location: str
    max_driving_hours: int
    date: datetime
    framework: str
    evaluation_case: EvaluationCase
    run_evaluation: bool


@st.cache_resource
def get_area(area_name: str) -> dict:
    """Get the area from Nominatim.

    Uses the [Nominatim API](https://nominatim.org/release-docs/develop/api/Search/).

    Args:
        area_name (str): The name of the area.

    Returns:
        dict: The area found.
    """
    response = requests.get(
        f"https://nominatim.openstreetmap.org/search?q={area_name}&format=json",
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=5,
    )
    response.raise_for_status()
    response_json = json.loads(response.content.decode())
    return response_json


def get_user_inputs() -> UserInputs:
    default_val = "Los Angeles California, US"

    col1, col2 = st.columns([3, 1])
    with col1:
        location = st.text_input("Enter a location", value=default_val)
    with col2:
        if location:
            location_check = get_area(location)
            if not location_check:
                st.error("❌")
            else:
                st.success("✅")

    max_driving_hours = st.number_input(
        "Enter the maximum driving hours", min_value=1, value=2
    )

    col_date, col_time = st.columns([2, 1])
    with col_date:
        date = st.date_input(
            "Select a date in the future", value=datetime.now() + timedelta(days=1)
        )
    with col_time:
        # default to 9am
        time = st.selectbox(
            "Select a time",
            [datetime.strptime(f"{i:02d}:00", "%H:%M").time() for i in range(24)],
            index=9,
        )
    date = datetime.combine(date, time)

    supported_frameworks = [
        framework for framework in AgentFramework if _is_tracing_supported(framework)
    ]

    framework = st.selectbox(
        "Select the agent framework to use",
        supported_frameworks,
        index=3,
        format_func=lambda x: x.name,
    )

    model_id = st.selectbox(
        "Select the model to use",
        MODEL_OPTIONS,
        index=0,
        format_func=lambda x: "/".join(x.split("/")[-3:]),
    )

    # Add evaluation case section
    with st.expander("Custom Evaluation"):
        evaluation_model_id = st.selectbox(
            "Select the model to use for LLM-as-a-Judge evaluation",
            MODEL_OPTIONS,
            index=2,
            format_func=lambda x: "/".join(x.split("/")[-3:]),
        )
        evaluation_case = DEFAULT_EVALUATION_CASE
        evaluation_case.llm_judge = evaluation_model_id
        # make this an editable json section
        # convert the checkpoints to a df series so that it can be edited
        checkpoints = evaluation_case.checkpoints
        checkpoints_df = pd.DataFrame(
            [checkpoint.model_dump() for checkpoint in checkpoints]
        )
        checkpoints_df = st.data_editor(
            checkpoints_df,
            column_config={
                "points": st.column_config.NumberColumn(label="Points"),
                "criteria": st.column_config.TextColumn(label="Criteria"),
            },
            hide_index=True,
            num_rows="dynamic",
        )
        # for each checkpoint, convert it back to a CheckpointCriteria object
        new_ckpts = []

        # don't let a user add more than 20 checkpoints
        if len(checkpoints_df) > 20:
            st.error(
                "You can only add up to 20 checkpoints for the purpose of this demo."
            )
            checkpoints_df = checkpoints_df[:20]

        for _, row in checkpoints_df.iterrows():
            if row["criteria"] == "":
                continue
            try:
                # Don't let people write essays for criteria in this demo
                if len(row["criteria"].split(" ")) > 100:
                    raise ValueError("Criteria is too long")
                new_crit = CheckpointCriteria(
                    criteria=row["criteria"], points=row["points"]
                )
                new_ckpts.append(new_crit)
            except Exception as e:
                st.error(f"Error creating checkpoint: {e}")
        evaluation_case.checkpoints = new_ckpts

    return UserInputs(
        model_id=model_id,
        location=location,
        max_driving_hours=max_driving_hours,
        date=date,
        framework=framework,
        evaluation_case=evaluation_case,
        run_evaluation=st.checkbox("Run Evaluation", value=True),
    )
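Aside: get_area above is a thin wrapper over the public Nominatim search endpoint. A standalone sketch of the same lookup, shown outside Streamlit for illustration only (the example query string is made up; the display_name/lat/lon fields come from Nominatim's documented JSON response):

    import requests

    # Same endpoint and headers as get_area uses.
    resp = requests.get(
        "https://nominatim.openstreetmap.org/search",
        params={"q": "Santa Cruz, California", "format": "json"},
        headers={"User-Agent": "Mozilla/5.0"},
        timeout=5,
    )
    resp.raise_for_status()
    places = resp.json()  # list of candidate places; an empty list means the location check fails
    if places:
        print(places[0]["display_name"], places[0]["lat"], places[0]["lon"])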
components/sidebar.py
ADDED
@@ -0,0 +1,9 @@
from components.inputs import UserInputs, get_user_inputs
import streamlit as st


def ssf_sidebar() -> UserInputs:
    st.markdown("### Configuration")
    st.markdown("Built using [Any-Agent](https://github.com/mozilla-ai/any-agent)")
    user_inputs = get_user_inputs()
    return user_inputs
constants.py
CHANGED
@@ -1,3 +1,11 @@
+from any_agent.evaluation import EvaluationCase
+from surf_spot_finder.tools import (
+    get_area_lat_lon,
+    get_wave_forecast,
+    get_wind_forecast,
+)
+from any_agent.tools.web_browsing import search_web, visit_webpage
+
 MODEL_OPTIONS = [
     # "huggingface/novita/deepseek-ai/DeepSeek-V3",
     # "huggingface/novita/meta-llama/Llama-3.3-70B-Instruct",
@@ -13,3 +21,47 @@ MODEL_OPTIONS = [
 
 # Hugginface API Provider Error:
 # Must alternate between assistant/user, which meant that the 'tool' role made it puke
+
+
+DEFAULT_EVALUATION_CASE = EvaluationCase(
+    llm_judge=MODEL_OPTIONS[0],
+    checkpoints=[
+        {
+            "criteria": "Check if the agent considered at least three surf spot options",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the agent gathered wind and wave forecasts for each surf spot being evaluated.",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the agent used any web search tools to explore which surf spots should be considered",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the final answer contains any description about the weather (air temp, chance of rain, etc) at the chosen location",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the final answer includes one of the surf spots evaluated by tools",
+            "points": 1,
+        },
+        {
+            "criteria": "Check if the final answer includes information about some alternative surf spots if the user is not satisfied with the chosen one",
+            "points": 1,
+        },
+        {
+            "criteria": "Check that the agent completed in fewer than 10 calls",
+            "points": 1,
+        },
+    ],
+)
+
+
+DEFAULT_TOOLS = [
+    get_wind_forecast,
+    get_wave_forecast,
+    get_area_lat_lon,
+    search_web,
+    visit_webpage,
+]
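Aside: each default checkpoint is worth one point, so the maximum score the judge can award with these defaults is 7. A small sanity-check sketch (it assumes EvaluationCase validates the checkpoint dicts into objects exposing a points attribute, which is how the rest of this diff treats them):

    from constants import DEFAULT_EVALUATION_CASE, DEFAULT_TOOLS

    # Total available points across the default checkpoints (7) and the number of default tools (5).
    max_points = sum(cp.points for cp in DEFAULT_EVALUATION_CASE.checkpoints)
    print(max_points, len(DEFAULT_TOOLS))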
services/__init__.py
ADDED
File without changes
services/agent.py
ADDED
@@ -0,0 +1,168 @@
import json
from components.inputs import UserInputs
from constants import DEFAULT_TOOLS
import streamlit as st
import time
from surf_spot_finder.config import Config
from any_agent import AgentConfig, AnyAgent, TracingConfig
from any_agent.tracing.trace import AgentTrace, TotalTokenUseAndCost
from any_agent.tracing.otel_types import StatusCode
from any_agent.evaluation import evaluate, TraceEvaluationResult


async def display_evaluation_results(result: TraceEvaluationResult):
    all_results = (
        result.checkpoint_results
        + result.hypothesis_answer_results
        + result.direct_results
    )

    # Create columns for better layout
    col1, col2 = st.columns(2)

    with col1:
        st.markdown("#### Criteria Results")
        for checkpoint in all_results:
            if checkpoint.passed:
                st.success(f"✅ {checkpoint.criteria}")
            else:
                st.error(f"❌ {checkpoint.criteria}")

    with col2:
        st.markdown("#### Overall Score")
        total_points = sum([result.points for result in all_results])
        if total_points == 0:
            msg = "Total points is 0, cannot calculate score."
            raise ValueError(msg)
        passed_points = sum([result.points for result in all_results if result.passed])

        # Create a nice score display
        st.markdown(f"### {passed_points}/{total_points}")
        percentage = (passed_points / total_points) * 100
        st.progress(percentage / 100)
        st.markdown(f"**{percentage:.1f}%**")


@st.cache_resource
async def evaluate_agent(
    config: Config, agent_trace: AgentTrace
) -> TraceEvaluationResult:
    assert (
        len(config.evaluation_cases) == 1
    ), "Only one evaluation case is supported in the demo"
    st.markdown("### 📊 Evaluation Results")

    with st.spinner("Evaluating results..."):
        case = config.evaluation_cases[0]
        result: TraceEvaluationResult = evaluate(
            evaluation_case=case,
            trace=agent_trace,
            agent_framework=config.framework,
        )
    return result


async def configure_agent(user_inputs: UserInputs) -> tuple[AnyAgent, Config]:
    if "huggingface" in user_inputs.model_id:
        model_args = {
            "extra_headers": {"X-HF-Bill-To": "mozilla-ai"},
            "temperature": 0.0,
        }
    else:
        model_args = {}

    agent_config = AgentConfig(
        model_id=user_inputs.model_id,
        model_args=model_args,
        tools=DEFAULT_TOOLS,
    )

    config = Config(
        location=user_inputs.location,
        max_driving_hours=user_inputs.max_driving_hours,
        date=user_inputs.date,
        framework=user_inputs.framework,
        main_agent=agent_config,
        managed_agents=[],
        evaluation_cases=[user_inputs.evaluation_case],
    )

    agent = await AnyAgent.create_async(
        agent_framework=config.framework,
        agent_config=config.main_agent,
        managed_agents=config.managed_agents,
        tracing=TracingConfig(console=True, cost_info=True),
    )
    return agent, config


async def display_output(agent_trace: AgentTrace, execution_time: float):
    cost: TotalTokenUseAndCost = agent_trace.get_total_cost()
    with st.expander("### 📊 Results", expanded=True):
        time_col, cost_col, tokens_col = st.columns(3)
        with time_col:
            st.info(f"⏱️ Execution Time: {execution_time:.2f} seconds")
        with cost_col:
            st.info(f"💰 Estimated Cost: ${cost.total_cost:.6f}")
        with tokens_col:
            st.info(f"📦 Total Tokens: {cost.total_tokens:,}")
        st.markdown("#### Final Output")
        st.info(agent_trace.final_output)

    # Display the agent trace in a more organized way
    with st.expander("### 🧩 Agent Trace"):
        for span in agent_trace.spans:
            # Header with name and status
            col1, col2 = st.columns([4, 1])
            with col1:
                st.markdown(f"**{span.name}**")
                if span.attributes:
                    # st.json(span.attributes, expanded=False)
                    if "input.value" in span.attributes:
                        try:
                            input_value = json.loads(span.attributes["input.value"])
                            if isinstance(input_value, list) and len(input_value) > 0:
                                st.write(f"Input: {input_value[-1]}")
                            else:
                                st.write(f"Input: {input_value}")
                        except Exception:  # noqa: E722
                            st.write(f"Input: {span.attributes['input.value']}")
                    if "output.value" in span.attributes:
                        try:
                            output_value = json.loads(span.attributes["output.value"])
                            if isinstance(output_value, list) and len(output_value) > 0:
                                st.write(f"Output: {output_value[-1]}")
                            else:
                                st.write(f"Output: {output_value}")
                        except Exception:  # noqa: E722
                            st.write(f"Output: {span.attributes['output.value']}")
            with col2:
                status_color = (
                    "green" if span.status.status_code == StatusCode.OK else "red"
                )
                st.markdown(
                    f"<span style='color: {status_color}'>● {span.status.status_code.name}</span>",
                    unsafe_allow_html=True,
                )


@st.cache_resource
async def run_agent(agent, config) -> tuple[AgentTrace, float]:
    st.markdown("#### 🔍 Running Surf Spot Finder with query")

    query = config.input_prompt_template.format(
        LOCATION=config.location,
        MAX_DRIVING_HOURS=config.max_driving_hours,
        DATE=config.date,
    )

    st.code(query, language="text")

    start_time = time.time()
    with st.spinner("🤖 Analyzing surf spots..."):
        agent_trace: AgentTrace = await agent.run_async(query)
        agent.exit()

    end_time = time.time()
    execution_time = end_time - start_time
    return agent_trace, execution_time
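Aside: these service functions are composed by app.py's main(), roughly in the order below. This is a condensed sketch of the flow already shown in the app.py diff above, not additional behavior; user_inputs would come from ssf_sidebar():

    import asyncio
    from services.agent import (
        configure_agent,
        run_agent,
        display_output,
        evaluate_agent,
        display_evaluation_results,
    )

    async def run_once(user_inputs):
        # configure -> run -> display -> evaluate, as wired in app.py
        agent, config = await configure_agent(user_inputs)
        agent_trace, execution_time = await run_agent(agent, config)
        await display_output(agent_trace, execution_time)
        result = await evaluate_agent(config, agent_trace)
        await display_evaluation_results(result)

    # e.g. asyncio.new_event_loop().run_until_complete(run_once(user_inputs))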