XanderJC committed on
Commit
2cedb9d
·
1 Parent(s): dcaf5c9

default endpoint

Browse files
src/proxy_lite/agents/__init__.py CHANGED
@@ -1,7 +1,6 @@
1
  from typing import Union
2
 
3
  from .agent_base import Agents, BaseAgent, BaseAgentConfig
4
- from .browser_agent import BrowserAgent, BrowserAgentConfig
5
  from .proxy_lite_agent import ProxyLiteAgent, ProxyLiteAgentConfig
6
 
7
  AgentTypes = Union[*list(Agents._agent_registry.values())]
@@ -14,8 +13,6 @@ __all__ = [
14
  "Agents",
15
  "BaseAgent",
16
  "BaseAgentConfig",
17
- "BrowserAgent",
18
- "BrowserAgentConfig",
19
  "ProxyLiteAgent",
20
  "ProxyLiteAgentConfig",
21
  ]
 
1
  from typing import Union
2
 
3
  from .agent_base import Agents, BaseAgent, BaseAgentConfig
 
4
  from .proxy_lite_agent import ProxyLiteAgent, ProxyLiteAgentConfig
5
 
6
  AgentTypes = Union[*list(Agents._agent_registry.values())]
 
13
  "Agents",
14
  "BaseAgent",
15
  "BaseAgentConfig",
 
 
16
  "ProxyLiteAgent",
17
  "ProxyLiteAgentConfig",
18
  ]
src/proxy_lite/agents/browser_agent.py DELETED
@@ -1,133 +0,0 @@
1
- from datetime import datetime
2
- from functools import cached_property
3
- from typing import Literal
4
-
5
- from pydantic import Field
6
-
7
- from proxy_lite.agents.agent_base import Agents, BaseAgent, BaseAgentConfig
8
- from proxy_lite.history import MessageHistory, MessageLabel, SystemMessage, Text
9
- from proxy_lite.tools import Tool
10
-
11
- BROWSER_AGENT_SYSTEM_PROMPT = """ **You are Proxy Lite, the Web-Browsing Agent.** You are developed by Convergence.
12
-
13
- **Current date:** {date_time_with_day}.
14
-
15
- You are given:
16
-
17
- 1. A user task that you are trying to complete.
18
- 2. Relevant facts we have at our disposal.
19
- 3. A high level plan to complete the task.
20
- 4. A history of previous actions and observations.
21
- 5. An annotated webpage screenshot and text description of what's visible in the browser before and after the last action.
22
-
23
- ## Objective
24
-
25
- You are an expert at controlling the web browser.
26
- You will be assisting a user with a task they are trying to complete on the web.
27
-
28
- ## Web Screenshots
29
-
30
- Each iteration of your browsing loop, you'll be provided with a screenshot of the browser.
31
-
32
- The screenshot will have red rectangular annotations. These annotations highlight the marked elements you can interact with.
33
-
34
- ## Mark IDs
35
-
36
- Each annotated element is labeled with a "mark id" in the top-left corner.
37
-
38
- When using tools like typing or clicking, specify the "mark id" to indicate which element you want to interact with.
39
-
40
- If an element is not annotated, you cannot interact with it. This is a limitation of the software. Focus on marked elements only.
41
-
42
- ## Text Snippets
43
-
44
- Along with the screenshot, you will receive text snippets describing each annotated element.
45
-
46
- Here’s an example of different element types:
47
-
48
- - [0] `<a>text</a>` → Mark 0 is a link (`<a>` tag) containing the text "text".
49
- - [1] `<button>text</button>` → Mark 1 is a button (`<button>` tag) containing the text "text".
50
- - [2] `<input value="text"/>` → Mark 2 is an input field (`<input>` tag) with the value "text".
51
- - [3] `<select>text</select>` → Mark 3 is a dropdown menu (`<select>` tag) with the option "text" selected.
52
- - [4] `<textarea>text</textarea>` → Mark 4 is a text area (`<textarea>` tag) containing the text "text".
53
- - [5] `<li>text</li>` → Mark 5 is a list item (`<li>` tag) containing the text "text".
54
- - [6] `<div scrollable>text</div>` → Mark 6 is a division (`<div>` tag) containing the text "text" and is scrollable.
55
- - [7] `<td>text</td>` → Mark 7 is a table cell (`<td>` tag) containing the text "text".
56
-
57
- Note that these text snippets may be incomplete.
58
-
59
- ## History
60
-
61
- You will see your past actions and observations but not old annotated webpages.
62
-
63
- This means annotated webpages showing useful information will not be visible in future actions.
64
-
65
- To get around this, key details from each webpage are stored in observations.
66
-
67
- ## Web Browser Actions
68
-
69
- You can only take the following actions with the web browser:
70
- {tool_descriptions}
71
-
72
- ## Important Browsing Tips
73
-
74
- If there is a modal overlay that is unresponsive on the page try reloading the webpage.
75
-
76
- If there is a cookie consent form covering part of the page just click accept on the form.
77
-
78
- When typing into a text field be sure to click one of the dropdown options (when present). Not selecting a dropdown option will result in the field being cleared after the next action.
79
-
80
- You do not have access any internet accounts (outside of those provided by the user).
81
-
82
- The browser has a built in CAPTCHA solver, if you are asked to solve one just wait and it will be solved for you.
83
-
84
- ## Don't Repeat the Same Actions Continuously
85
-
86
- If you find yourself repeating an action without making progress, try another action.
87
-
88
- ## Task
89
-
90
- You will now be connected to the user, who will give you their task.""" # noqa: E501
91
-
92
- MAX_MESSAGES_FOR_CONTEXT_WINDOW = {
93
- MessageLabel.SCREENSHOT: 1,
94
- # MessageLabel.REASONING_INDUCTION: 1,
95
- # MessageLabel.FORMAT_INSTRUCTIONS: 1,
96
- # MessageLabel.ACTION: 1,
97
- }
98
-
99
-
100
- @Agents.register_agent_config("browser")
101
- class BrowserAgentConfig(BaseAgentConfig):
102
- name: Literal["browser"] = "browser"
103
- history_messages_limit: dict[MessageLabel, int] = Field(
104
- default_factory=lambda: MAX_MESSAGES_FOR_CONTEXT_WINDOW,
105
- )
106
-
107
-
108
- @Agents.register_agent("browser")
109
- class BrowserAgent(BaseAgent):
110
- config: BrowserAgentConfig
111
- message_label: MessageLabel = MessageLabel.AGENT_MODEL_RESPONSE
112
-
113
- def __init__(self, **data):
114
- super().__init__(**data)
115
-
116
- @property
117
- def system_prompt(self) -> str:
118
- return BROWSER_AGENT_SYSTEM_PROMPT.format(
119
- date_time_with_day=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
120
- tool_descriptions=self.tool_descriptions,
121
- memories="",
122
- )
123
-
124
- @cached_property
125
- def tools(self) -> list[Tool]:
126
- return self.env_tools
127
-
128
- async def get_history_view(self) -> MessageHistory:
129
- return MessageHistory(
130
- messages=[SystemMessage(content=[Text(text=self.system_prompt)])],
131
- ) + self.history.history_view(
132
- limits=self.config.history_messages_limit,
133
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/proxy_lite/configs/default.yaml CHANGED
@@ -13,8 +13,8 @@ solver:
13
  name: proxy_lite
14
  client:
15
  name: convergence
16
- model_id: convergence-ai/subset-distill-tools-7b-15-02-2025
17
- api_base: http://slurm1-a3nodeset-4-1:8002/v1
18
  local_view: true
19
  task_timeout: 1800
20
  verbose: true
 
13
  name: proxy_lite
14
  client:
15
  name: convergence
16
+ model_id: convergence-ai/proxy-lite
17
+ api_base: https://convergence-ai-demo-api.hf.space/v1
18
  local_view: true
19
  task_timeout: 1800
20
  verbose: true
src/proxy_lite/runner.py CHANGED
@@ -208,8 +208,8 @@ if __name__ == "__main__":
208
  "environment": {
209
  "name": "webbrowser",
210
  "homepage": "https://www.google.com",
211
- "viewport_width": 1920,
212
- "viewport_height": 1080,
213
  "screenshot_delay": 1,
214
  "headless": False,
215
  },
@@ -219,10 +219,8 @@ if __name__ == "__main__":
219
  "name": "proxy_lite",
220
  "client": {
221
  "name": "convergence",
222
- "model_id": "convergence-ai/all-distill-tools-7b-16-02-2025",
223
- "api_base": "http://slurm1-a3nodeset-4-1:8009/v1",
224
- # # "model_id": "Qwen/Qwen2.5-VL-3B-Instruct",
225
- # # "api_base": "http://0.0.0.0:8000/v1",
226
  },
227
  },
228
  },
@@ -236,10 +234,6 @@ if __name__ == "__main__":
236
  logger.info(f"🤖 [bold purple]Config:[/] {config}")
237
 
238
  runner = Runner(config=config)
239
- result = asyncio.run(
240
- runner.run(
241
- "Tell me the tesla stock price" # noqa: E501
242
- )
243
- )
244
  print(runner.run_result)
245
  print(runner.complete)
 
208
  "environment": {
209
  "name": "webbrowser",
210
  "homepage": "https://www.google.com",
211
+ "viewport_width": 1280,
212
+ "viewport_height": 1920,
213
  "screenshot_delay": 1,
214
  "headless": False,
215
  },
 
219
  "name": "proxy_lite",
220
  "client": {
221
  "name": "convergence",
222
+ "model_id": "convergence-ai/proxy-lite",
223
+ "api_base": "https://convergence-ai-demo-api.hf.space/v1",
 
 
224
  },
225
  },
226
  },
 
234
  logger.info(f"🤖 [bold purple]Config:[/] {config}")
235
 
236
  runner = Runner(config=config)
237
+ result = asyncio.run(runner.run("Tell me the tesla stock price."))
 
 
 
 
238
  print(runner.run_result)
239
  print(runner.complete)
src/proxy_lite/solvers/__init__.py CHANGED
@@ -4,7 +4,6 @@ from typing import Union
4
 
5
  from .simple_solver import SimpleSolver, SimpleSolverConfig
6
  from .solver_base import BaseSolver, BaseSolverConfig, Solvers
7
- from .structured_solver import StructuredSolver, StructuredSolverConfig
8
 
9
  SolverConfigTypes = Union[*Solvers._solver_config_registry.values()]
10
  SolverTypes = Union[*Solvers._solver_registry.values()]
@@ -15,8 +14,6 @@ __all__ = [
15
  "BaseSolverConfig",
16
  "SimpleSolver",
17
  "SimpleSolverConfig",
18
- "StructuredSolver",
19
- "StructuredSolverConfig",
20
  "SolverConfigTypes",
21
  "SolverTypes",
22
  "Solvers",
 
4
 
5
  from .simple_solver import SimpleSolver, SimpleSolverConfig
6
  from .solver_base import BaseSolver, BaseSolverConfig, Solvers
 
7
 
8
  SolverConfigTypes = Union[*Solvers._solver_config_registry.values()]
9
  SolverTypes = Union[*Solvers._solver_registry.values()]
 
14
  "BaseSolverConfig",
15
  "SimpleSolver",
16
  "SimpleSolverConfig",
 
 
17
  "SolverConfigTypes",
18
  "SolverTypes",
19
  "Solvers",
src/proxy_lite/solvers/structured_solver.py DELETED
@@ -1,178 +0,0 @@
1
- # ruff: noqa: E501
2
-
3
- from functools import cached_property
4
- from typing import Literal, Optional
5
-
6
- from pydantic import BaseModel, Field
7
-
8
- from proxy_lite.agents import AgentConfigTypes, Agents, BaseAgent
9
- from proxy_lite.environments.environment_base import Action, Observation
10
- from proxy_lite.history import (
11
- MessageHistory,
12
- MessageLabel,
13
- SystemMessage,
14
- )
15
- from proxy_lite.tools import Tool
16
-
17
- from .solver_base import BaseSolver, BaseSolverConfig, Solvers
18
-
19
- WEB_TOOL_TURN = """The browser action has been attempted. Please double check if the action was successful."""
20
- PLAN_USER_PROMPT = "First create a high-level plan to help solve the task on the web."
21
- ACTION_PROMPT = """Now take the most-promising next action in the browser.
22
-
23
- Only refer to the latest web elements from the latest screenshot.
24
-
25
- Using mark ids from older turns will lead to errors as they are no longer valid.
26
-
27
- Only interact with elements visible on the current webpage. Do not make up numbers or elements."""
28
- REASONING_PROMPT = """You will now follow these steps.
29
-
30
- 1. **Make observations about the state of the webpage**:
31
- - Consider the previous screenshot, your attempted previous action, and the current screenshot.
32
- - Describe any changes you observe, and try to determine if the previous action succeeded.
33
- - For example, if a form is being filled out, check whether the correct information is now displayed.
34
-
35
- 2. **Write down any helpful facts you have gathered**:
36
- - Describe any useful information on the webpage that might be helpful for completing the task.
37
- - For example, if you are viewing a document, you may wish to note down any information you want to refer back to later.
38
-
39
- 3. **Reason about the system's status**:
40
- - Have you fully completed the task?
41
-
42
- 4. **Select one of the following statuses**:
43
- - "complete": if the task has been completed.
44
- - "continue": if you are ready to continue without information or help.
45
-
46
- 5. **Reason through next steps**:
47
- - If the status is "continue", write down your reasoning for the next action you will take. You can only take one action at a time.
48
- - If the status is not "continue", return an empty string.
49
-
50
- 6. **Write a message to the user**:
51
- - If the status is "complete", write a message to the user. If they asked a question in the task, make sure the answer is here. Otherwise, just provide other useful information about how the task went or if there was a problem in completing it.
52
- - If the status is not "complete", set this to an empty string.
53
-
54
- Tips:
55
- - If you have already provided a response, don't provide it again.
56
- - If you notice you are repeating previous actions, you're likely stuck. Try something different."""
57
-
58
-
59
- class Reflection(BaseModel):
60
- observation: str = Field(
61
- ...,
62
- description="Observation of the current browser state, including an assessment on the success of the last action (previous actions and observations are often wrong).",
63
- )
64
- fact_updates: list[str] = Field(
65
- "",
66
- description="List of new information relevant to the task that was found on the page, ignore input fields holding content you wrote.",
67
- )
68
- status_reasoning: str = Field(
69
- ...,
70
- description="Reasoning about the current state of the task.",
71
- )
72
- status: Literal["complete", "continue"] = Field(
73
- ...,
74
- description="Choose a system status based on your status reasoning.",
75
- )
76
- next_step_reasoning: str = Field(
77
- ...,
78
- description='If status is "continue", reason through the next action you will be taking (do not repeat actions over and over). Otherwise set to "".',
79
- )
80
- ending_message: str = Field(
81
- ...,
82
- description="If status is 'complete', write a message to the user. If they asked a question in the task, make sure the answer is here. Otherwise, just provide other useful information about how the task went or if there was a problem in completing it. If status is 'continue', set to ''.",
83
- )
84
-
85
-
86
- @Solvers.register_solver_config("structured")
87
- class StructuredSolverConfig(BaseSolverConfig):
88
- name: Literal["structured"] = "structured"
89
- agent: AgentConfigTypes
90
- start_with_plan: bool = True
91
-
92
-
93
- @Solvers.register_solver("structured")
94
- class StructuredSolver(BaseSolver):
95
- task: Optional[str] = None
96
- complete: bool = False
97
-
98
- @cached_property
99
- def tools(self) -> list[Tool]:
100
- return self.env_tools
101
-
102
- @cached_property
103
- def local_tools(self) -> list[Tool]:
104
- if self.sandbox:
105
- return self.sandbox.tools
106
- return []
107
-
108
- @cached_property
109
- def agent(self) -> BaseAgent:
110
- self.logger.debug(f"Tools: {self.tools}")
111
- return Agents.get(self.config.agent.name)(
112
- config=self.config.agent,
113
- env_tools=self.tools,
114
- )
115
-
116
- @property
117
- def history(self) -> MessageHistory:
118
- return MessageHistory(
119
- messages=[SystemMessage.from_media(text=self.agent.system_prompt)] + self.agent.history.messages,
120
- )
121
-
122
- async def initialise(self, task: str, env_tools: list[Tool], env_info: str) -> None:
123
- self.env_tools = env_tools
124
- self.agent.receive_user_message(
125
- text=env_info,
126
- label=MessageLabel.USER_INPUT,
127
- )
128
- self.task = task
129
- self.agent.receive_user_message(
130
- text=f"Task: {task}",
131
- label=MessageLabel.USER_INPUT,
132
- )
133
- if self.config.start_with_plan:
134
- self.agent.receive_user_message(text=PLAN_USER_PROMPT, label=MessageLabel.PLAN)
135
- await self.agent.generate_output(use_tool=False)
136
-
137
- async def act(self, observation: Observation) -> Action:
138
- if observation.state.tool_responses:
139
- for tool_response in observation.state.tool_responses:
140
- await self.agent.receive_tool_message(
141
- text=f"{WEB_TOOL_TURN}\n{tool_response.content}",
142
- tool_id=tool_response.id,
143
- label=MessageLabel.TOOL_RESULT_INDUCTION,
144
- )
145
-
146
- self.agent.receive_user_message(
147
- image=observation.state.image,
148
- text=observation.state.text,
149
- label=MessageLabel.SCREENSHOT,
150
- is_base64=True,
151
- )
152
-
153
- self.agent.receive_user_message(
154
- text=REASONING_PROMPT,
155
- label=MessageLabel.REASONING_INDUCTION,
156
- )
157
-
158
- message = await self.agent.generate_structured_output(model=Reflection)
159
- self.logger.info(f"🌐 [bold blue]Observation:[/] {message.observation}")
160
-
161
- if message.status == "complete":
162
- self.complete = True
163
- return Action(tool_calls=[], text=message.ending_message)
164
-
165
- next_step = message.next_step_reasoning
166
-
167
- self.agent.receive_user_message(
168
- text=ACTION_PROMPT,
169
- label=MessageLabel.ACTION,
170
- is_base64=True,
171
- )
172
- message = await self.agent.generate_output(use_tool=True)
173
-
174
- return Action(tool_calls=message.tool_calls, text=next_step)
175
-
176
- async def is_complete(self, observation: Observation) -> bool:
177
- env_terminated = observation.terminated
178
- return self.complete or env_terminated