Spaces:

ar08
/

zzz

Runtime error

App Files Files Community

zzz / openhands /agenthub /codeact_agent /function_calling.py

ar08

Upload 1040 files

246d201 verified 2 months ago

raw

history blame contribute delete

23.5 kB

	"""This file contains the function calling implementation for different actions.

	This is similar to the functionality of `CodeActResponseParser`.
	"""

	import json

	from browsergym.core.action.highlevel import HighLevelActionSet
	from litellm import (
	ChatCompletionToolParam,
	ChatCompletionToolParamFunctionChunk,
	ModelResponse,
	)

	from openhands.core.exceptions import FunctionCallNotExistsError
	from openhands.core.logger import openhands_logger as logger
	from openhands.events.action import (
	Action,
	AgentDelegateAction,
	AgentFinishAction,
	BrowseInteractiveAction,
	BrowseURLAction,
	CmdRunAction,
	FileEditAction,
	FileReadAction,
	IPythonRunCellAction,
	MessageAction,
	)
	from openhands.events.event import FileEditSource, FileReadSource
	from openhands.events.tool import ToolCallMetadata

	# Description text attached to the `execute_bash` tool definition below.
	_BASH_DESCRIPTION = """Execute a bash command in the terminal.
	* Long running commands: For commands that may run indefinitely, it should be run in the background and the output should be redirected to a file, e.g. command = `python3 app.py > server.log 2>&1 &`.
	* Interact with running process: If a bash command returns exit code `-1`, this means the process is not yet finished. By setting `is_input` to `true`, the assistant can interact with the running process and send empty `command` to retrieve any additional logs, or send additional text (set `command` to the text) to STDIN of the running process, or send command like `C-c` (Ctrl+C), `C-d` (Ctrl+D), `C-z` (Ctrl+Z) to interrupt the process.
	"""

	# Function-calling schema for the `execute_bash` tool.
	# Only `command` is required; `is_input` is optional.
	CmdRunTool = ChatCompletionToolParam(
	type='function',
	function=ChatCompletionToolParamFunctionChunk(
	name='execute_bash',
	description=_BASH_DESCRIPTION,
	parameters={
	'type': 'object',
	'properties': {
	'command': {
	'type': 'string',
	'description': 'The bash command to execute. Can be empty string to view additional logs when previous exit code is `-1`. Can be `C-c` (Ctrl+C) to interrupt the currently running process.',
	},
	# Declared as a string enum ('true'/'false') rather than a JSON boolean;
	# `response_to_actions` converts the value to a Python bool.
	'is_input': {
	'type': 'string',
	'description': 'If True, the command is an input to the running process. If False, the command is a bash command to be executed in the terminal. Default is False.',
	'enum': ['true', 'false'],
	},
	},
	'required': ['command'],
	},
	),
	)

	# Description text attached to the `execute_ipython_cell` tool definition below.
	_IPYTHON_DESCRIPTION = """Run a cell of Python code in an IPython environment.
	* The assistant should define variables and import packages before using them.
	* The variable defined in the IPython environment will not be available outside the IPython environment (e.g., in terminal).
	"""

	# Function-calling schema for the `execute_ipython_cell` tool.
	# Its single required parameter is `code`.
	IPythonTool = ChatCompletionToolParam(
	type='function',
	function=ChatCompletionToolParamFunctionChunk(
	name='execute_ipython_cell',
	description=_IPYTHON_DESCRIPTION,
	parameters={
	'type': 'object',
	'properties': {
	'code': {
	'type': 'string',
	'description': 'The Python code to execute. Supports magic commands like %pip.',
	},
	},
	'required': ['code'],
	},
	),
	)

	# Description text for the `edit_file` tool, including three worked examples.
	# The numbered listings use a plain `|` gutter between the line number and the
	# line content; `\|` would be an invalid escape sequence in this non-raw string
	# and would leak a stray backslash into the prompt shown to the model.
	_FILE_EDIT_DESCRIPTION = """Edit a file.
	* The assistant can edit files by specifying the file path and providing a draft of the new file content.
	* The draft content doesn't need to be exactly the same as the existing file; the assistant may skip unchanged lines using comments like `# unchanged` to indicate unchanged sections.
	* IMPORTANT: For large files (e.g., > 300 lines), specify the range of lines to edit using `start` and `end` (1-indexed, inclusive). The range should be smaller than 300 lines.
	* To append to a file, set both `start` and `end` to `-1`.
	* If the file doesn't exist, a new file will be created with the provided content.

	Example 1: general edit for short files
	For example, given an existing file `/path/to/file.py` that looks like this:
	(this is the end of the file)
	1|class MyClass:
	2| def __init__(self):
	3| self.x = 1
	4| self.y = 2
	5| self.z = 3
	6|
	7|print(MyClass().z)
	8|print(MyClass().x)
	(this is the end of the file)

	The assistant wants to edit the file to look like this:
	(this is the end of the file)
	1|class MyClass:
	2| def __init__(self):
	3| self.x = 1
	4| self.y = 2
	5|
	6|print(MyClass().y)
	(this is the end of the file)

	The assistant may produce an edit action like this:
	path="/path/to/file.txt" start=1 end=-1
	content=```
	class MyClass:
	def __init__(self):
	# no changes before
	self.y = 2
	# self.z is removed

	# MyClass().z is removed
	print(MyClass().y)
	```

	Example 2: append to file for short files
	For example, given an existing file `/path/to/file.py` that looks like this:
	(this is the end of the file)
	1|class MyClass:
	2| def __init__(self):
	3| self.x = 1
	4| self.y = 2
	5| self.z = 3
	6|
	7|print(MyClass().z)
	8|print(MyClass().x)
	(this is the end of the file)

	To append the following lines to the file:
	```python
	print(MyClass().y)
	```

	The assistant may produce an edit action like this:
	path="/path/to/file.txt" start=-1 end=-1
	content=```
	print(MyClass().y)
	```

	Example 3: edit for long files

	Given an existing file `/path/to/file.py` that looks like this:
	(1000 more lines above)
	1001|class MyClass:
	1002| def __init__(self):
	1003| self.x = 1
	1004| self.y = 2
	1005| self.z = 3
	1006|
	1007|print(MyClass().z)
	1008|print(MyClass().x)
	(2000 more lines below)

	The assistant wants to edit the file to look like this:

	(1000 more lines above)
	1001|class MyClass:
	1002| def __init__(self):
	1003| self.x = 1
	1004| self.y = 2
	1005|
	1006|print(MyClass().y)
	(2000 more lines below)

	The assistant may produce an edit action like this:
	path="/path/to/file.txt" start=1001 end=1008
	content=```
	class MyClass:
	def __init__(self):
	# no changes before
	self.y = 2
	# self.z is removed

	# MyClass().z is removed
	print(MyClass().y)
	```
	"""

	# Function-calling schema for the `edit_file` tool.
	# `path` and `content` are required; `start` and `end` are optional integers
	# that bound the edited line range.
	LLMBasedFileEditTool = ChatCompletionToolParam(
	type='function',
	function=ChatCompletionToolParamFunctionChunk(
	name='edit_file',
	description=_FILE_EDIT_DESCRIPTION,
	parameters={
	'type': 'object',
	'properties': {
	'path': {
	'type': 'string',
	'description': 'The absolute path to the file to be edited.',
	},
	'content': {
	'type': 'string',
	'description': 'A draft of the new content for the file being edited. Note that the assistant may skip unchanged lines.',
	},
	'start': {
	'type': 'integer',
	'description': 'The starting line number for the edit (1-indexed, inclusive). Default is 1.',
	},
	'end': {
	'type': 'integer',
	'description': 'The ending line number for the edit (1-indexed, inclusive). Default is -1 (end of file).',
	},
	},
	'required': ['path', 'content'],
	},
	),
	)

	# Description text attached to the `str_replace_editor` tool definition below.
	_STR_REPLACE_EDITOR_DESCRIPTION = """Custom editing tool for viewing, creating and editing files
	* State is persistent across command calls and discussions with the user
	* If `path` is a file, `view` displays the result of applying `cat -n`. If `path` is a directory, `view` lists non-hidden files and directories up to 2 levels deep
	* The `create` command cannot be used if the specified `path` already exists as a file
	* If a `command` generates a long output, it will be truncated and marked with `<response clipped>`
	* The `undo_edit` command will revert the last edit made to the file at `path`

	Notes for using the `str_replace` command:
	* The `old_str` parameter should match EXACTLY one or more consecutive lines from the original file. Be mindful of whitespaces!
	* If the `old_str` parameter is not unique in the file, the replacement will not be performed. Make sure to include enough context in `old_str` to make it unique
	* The `new_str` parameter should contain the edited lines that should replace the `old_str`
	"""

	# Function-calling schema for the `str_replace_editor` tool.
	# Only `command` and `path` are required by the schema; the remaining
	# parameters are tied to specific commands, as stated in their descriptions.
	StrReplaceEditorTool = ChatCompletionToolParam(
	type='function',
	function=ChatCompletionToolParamFunctionChunk(
	name='str_replace_editor',
	description=_STR_REPLACE_EDITOR_DESCRIPTION,
	parameters={
	'type': 'object',
	'properties': {
	'command': {
	'description': 'The commands to run. Allowed options are: `view`, `create`, `str_replace`, `insert`, `undo_edit`.',
	'enum': ['view', 'create', 'str_replace', 'insert', 'undo_edit'],
	'type': 'string',
	},
	'path': {
	'description': 'Absolute path to file or directory, e.g. `/workspace/file.py` or `/workspace`.',
	'type': 'string',
	},
	'file_text': {
	'description': 'Required parameter of `create` command, with the content of the file to be created.',
	'type': 'string',
	},
	'old_str': {
	'description': 'Required parameter of `str_replace` command containing the string in `path` to replace.',
	'type': 'string',
	},
	'new_str': {
	'description': 'Optional parameter of `str_replace` command containing the new string (if not given, no string will be added). Required parameter of `insert` command containing the string to insert.',
	'type': 'string',
	},
	'insert_line': {
	'description': 'Required parameter of `insert` command. The `new_str` will be inserted AFTER the line `insert_line` of `path`.',
	'type': 'integer',
	},
	'view_range': {
	'description': 'Optional parameter of `view` command when `path` points to a file. If none is given, the full file is shown. If provided, the file will be shown in the indicated line number range, e.g. [11, 12] will show lines 11 and 12. Indexing at 1 to start. Setting `[start_line, -1]` shows all lines from `start_line` to the end of the file.',
	'items': {'type': 'integer'},
	'type': 'array',
	},
	},
	'required': ['command', 'path'],
	},
	),
	)


	# Description text attached to the `web_read` tool definition below.
	_WEB_DESCRIPTION = """Read (convert to markdown) content from a webpage. You should prefer using the `web_read` tool over the `browser` tool, but do use the `browser` tool if you need to interact with a webpage (e.g., click a button, fill out a form, etc.).

	You may use the `web_read` tool to read content from a webpage, and even search the webpage content using a Google search query (e.g., url=`https://www.google.com/search?q=YOUR_QUERY`).
	"""

	# Function-calling schema for the `web_read` tool.
	# Its single required parameter is `url`.
	WebReadTool = ChatCompletionToolParam(
	type='function',
	function=ChatCompletionToolParamFunctionChunk(
	name='web_read',
	description=_WEB_DESCRIPTION,
	parameters={
	'type': 'object',
	'properties': {
	'url': {
	'type': 'string',
	'description': 'The URL of the webpage to read. You can also use a Google search query here (e.g., `https://www.google.com/search?q=YOUR_QUERY`).',
	}
	},
	'required': ['url'],
	},
	),
	)

	# Browser action space built from BrowserGym's `HighLevelActionSet`
	# (see browsergym/core/action/highlevel.py), limited to the 'bid' and 'nav'
	# subsets. It is used below to cross-check the hand-written tool description.
	_browser_action_space = HighLevelActionSet(
	subsets=['bid', 'nav'],
	strict=False, # be less strict when parsing the actions
	multiaction=True, # allow the agent to take multiple actions at once
	)


	# Top-level description of the `browser` tool.
	_BROWSER_DESCRIPTION = """Interact with the browser using Python code. Use it ONLY when you need to interact with a webpage.

	See the description of "code" parameter for more details.

	Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
	More than 2-3 actions usually leads to failure or unexpected behavior. Example:
	fill('a12', 'example with "quotes"')
	click('a51')
	click('48', button='middle', modifiers=['Shift'])
	"""

	# Catalogue of the browser functions, embedded in the `code` parameter's
	# description. Every BrowserGym action signature and description must appear
	# here verbatim (this is checked at import time), so union types are written
	# with a plain `|`; `\|` is an invalid escape sequence in a non-raw string and
	# would not match the real signatures.
	_BROWSER_TOOL_DESCRIPTION = """
	The following 15 functions are available. Nothing else is supported.

	goto(url: str)
	Description: Navigate to a url.
	Examples:
	goto('http://www.example.com')

	go_back()
	Description: Navigate to the previous page in history.
	Examples:
	go_back()

	go_forward()
	Description: Navigate to the next page in history.
	Examples:
	go_forward()

	noop(wait_ms: float = 1000)
	Description: Do nothing, and optionally wait for the given time (in milliseconds).
	You can use this to get the current page content and/or wait for the page to load.
	Examples:
	noop()

	noop(500)

	scroll(delta_x: float, delta_y: float)
	Description: Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
	Examples:
	scroll(0, 200)

	scroll(-50.2, -100.5)

	fill(bid: str, value: str)
	Description: Fill out a form field. It focuses the element and triggers an input event with the entered text. It works for <input>, <textarea> and [contenteditable] elements.
	Examples:
	fill('237', 'example value')

	fill('45', 'multi-line\nexample')

	fill('a12', 'example with "quotes"')

	select_option(bid: str, options: str | list[str])
	Description: Select one or multiple options in a <select> element. You can specify option value or label to select. Multiple options can be selected.
	Examples:
	select_option('a48', 'blue')

	select_option('c48', ['red', 'green', 'blue'])

	click(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
	Description: Click an element.
	Examples:
	click('a51')

	click('b22', button='right')

	click('48', button='middle', modifiers=['Shift'])

	dblclick(bid: str, button: Literal['left', 'middle', 'right'] = 'left', modifiers: list[typing.Literal['Alt', 'Control', 'ControlOrMeta', 'Meta', 'Shift']] = [])
	Description: Double click an element.
	Examples:
	dblclick('12')

	dblclick('ca42', button='right')

	dblclick('178', button='middle', modifiers=['Shift'])

	hover(bid: str)
	Description: Hover over an element.
	Examples:
	hover('b8')

	press(bid: str, key_comb: str)
	Description: Focus the matching element and press a combination of keys. It accepts the logical key names that are emitted in the keyboardEvent.key property of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can alternatively specify a single character you'd like to produce such as "a" or "#". Following modification shortcuts are also supported: Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on Windows and Linux and to Meta on macOS.
	Examples:
	press('88', 'Backspace')

	press('a26', 'ControlOrMeta+a')

	press('a61', 'Meta+Shift+t')

	focus(bid: str)
	Description: Focus the matching element.
	Examples:
	focus('b455')

	clear(bid: str)
	Description: Clear the input field.
	Examples:
	clear('996')

	drag_and_drop(from_bid: str, to_bid: str)
	Description: Perform a drag & drop. Hover the element that will be dragged. Press left mouse button. Move mouse to the element that will receive the drop. Release left mouse button.
	Examples:
	drag_and_drop('56', '498')

	upload_file(bid: str, file: str | list[str])
	Description: Click an element and wait for a "filechooser" event, then select one or multiple input files for upload. Relative file paths are resolved relative to the current working directory. An empty list clears the selected files.
	Examples:
	upload_file('572', '/home/user/my_receipt.pdf')

	upload_file('63', ['/home/bob/Documents/image.jpg', '/home/bob/Documents/file.zip'])
	"""


	# Import-time consistency check: every action exposed by the BrowserGym action
	# space must have both its signature and its description present verbatim in
	# the hand-written tool description; otherwise the two have drifted apart.
	for action in _browser_action_space.action_set.values():
	    signature_documented = action.signature in _BROWSER_TOOL_DESCRIPTION
	    assert signature_documented, f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.signature}'
	    description_documented = action.description in _BROWSER_TOOL_DESCRIPTION
	    assert description_documented, f'Browser description mismatch. Please double check if the BrowserGym updated their action space.\n\nAction: {action.description}'

	# Function-calling schema for the `browser` tool.
	# Its single required parameter is `code`, whose description embeds the full
	# catalogue of supported browser functions.
	BrowserTool = ChatCompletionToolParam(
	type='function',
	function=ChatCompletionToolParamFunctionChunk(
	name='browser',
	description=_BROWSER_DESCRIPTION,
	parameters={
	'type': 'object',
	'properties': {
	'code': {
	'type': 'string',
	'description': (
	'The Python code that interacts with the browser.\n'
	+ _BROWSER_TOOL_DESCRIPTION
	),
	}
	},
	'required': ['code'],
	},
	),
	)

	# Description text attached to the `finish` tool definition below.
	_FINISH_DESCRIPTION = """Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task."""

	# Function-calling schema for the `finish` tool; it declares no parameters.
	FinishTool = ChatCompletionToolParam(
	type='function',
	function=ChatCompletionToolParamFunctionChunk(
	name='finish',
	description=_FINISH_DESCRIPTION,
	),
	)


	def combine_thought(action: Action, thought: str) -> Action:
	    """Attach `thought` to `action` and return the same action object.

	    The action's `thought` attribute is overwritten only when the action
	    already has such an attribute and `thought` is non-empty; in every other
	    case the action is returned untouched.
	    """
	    if hasattr(action, 'thought') and thought:
	        action.thought = thought
	    return action


	def _flatten_message_content(content: object) -> str:
	    """Collapse an assistant message's `content` into plain text.

	    A string is returned as-is, a list of content parts is reduced to the
	    concatenation of its `'text'` parts, and anything else (e.g. `None`)
	    yields an empty string.
	    """
	    if isinstance(content, str):
	        return content
	    if isinstance(content, list):
	        return ''.join(part['text'] for part in content if part['type'] == 'text')
	    return ''


	def response_to_actions(response: ModelResponse) -> list[Action]:
	    """Convert a single-choice model response into a list of actions.

	    When the assistant message carries tool calls, each call is mapped to the
	    matching action and tagged with `ToolCallMetadata`; any textual content of
	    the message is attached as the thought of the first action only. When
	    there are no tool calls, the message text is returned as a single
	    `MessageAction` that waits for a response.

	    Args:
	        response: The model response; it must contain exactly one choice.

	    Returns:
	        A non-empty list of actions, in the order of the tool calls.

	    Raises:
	        AssertionError: If the response does not have exactly one choice.
	        RuntimeError: If a tool call's arguments are not valid JSON.
	        FunctionCallNotExistsError: If a tool call names a tool not handled here.
	    """
	    actions: list[Action] = []
	    assert len(response.choices) == 1, 'Only one choice is supported for now'
	    assistant_msg = response.choices[0].message
	    if assistant_msg.tool_calls:
	        # Any text accompanying the tool calls becomes the thought.
	        thought = _flatten_message_content(assistant_msg.content)

	        # Process each tool call to OpenHands action
	        for i, tool_call in enumerate(assistant_msg.tool_calls):
	            action: Action
	            try:
	                arguments = json.loads(tool_call.function.arguments)
	            except json.decoder.JSONDecodeError as e:
	                raise RuntimeError(
	                    f'Failed to parse tool call arguments: {tool_call.function.arguments}'
	                ) from e
	            if tool_call.function.name == 'execute_bash':
	                # this is an LLM error: add empty command to avoid breaking the tool call
	                if 'command' not in arguments:
	                    arguments['command'] = ''
	                # Convert is_input to boolean. The schema declares a string enum,
	                # but models sometimes emit a JSON boolean; normalising through
	                # str() keeps `true` from being silently read as False.
	                if 'is_input' in arguments:
	                    arguments['is_input'] = (
	                        str(arguments['is_input']).lower() == 'true'
	                    )
	                action = CmdRunAction(**arguments)
	            elif tool_call.function.name == 'execute_ipython_cell':
	                action = IPythonRunCellAction(**arguments)
	            elif tool_call.function.name == 'delegate_to_browsing_agent':
	                action = AgentDelegateAction(
	                    agent='BrowsingAgent',
	                    inputs=arguments,
	                )
	            elif tool_call.function.name == 'finish':
	                action = AgentFinishAction()
	            elif tool_call.function.name == 'edit_file':
	                action = FileEditAction(**arguments)
	            elif tool_call.function.name == 'str_replace_editor':
	                # We implement this in agent_skills, which can be used via Jupyter
	                # convert tool_call.function.arguments to kwargs that can be passed to file_editor
	                code = f'print(file_editor(**{arguments}))'
	                logger.debug(
	                    f'TOOL CALL: str_replace_editor -> file_editor with code: {code}'
	                )

	                if arguments['command'] == 'view':
	                    action = FileReadAction(
	                        path=arguments['path'],
	                        translated_ipython_code=code,
	                        impl_source=FileReadSource.OH_ACI,
	                    )
	                else:
	                    action = FileEditAction(
	                        path=arguments['path'],
	                        content='',  # dummy value -- we don't need it
	                        translated_ipython_code=code,
	                        impl_source=FileEditSource.OH_ACI,
	                    )
	            elif tool_call.function.name == 'browser':
	                action = BrowseInteractiveAction(browser_actions=arguments['code'])
	            elif tool_call.function.name == 'web_read':
	                action = BrowseURLAction(url=arguments['url'])
	            else:
	                raise FunctionCallNotExistsError(
	                    f'Tool {tool_call.function.name} is not registered. (arguments: {arguments}). Please check the tool name and retry with an existing tool.'
	                )

	            # We only add thought to the first action
	            if i == 0:
	                action = combine_thought(action, thought)
	            # Add metadata for tool calling
	            action.tool_call_metadata = ToolCallMetadata(
	                tool_call_id=tool_call.id,
	                function_name=tool_call.function.name,
	                model_response=response,
	                total_calls_in_response=len(assistant_msg.tool_calls),
	            )
	            actions.append(action)
	    else:
	        # No tool calls: surface the message text itself. The content may be
	        # None or a list of parts, so it is flattened to a plain string first.
	        actions.append(
	            MessageAction(
	                content=_flatten_message_content(assistant_msg.content),
	                wait_for_response=True,
	            )
	        )

	    assert len(actions) >= 1
	    return actions


	def get_tools(
	    codeact_enable_browsing: bool = False,
	    codeact_enable_llm_editor: bool = False,
	    codeact_enable_jupyter: bool = False,
	) -> list[ChatCompletionToolParam]:
	    """Assemble the list of tool definitions for the given feature flags.

	    The bash and finish tools are always included. Browsing adds the web-read
	    and browser tools, Jupyter adds the IPython tool, and exactly one file
	    editor is appended last: the LLM-based editor when
	    `codeact_enable_llm_editor` is set, otherwise the string-replace editor.
	    """
	    browsing_tools = [WebReadTool, BrowserTool] if codeact_enable_browsing else []
	    jupyter_tools = [IPythonTool] if codeact_enable_jupyter else []
	    editor_tool = (
	        LLMBasedFileEditTool if codeact_enable_llm_editor else StrReplaceEditorTool
	    )
	    return [CmdRunTool, FinishTool, *browsing_tools, *jupyter_tools, editor_tool]