File size: 5,019 Bytes
246d201
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import ast
import re

from openhands.controller.action_parser import ActionParser, ResponseParser
from openhands.core.logger import openhands_logger as logger
from openhands.events.action import (
    Action,
    BrowseInteractiveAction,
)


class BrowsingResponseParser(ResponseParser):
    def __init__(self):
        # Need to pay attention to the item order in self.action_parsers
        super().__init__()
        self.action_parsers = [BrowsingActionParserMessage()]
        self.default_parser = BrowsingActionParserBrowseInteractive()

    def parse(self, response: str) -> Action:
        action_str = self.parse_response(response)
        return self.parse_action(action_str)

    def parse_response(self, response) -> str:
        action_str = response['choices'][0]['message']['content']
        if action_str is None:
            return ''
        action_str = action_str.strip()
        # Ensure action_str ends with ')```'
        if action_str:
            if not action_str.endswith('```'):
                if action_str.endswith(')'):
                    action_str += '```'  # prevent duplicate ending paranthesis, e.g. send_msg_to_user('Done'))
                else:
                    action_str += ')```'  # expected format
        logger.debug(action_str)
        return action_str

    def parse_action(self, action_str: str) -> Action:
        for action_parser in self.action_parsers:
            if action_parser.check_condition(action_str):
                return action_parser.parse(action_str)
        return self.default_parser.parse(action_str)


class BrowsingActionParserMessage(ActionParser):
    """Parser action:

    - BrowseInteractiveAction(browser_actions) - unexpected response format, message back to user

    """

    def __init__(

        self,

    ):
        pass

    def check_condition(self, action_str: str) -> bool:
        return '```' not in action_str

    def parse(self, action_str: str) -> Action:
        msg = f'send_msg_to_user("""{action_str}""")'
        return BrowseInteractiveAction(
            browser_actions=msg,
            thought=action_str,
            browsergym_send_msg_to_user=action_str,
        )


class BrowsingActionParserBrowseInteractive(ActionParser):
    """Parser action:

    - BrowseInteractiveAction(browser_actions) - handle send message to user function call in BrowserGym

    """

    def __init__(

        self,

    ):
        pass

    def check_condition(self, action_str: str) -> bool:
        return True

    def parse(self, action_str: str) -> Action:
        # parse the action string into browser_actions and thought
        # the LLM can return only one string, or both

        # when both are returned, it looks like this:
        ### Based on the current state of the page and the goal of finding out the president of the USA, the next action should involve searching for information related to the president.
        ### To achieve this, we can navigate to a reliable source such as a search engine or a specific website that provides information about the current president of the USA.
        ### Here is an example of a valid action to achieve this:
        ### ```
        ### goto('https://www.whitehouse.gov/about-the-white-house/presidents/'
        # in practice, BrowsingResponseParser.parse_response also added )``` to the end of the string

        # when the LLM returns only one string, it looks like this:
        ### goto('https://www.whitehouse.gov/about-the-white-house/presidents/')
        # and parse_response added )``` to the end of the string
        parts = action_str.split('```')
        browser_actions = (
            parts[1].strip() if parts[1].strip() != '' else parts[0].strip()
        )
        thought = parts[0].strip() if parts[1].strip() != '' else ''

        # if the LLM wants to talk to the user, we extract the message
        msg_content = ''
        for sub_action in browser_actions.split('\n'):
            if 'send_msg_to_user(' in sub_action:
                try:
                    tree = ast.parse(sub_action)
                    args = tree.body[0].value.args  # type: ignore
                    msg_content = args[0].value
                except SyntaxError:
                    logger.error(f'Error parsing action: {sub_action}')
                    # the syntax was not correct, but we can still try to get the message
                    # e.g. send_msg_to_user("Hello, world!") or send_msg_to_user('Hello, world!'
                    match = re.search(r'send_msg_to_user\((["\'])(.*?)\1\)', sub_action)
                    if match:
                        msg_content = match.group(2)
                    else:
                        msg_content = ''

        return BrowseInteractiveAction(
            browser_actions=browser_actions,
            thought=thought,
            browsergym_send_msg_to_user=msg_content,
        )