Alex J. Chan committed on
Commit
11250db
·
2 Parent(s): 0eff0a1 fab2b20

Merge pull request #3 from convergence-ai/alex/descriptions

Browse files
.gitignore CHANGED
@@ -171,4 +171,5 @@ cython_debug/
171
  .pypirc
172
 
173
  logs/
174
- local_trajectories/
 
 
171
  .pypirc
172
 
173
  logs/
174
+ local_trajectories/
175
+ screenshots/
src/proxy_lite/cli.py CHANGED
@@ -1,10 +1,12 @@
1
  import argparse
2
  import asyncio
 
3
  import os
4
  from pathlib import Path
5
  from typing import Optional
6
 
7
  from proxy_lite import Runner, RunnerConfig
 
8
  from proxy_lite.logger import logger
9
 
10
 
@@ -35,7 +37,21 @@ def do_command(args):
35
  if args.viewport_height:
36
  config.viewport_height = args.viewport_height
37
  o = Runner(config=config)
38
- asyncio.run(o.run(do_text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
 
41
  def main():
 
1
  import argparse
2
  import asyncio
3
+ import base64
4
  import os
5
  from pathlib import Path
6
  from typing import Optional
7
 
8
  from proxy_lite import Runner, RunnerConfig
9
+ from proxy_lite.gif_maker import create_run_gif
10
  from proxy_lite.logger import logger
11
 
12
 
 
37
  if args.viewport_height:
38
  config.viewport_height = args.viewport_height
39
  o = Runner(config=config)
40
+ result = asyncio.run(o.run(do_text))
41
+
42
+ final_screenshot = result.observations[-1].info["original_image"]
43
+ folder_path = Path(__file__).parent.parent.parent / "screenshots"
44
+ folder_path.mkdir(parents=True, exist_ok=True)
45
+ path = folder_path / f"{result.run_id}.png"
46
+ with open(path, "wb") as f:
47
+ f.write(base64.b64decode(final_screenshot))
48
+ logger.info(f"🤖 Final screenshot saved to {path}")
49
+
50
+ gif_folder_path = Path(__file__).parent.parent.parent / "gifs"
51
+ gif_folder_path.mkdir(parents=True, exist_ok=True)
52
+ gif_path = gif_folder_path / f"{result.run_id}.gif"
53
+ create_run_gif(result, gif_path, duration=1500)
54
+ logger.info(f"🤖 GIF saved to {gif_path}")
55
 
56
 
57
  def main():
src/proxy_lite/client.py CHANGED
@@ -15,7 +15,7 @@ from proxy_lite.history import MessageHistory
15
  from proxy_lite.logger import logger
16
  from proxy_lite.serializer import (
17
  BaseSerializer,
18
- OpenAISerializer,
19
  )
20
  from proxy_lite.tools import Tool
21
 
@@ -78,7 +78,7 @@ class OpenAIClientConfig(BaseClientConfig):
78
 
79
  class OpenAIClient(BaseClient):
80
  config: OpenAIClientConfig
81
- serializer: ClassVar[OpenAISerializer] = OpenAISerializer()
82
 
83
  @cached_property
84
  def external_client(self) -> AsyncOpenAI:
@@ -119,14 +119,14 @@ class ConvergenceClientConfig(BaseClientConfig):
119
 
120
  class ConvergenceClient(OpenAIClient):
121
  config: ConvergenceClientConfig
122
- serializer: ClassVar[OpenAISerializer] = OpenAISerializer()
123
  _model_validated: bool = False
124
 
125
  async def _validate_model(self) -> None:
126
  try:
127
- await self.external_client.beta.chat.completions.parse(
128
- model=self.config.model_id,
129
- messages=[{"role": "user", "content": "Hello"}],
130
  )
131
  self._model_validated = True
132
  logger.debug(f"Model {self.config.model_id} validated and connected to cluster")
 
15
  from proxy_lite.logger import logger
16
  from proxy_lite.serializer import (
17
  BaseSerializer,
18
+ OpenAICompatibleSerializer,
19
  )
20
  from proxy_lite.tools import Tool
21
 
 
78
 
79
  class OpenAIClient(BaseClient):
80
  config: OpenAIClientConfig
81
+ serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
82
 
83
  @cached_property
84
  def external_client(self) -> AsyncOpenAI:
 
119
 
120
  class ConvergenceClient(OpenAIClient):
121
  config: ConvergenceClientConfig
122
+ serializer: ClassVar[OpenAICompatibleSerializer] = OpenAICompatibleSerializer()
123
  _model_validated: bool = False
124
 
125
  async def _validate_model(self) -> None:
126
  try:
127
+ response = await self.external_client.models.list()
128
+ assert self.config.model_id in [model.id for model in response.data], (
129
+ f"Model {self.config.model_id} not found in {response.data}"
130
  )
131
  self._model_validated = True
132
  logger.debug(f"Model {self.config.model_id} validated and connected to cluster")
src/proxy_lite/configs/default.yaml CHANGED
@@ -7,6 +7,7 @@ environment:
7
  include_poi_text: true
8
  headless: false
9
  homepage: https://www.google.co.uk
 
10
  solver:
11
  name: simple
12
  agent:
@@ -17,4 +18,6 @@ solver:
17
  api_base: https://convergence-ai-demo-api.hf.space/v1
18
  local_view: true
19
  task_timeout: 1800
 
 
20
  verbose: true
 
7
  include_poi_text: true
8
  headless: false
9
  homepage: https://www.google.co.uk
10
+ keep_original_image: true
11
  solver:
12
  name: simple
13
  agent:
 
18
  api_base: https://convergence-ai-demo-api.hf.space/v1
19
  local_view: true
20
  task_timeout: 1800
21
+ environment_timeout: 1800
22
+ action_timeout: 1800
23
  verbose: true
src/proxy_lite/gif_maker.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import re
3
+ import textwrap
4
+ from io import BytesIO
5
+
6
+ from PIL import Image, ImageDraw, ImageFont
7
+
8
+ from proxy_lite.environments.environment_base import Action, Observation
9
+ from proxy_lite.recorder import Run
10
+
11
+
12
+ def create_run_gif(
13
+ run: Run, output_path: str, white_panel_width: int = 300, duration: int = 1500, resize_factor: int = 4
14
+ ) -> None:
15
+ """
16
+ Generate a gif from the Run object's history.
17
+
18
+ For each Observation record, the observation image is decoded from its base64
19
+ encoded string. If the next record is an Action, its text is drawn onto a
20
+ white panel. The observation image and the white panel are then concatenated
21
+ horizontally to produce a frame.
22
+
23
+ Parameters:
24
+ run (Run): A Run object with its history containing Observation and Action records.
25
+ output_path (str): The path where the GIF will be saved.
26
+ white_panel_width (int): The width of the white panel for displaying text.
27
+ Default increased to 400 for larger images.
28
+ duration (int): Duration between frames in milliseconds.
29
+ Increased here to slow the FPS (default is 1000ms).
30
+ resize_factor (int): The factor to resize the image down by.
31
+ """
32
+ frames = []
33
+ history = run.history
34
+ i = 0
35
+ while i < len(history):
36
+ if isinstance(history[i], Observation):
37
+ observation = history[i]
38
+ image_data = observation.state.image
39
+ if not image_data:
40
+ i += 1
41
+ continue
42
+ # Decode the base64 image
43
+ image_bytes = base64.b64decode(image_data)
44
+ obs_img = Image.open(BytesIO(image_bytes)).convert("RGB")
45
+
46
+ # scale the image down
47
+ obs_img = obs_img.resize((obs_img.width // resize_factor, obs_img.height // resize_factor))
48
+
49
+ # Check if the next record is an Action and extract its text if available
50
+ action_text = ""
51
+ if i + 1 < len(history) and isinstance(history[i + 1], Action):
52
+ action = history[i + 1]
53
+ if action.text:
54
+ action_text = action.text
55
+
56
+ # extract observation and thinking from tags in the action text
57
+ observation_match = re.search(r"<observation>(.*?)</observation>", action_text, re.DOTALL)
58
+ observation_content = observation_match.group(1).strip() if observation_match else None
59
+
60
+ # Extract text between thinking tags if present
61
+ thinking_match = re.search(r"<thinking>(.*?)</thinking>", action_text, re.DOTALL)
62
+ thinking_content = thinking_match.group(1).strip() if thinking_match else None
63
+
64
+ if observation_content and thinking_content:
65
+ action_text = f"**OBSERVATION**\n{observation_content}\n\n**THINKING**\n{thinking_content}"
66
+
67
+ # Create a white panel (same height as the observation image)
68
+ panel = Image.new("RGB", (white_panel_width, obs_img.height), "white")
69
+ draw = ImageDraw.Draw(panel)
70
+ font = ImageFont.load_default()
71
+
72
+ # Wrap the action text if it is too long
73
+ max_chars_per_line = 40 # Adjusted for larger font size
74
+ wrapped_text = textwrap.fill(action_text, width=max_chars_per_line)
75
+
76
+ # Calculate text block size and center it on the panel
77
+ try:
78
+ # Use multiline_textbbox if available (returns bounding box tuple)
79
+ bbox = draw.multiline_textbbox((0, 0), wrapped_text, font=font)
80
+ text_width, text_height = bbox[2] - bbox[0], bbox[3] - bbox[1]
81
+ except AttributeError:
82
+ # Fallback for older Pillow versions: compute size for each line
83
+ lines = wrapped_text.splitlines() or [wrapped_text]
84
+ line_sizes = [draw.textsize(line, font=font) for line in lines]
85
+ text_width = max(width for width, _ in line_sizes)
86
+ text_height = sum(height for _, height in line_sizes)
87
+ text_x = (white_panel_width - text_width) // 2
88
+ text_y = (obs_img.height - text_height) // 2
89
+ draw.multiline_text((text_x, text_y), wrapped_text, fill="black", font=font, align="center")
90
+
91
+ # Create the combined frame by concatenating the observation image and the panel
92
+ total_width = obs_img.width + white_panel_width
93
+ combined_frame = Image.new("RGB", (total_width, obs_img.height))
94
+ combined_frame.paste(obs_img, (0, 0))
95
+ combined_frame.paste(panel, (obs_img.width, 0))
96
+ frames.append(combined_frame)
97
+
98
+ # Skip the Action record since it has been processed with this Observation
99
+ if i + 1 < len(history) and isinstance(history[i + 1], Action):
100
+ i += 2
101
+ else:
102
+ i += 1
103
+ else:
104
+ i += 1
105
+
106
+ if frames:
107
+ frames[0].save(output_path, save_all=True, append_images=frames[1:], duration=duration, loop=0)
108
+ else:
109
+ raise ValueError("No frames were generated from the Run object's history.")
110
+
111
+
112
+ # Example usage:
113
+ if __name__ == "__main__":
114
+ from proxy_lite.recorder import Run
115
+
116
+ dummy_run = Run.load("0abdb4cb-f289-48b0-ba13-35ed1210f7c1")
117
+
118
+ num_steps = int(len(dummy_run.history) / 2)
119
+ print(f"Number of steps: {num_steps}")
120
+ output_gif_path = "trajectory.gif"
121
+ create_run_gif(dummy_run, output_gif_path, duration=1000)
122
+ print(f"Trajectory GIF saved to {output_gif_path}")
src/proxy_lite/recorder.py CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
3
  import datetime
4
  import json
5
  import os
6
- import sys
7
  import uuid
8
  from pathlib import Path
9
  from typing import Any, Optional, Self
@@ -39,6 +38,11 @@ class Run(BaseModel):
39
  created_at=str(datetime.datetime.now(datetime.UTC)),
40
  )
41
 
 
 
 
 
 
42
  @property
43
  def observations(self) -> list[Observation]:
44
  return [h for h in self.history if isinstance(h, Observation)]
@@ -80,7 +84,7 @@ class DataRecorder:
80
  self.local_folder = local_folder
81
 
82
  def initialise_run(self, task: str) -> Run:
83
- self.local_folder = Path(os.path.abspath(".")) / "local_trajectories"
84
  os.makedirs(self.local_folder, exist_ok=True)
85
  return Run.initialise(task)
86
 
 
3
  import datetime
4
  import json
5
  import os
 
6
  import uuid
7
  from pathlib import Path
8
  from typing import Any, Optional, Self
 
38
  created_at=str(datetime.datetime.now(datetime.UTC)),
39
  )
40
 
41
+ @classmethod
42
+ def load(cls, run_id: str) -> Self:
43
+ with open(Path(__file__).parent.parent.parent / "local_trajectories" / f"{run_id}.json", "r") as f:
44
+ return cls(**json.load(f))
45
+
46
  @property
47
  def observations(self) -> list[Observation]:
48
  return [h for h in self.history if isinstance(h, Observation)]
 
84
  self.local_folder = local_folder
85
 
86
  def initialise_run(self, task: str) -> Run:
87
+ self.local_folder = Path(__file__).parent.parent.parent / "local_trajectories"
88
  os.makedirs(self.local_folder, exist_ok=True)
89
  return Run.initialise(task)
90
 
src/proxy_lite/serializer.py CHANGED
@@ -25,7 +25,7 @@ class BaseSerializer(BaseModel, ABC):
25
  def serialize_tools(self, tools: list[Tool]) -> list[dict]: ...
26
 
27
 
28
- class OpenAISerializer(BaseSerializer):
29
  def serialize_messages(self, message_history: MessageHistory) -> list[dict]:
30
  return message_history.to_dict(exclude={"label"})
31
 
 
25
  def serialize_tools(self, tools: list[Tool]) -> list[dict]: ...
26
 
27
 
28
+ class OpenAICompatibleSerializer(BaseSerializer):
29
  def serialize_messages(self, message_history: MessageHistory) -> list[dict]:
30
  return message_history.to_dict(exclude={"label"})
31