Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Create screen parsing module #2530

Draft
wants to merge 21 commits into
base: ct_webarena
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,17 @@
from playwright.sync_api import sync_playwright
from playwright._impl._errors import TimeoutError
from .... import Agent, ConversableAgent, OpenAIWrapper
from ....runtime_logging import logging_enabled, log_event
from ....code_utils import content_str
from .state_of_mark import add_state_of_mark

from autogen.runtime_logging import logging_enabled, log_event
from screen_parsing.ocr import OCR, OCRParsingError
from screen_parsing.utils.state_of_mark import add_state_of_mark

from importlib import resources

try:
from termcolor import colored

except ImportError:

def colored(x, *args, **kwargs):
Expand Down Expand Up @@ -71,6 +76,7 @@ def __init__(
browser_data_dir: Optional[str] = None,
start_page: Optional[str] = None,
debug_dir: Optional[str] = None,
ocr_enabled: Optional[bool] = False,
):
"""
Create a new MultimodalWebSurferAgent.
Expand Down Expand Up @@ -108,6 +114,7 @@ def __init__(
# self._mlm_client = OpenAIWrapper(**self._mlm_config)
self.start_page = start_page or self.DEFAULT_START_PAGE
self.debug_dir = debug_dir or os.getcwd()
self.ocr = OCR() if ocr_enabled else None

# Create the playwright instance
launch_args = {"headless": headless}
Expand Down Expand Up @@ -137,7 +144,7 @@ def __init__(
# Create the page
self._page = self._context.new_page()
self._page.set_viewport_size({"width": VIEWPORT_WIDTH, "height": VIEWPORT_HEIGHT})
self._page.add_init_script(path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"))
self._page.add_init_script(path=self._get_page_script_path())
self._page.goto(self.start_page)
self._page.wait_for_load_state()
time.sleep(1)
Expand Down Expand Up @@ -407,7 +414,7 @@ def generate_surfer_reply(
self._page.wait_for_load_state()
time.sleep(1)

# Descrive the viewport of the new page in words
# Describe the viewport of the new page in words
viewport = self._get_visual_viewport()
percent_visible = int(viewport["height"] * 100 / viewport["scrollHeight"])
percent_scrolled = int(viewport["pageTop"] * 100 / viewport["scrollHeight"])
Expand All @@ -433,12 +440,41 @@ def generate_surfer_reply(
percent_visible=percent_visible,
percent_scrolled=percent_scrolled,
)

ocr_text = None
if self.ocr is not None:
try:
ocr_text = self.ocr.get_ocr_text(new_screenshot)
except OCRParsingError as e:
if logging_enabled():
import traceback

exc_type = type(e).__name__
exc_message = str(e)
exc_traceback = traceback.format_exc().splitlines()
log_event(
self,
"exception_thrown_ocr_detecting",
exc_type=exc_type,
exc_message=exc_message,
exc_traceback=exc_traceback,
)

text_prompt = f"{action_description} Here is a screenshot of [{self._page.title()}]({self._page.url}). The viewport shows {percent_visible}% of the webpage, and is positioned {position_text}.".strip()

if ocr_text is not None:
text_prompt += ocr_text
cheng-tan marked this conversation as resolved.
Show resolved Hide resolved

# Return the complete observation
return True, self._make_mm_message(
f"{action_description} Here is a screenshot of [{self._page.title()}]({self._page.url}). The viewport shows {percent_visible}% of the webpage, and is positioned {position_text}.".strip(),
text_prompt,
new_screenshot,
)

def _get_page_script_path(self):
    """Return the filesystem path of the bundled ``page_script.js`` as a string.

    The script is looked up as the resource ``page_script.js`` inside the
    ``screen_parsing.static`` package via ``importlib.resources.path``.

    ``resources.path`` is a context manager: when the resource does not live
    directly on the filesystem (for example, the package is imported from a
    zip archive) it is extracted to a temporary file that is removed as soon
    as the context exits. Returning the path from inside a ``with`` block
    would therefore hand callers a path that may no longer exist. To keep the
    returned path usable, the context is held open until interpreter exit,
    and the resolved path is cached on the instance so the resource is only
    resolved once per agent.

    Returns:
        str: Path to a readable copy of ``page_script.js``.
    """
    import atexit
    from contextlib import ExitStack

    cached_path = getattr(self, "_page_script_path", None)
    if cached_path is not None:
        return cached_path

    # Keep the resource context alive for the lifetime of the process so that
    # any temporary extraction is not deleted while callers still use it.
    resource_stack = ExitStack()
    atexit.register(resource_stack.close)
    script_path = str(resource_stack.enter_context(resources.path("screen_parsing.static", "page_script.js")))

    self._page_script_path = script_path
    return script_path

def _image_to_data_uri(self, image):
"""
Image can be a bytes string, a Binary file-like stream, or PIL Image.
Expand Down Expand Up @@ -472,23 +508,23 @@ def _make_mm_message(self, text_content, image_content):

def _get_interactive_rects(self):
try:
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
with open(self._get_page_script_path(), "rt") as fh:
self._page.evaluate(fh.read())
except:
pass
return self._page.evaluate("MultimodalWebSurfer.getInteractiveRects();")

def _get_visual_viewport(self):
try:
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
with open(self._get_page_script_path(), "rt") as fh:
self._page.evaluate(fh.read())
except:
pass
return self._page.evaluate("MultimodalWebSurfer.getVisualViewport();")

def _get_focused_rect_id(self):
try:
with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
with open(self._get_page_script_path(), "rt") as fh:
self._page.evaluate(fh.read())
except:
pass
Expand All @@ -498,7 +534,7 @@ def _on_new_page(self, page):
self._page = page
self._page.set_viewport_size({"width": VIEWPORT_WIDTH, "height": VIEWPORT_HEIGHT})
time.sleep(0.2)
self._page.add_init_script(path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"))
self._page.add_init_script(self._get_page_script_path())
self._page.wait_for_load_state()

title = None
Expand Down
211 changes: 0 additions & 211 deletions autogen/agentchat/contrib/multimodal_web_surfer/page_script.js

This file was deleted.

2 changes: 1 addition & 1 deletion autogen/agentchat/contrib/web_surfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from typing import Any, Dict, List, Optional, Union, Callable, Literal, Tuple
from typing_extensions import Annotated
from ... import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper
from ...browser_utils import AbstractMarkdownBrowser, RequestsMarkdownBrowser, BingMarkdownSearch
from autogen.browser_utils import AbstractMarkdownBrowser, RequestsMarkdownBrowser, BingMarkdownSearch
from ...code_utils import content_str
from ...token_count_utils import count_token, get_max_token_limit
from ...oai.openai_utils import filter_config
Expand Down
5 changes: 0 additions & 5 deletions autogen/browser_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from .selenium_markdown_browser import SeleniumMarkdownBrowser
from .playwright_markdown_browser import PlaywrightMarkdownBrowser
from .markdown_search import AbstractMarkdownSearch, BingMarkdownSearch
from .mdconvert import MarkdownConverter, UnsupportedFormatException, FileConversionException, DocumentConverterResult

__all__ = (
"AbstractMarkdownBrowser",
Expand All @@ -12,8 +11,4 @@
"PlaywrightMarkdownBrowser",
"AbstractMarkdownSearch",
"BingMarkdownSearch",
"MarkdownConverter",
"UnsupportedFormatException",
"FileConversionException",
"DocumentConverterResult",
)
1 change: 0 additions & 1 deletion autogen/browser_utils/markdown_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import logging
import os

from bs4 import BeautifulSoup
from typing import Any, Dict, List, Optional, Union, Tuple
from urllib.parse import urlparse, quote, quote_plus, unquote, urlunparse, parse_qs
from abc import ABC, abstractmethod
Expand Down
8 changes: 8 additions & 0 deletions autogen/screen_parsing/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
This is the screen parsing module for autogen/multi_modal_web_surfer

# Get Started
```
pip install -e .
```

# Run tests
```
pip install -e .[test]
cd test && pytest .
```
Empty file.