microsoft · cheng-tan · Apr 25, 2024 · Apr 26, 2024 · May 6, 2024 · May 6, 2024
diff --git a/autogen/agentchat/contrib/multimodal_web_surfer/multimodal_web_surfer.py b/autogen/agentchat/contrib/multimodal_web_surfer/multimodal_web_surfer.py
@@ -13,12 +13,17 @@
 from playwright.sync_api import sync_playwright
 from playwright._impl._errors import TimeoutError
 from .... import Agent, ConversableAgent, OpenAIWrapper
-from ....runtime_logging import logging_enabled, log_event
 from ....code_utils import content_str
-from .state_of_mark import add_state_of_mark
+
+from autogen.runtime_logging import logging_enabled, log_event
+from screen_parsing.ocr import OCR, OCRParsingError
+from screen_parsing.utils.state_of_mark import add_state_of_mark
+
+from importlib import resources
 
 try:
     from termcolor import colored
+
 except ImportError:
 
     def colored(x, *args, **kwargs):
@@ -71,6 +76,7 @@ def __init__(
         browser_data_dir: Optional[str] = None,
         start_page: Optional[str] = None,
         debug_dir: Optional[str] = None,
+        ocr_enabled: Optional[bool] = False,
     ):
         """
         Create a new MultimodalWebSurferAgent.
@@ -108,6 +114,7 @@ def __init__(
         # self._mlm_client = OpenAIWrapper(**self._mlm_config)
         self.start_page = start_page or self.DEFAULT_START_PAGE
         self.debug_dir = debug_dir or os.getcwd()
+        self.ocr = OCR() if ocr_enabled else None
 
         # Create the playwright instance
         launch_args = {"headless": headless}
@@ -137,7 +144,7 @@ def __init__(
         # Create the page
         self._page = self._context.new_page()
         self._page.set_viewport_size({"width": VIEWPORT_WIDTH, "height": VIEWPORT_HEIGHT})
-        self._page.add_init_script(path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"))
+        self._page.add_init_script(path=self._get_page_script_path())
         self._page.goto(self.start_page)
         self._page.wait_for_load_state()
         time.sleep(1)
@@ -407,7 +414,7 @@ def generate_surfer_reply(
         self._page.wait_for_load_state()
         time.sleep(1)
 
-        # Descrive the viewport of the new page in words
+        # Describe the viewport of the new page in words
         viewport = self._get_visual_viewport()
         percent_visible = int(viewport["height"] * 100 / viewport["scrollHeight"])
         percent_scrolled = int(viewport["pageTop"] * 100 / viewport["scrollHeight"])
@@ -433,12 +440,41 @@ def generate_surfer_reply(
                 percent_visible=percent_visible,
                 percent_scrolled=percent_scrolled,
             )
+
+        ocr_text = None
+        if self.ocr is not None:
+            try:
+                ocr_text = self.ocr.get_ocr_text(new_screenshot)
+            except OCRParsingError as e:
+                if logging_enabled():
+                    import traceback
+
+                    exc_type = type(e).__name__
+                    exc_message = str(e)
+                    exc_traceback = traceback.format_exc().splitlines()
+                    log_event(
+                        self,
+                        "exception_thrown_ocr_detecting",
+                        exc_type=exc_type,
+                        exc_message=exc_message,
+                        exc_traceback=exc_traceback,
+                    )
+
+        text_prompt = f"{action_description} Here is a screenshot of [{self._page.title()}]({self._page.url}). The viewport shows {percent_visible}% of the webpage, and is positioned {position_text}.".strip()
+
+        if ocr_text is not None:
+            text_prompt += ocr_text
+
         # Return the complete observation
         return True, self._make_mm_message(
-            f"{action_description} Here is a screenshot of [{self._page.title()}]({self._page.url}). The viewport shows {percent_visible}% of the webpage, and is positioned {position_text}.".strip(),
+            text_prompt,
             new_screenshot,
         )
 
+    def _get_page_script_path(self):  # Return the location of the packaged "page_script.js" resource as a str.
+        with resources.path("screen_parsing.static", "page_script.js") as path:
+            return str(path)  # NOTE(review): the str is used after the context manager exits; presumably fine only when the resource is a real on-disk file -- confirm
+
     def _image_to_data_uri(self, image):
         """
         Image can be a bytes string, a Binary file-like stream, or PIL Image.
@@ -472,23 +508,23 @@ def _make_mm_message(self, text_content, image_content):
 
     def _get_interactive_rects(self):
         try:
-            with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
+            with open(self._get_page_script_path(), "rt") as fh:  # best-effort re-evaluation of page_script.js in the page; any failure is swallowed by the bare except below
                 self._page.evaluate(fh.read())
         except:
             pass
         return self._page.evaluate("MultimodalWebSurfer.getInteractiveRects();")
 
     def _get_visual_viewport(self):
         try:
-            with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
+            with open(self._get_page_script_path(), "rt") as fh:  # best-effort re-evaluation of page_script.js in the page; any failure is swallowed by the bare except below
                 self._page.evaluate(fh.read())
         except:
             pass
         return self._page.evaluate("MultimodalWebSurfer.getVisualViewport();")
 
     def _get_focused_rect_id(self):
         try:
-            with open(os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"), "rt") as fh:
+            with open(self._get_page_script_path(), "rt") as fh:
                 self._page.evaluate(fh.read())
         except:
             pass
@@ -498,7 +534,7 @@ def _on_new_page(self, page):
         self._page = page
         self._page.set_viewport_size({"width": VIEWPORT_WIDTH, "height": VIEWPORT_HEIGHT})
         time.sleep(0.2)
-        self._page.add_init_script(path=os.path.join(os.path.abspath(os.path.dirname(__file__)), "page_script.js"))
+        self._page.add_init_script(path=self._get_page_script_path())  # path= is required: the first positional parameter is the script source, not a file path
         self._page.wait_for_load_state()
 
         title = None

diff --git a/autogen/agentchat/contrib/multimodal_web_surfer/page_script.js b/autogen/agentchat/contrib/multimodal_web_surfer/page_script.js
diff --git a/autogen/agentchat/contrib/web_surfer.py b/autogen/agentchat/contrib/web_surfer.py
@@ -6,7 +6,7 @@
 from typing import Any, Dict, List, Optional, Union, Callable, Literal, Tuple
 from typing_extensions import Annotated
 from ... import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper
-from ...browser_utils import AbstractMarkdownBrowser, RequestsMarkdownBrowser, BingMarkdownSearch
+from autogen.browser_utils import AbstractMarkdownBrowser, RequestsMarkdownBrowser, BingMarkdownSearch
 from ...code_utils import content_str
 from ...token_count_utils import count_token, get_max_token_limit
 from ...oai.openai_utils import filter_config

diff --git a/autogen/browser_utils/__init__.py b/autogen/browser_utils/__init__.py
@@ -3,7 +3,6 @@
 from .selenium_markdown_browser import SeleniumMarkdownBrowser
 from .playwright_markdown_browser import PlaywrightMarkdownBrowser
 from .markdown_search import AbstractMarkdownSearch, BingMarkdownSearch
-from .mdconvert import MarkdownConverter, UnsupportedFormatException, FileConversionException, DocumentConverterResult
 
 __all__ = (
     "AbstractMarkdownBrowser",
@@ -12,8 +11,4 @@
     "PlaywrightMarkdownBrowser",
     "AbstractMarkdownSearch",
     "BingMarkdownSearch",
-    "MarkdownConverter",
-    "UnsupportedFormatException",
-    "FileConversionException",
-    "DocumentConverterResult",
 )
diff --git a/autogen/browser_utils/markdown_search.py b/autogen/browser_utils/markdown_search.py
@@ -5,7 +5,6 @@
 import logging
 import os
 
-from bs4 import BeautifulSoup
 from typing import Any, Dict, List, Optional, Union, Tuple
 from urllib.parse import urlparse, quote, quote_plus, unquote, urlunparse, parse_qs
 from abc import ABC, abstractmethod

diff --git a/autogen/screen_parsing/README.md b/autogen/screen_parsing/README.md
@@ -0,0 +1,8 @@
+This is the screen parsing module for the autogen multimodal web surfer (`autogen/agentchat/contrib/multimodal_web_surfer`).
+
+# Get Started
+```pip install -e .```
+
+# Run tests
+```pip install -e .[test]```
+```cd test && pytest .```
diff --git a/autogen/screen_parsing/__init__.py b/autogen/screen_parsing/__init__.py