Add caption support to tables

Unstructured-IO · Mar 15, 2024 · cfffad4
1 parent 3783b44
commit cfffad4
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 24 deletions.
diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py
@@ -24,7 +24,7 @@
     NarrativeText,
     Table,
     Text,
-    Title,
+    Title
 )
 from unstructured.documents.xml import VALID_PARSERS, XMLDocument
 from unstructured.logger import logger
@@ -37,11 +37,14 @@
 )
 from unstructured.utils import htmlify_matrix_of_cell_texts
 
+CAPTION_TAG: str = "caption"
+CAPTION_TAG_START: str = f"<{CAPTION_TAG}>"
+CAPTION_TAG_END: str = f"</{CAPTION_TAG}>"
 TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font"]
 LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
 LIST_TAGS: Final[List[str]] = ["ul", "ol", "dl"]
 HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
-TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr"]
+TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr", CAPTION_TAG]
 TEXTBREAK_TAGS: Final[List[str]] = ["br"]
 PAGEBREAK_TAGS: Final[List[str]] = ["hr"]
 EMPTY_TAGS: Final[List[str]] = PAGEBREAK_TAGS + TEXTBREAK_TAGS
@@ -103,7 +106,6 @@ class HTMLListItem(TagsMixin, ListItem):
 class HTMLTable(TagsMixin, Table):
     """NarrativeText with tag information"""
 
-
 def has_table_ancestor(element: TagsMixin) -> bool:
     """Checks to see if an element has ancestors that are table elements. If so, we consider
     it to be a table element rather than a section of narrative text."""
@@ -337,7 +339,7 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
     # -- cell within the table within the cell too.)
 
     trs = cast(
-        List[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
+        List[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr | ./caption")
     )
 
     if not trs:
@@ -346,17 +348,21 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
     def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
         """Generate the text of each cell in `tr`."""
         # -- a cell can be either a "data" cell (td) or a "heading" cell (th) --
-        tds = cast(List[etree._Element], tr.xpath("./td | ./th"))
-        for td in tds:
-            # -- a cell can contain other elements like spans etc. so we can't count on the text
-            # -- being directly below the `<td>` element. `.itertext()` gets all of it recursively.
-            # -- Filter out whitespace text nodes that result from HTML formatting.
-            stripped_text_nodes = (t.strip() for t in cast(Iterator[str], td.itertext()))
-            yield " ".join(t for t in stripped_text_nodes if t)
+        if tr.tag == "caption":
+            stripped_text_nodes = (t.strip() for t in cast(Iterator[str], tr.itertext()))
+            yield " ".join(CAPTION_TAG_START+t+CAPTION_TAG_END for t in stripped_text_nodes if t)
+        else:
+            tds = cast(List[etree._Element], tr.xpath("./td | ./th"))
+            for td in tds:
+                # -- a cell can contain other elements like spans etc. so we can't count on the text
+                # -- being directly below the `<td>` element. `.itertext()` gets all of it recursively.
+                # -- Filter out whitespace text nodes that result from HTML formatting.
+                stripped_text_nodes = (t.strip() for t in cast(Iterator[str], td.itertext()))
+                yield " ".join(t for t in stripped_text_nodes if t)
 
     table_data = [list(iter_cell_texts(tr)) for tr in trs]
-    html_table = htmlify_matrix_of_cell_texts(table_data)
-    table_text = " ".join(" ".join(t for t in row if t) for row in table_data).strip()
+    html_table = htmlify_matrix_of_cell_texts(table_data, CAPTION_TAG_START)
+    table_text = " ".join(" ".join(t.replace(CAPTION_TAG_START, "\n").replace(CAPTION_TAG_END, "") for t in row if t) for row in table_data).strip()
 
     if table_text == "":
         return None

diff --git a/unstructured/utils.py b/unstructured/utils.py
@@ -39,7 +39,7 @@
 _P = ParamSpec("_P")
 
 
-def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
+def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]], CAPTION_TAG_START: str) -> str:
     """Form an HTML table from "rows" and "columns" of `matrix`.
 
     Character overhead is minimized:
@@ -55,16 +55,22 @@ def iter_trs(rows_of_cell_strs: Sequence[Sequence[str]]) -> Iterator[str]:
             # -- suppress emission of rows with no cells --
             if not row_cell_strs:
                 continue
-            yield f"<tr>{''.join(iter_tds(row_cell_strs))}</tr>"
-
-    def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]:
-        for s in row_cell_strs:
-            # -- take care of things like '<' and '>' in the text --
-            s = html.escape(s)
-            # -- substitute <br/> elements for line-feeds in the text --
-            s = "<br/>".join(s.split("\n"))
-            # -- strip leading and trailing whitespace, wrap it up and go --
-            yield f"<td>{s.strip()}</td>"
+            tds = ""
+            for s in row_cell_strs:
+                if s.find(CAPTION_TAG_START) == 0:
+                    yield s
+                else:
+                    tds += iter_tds(s)
+            if tds != "":
+                yield f"<tr>{tds}</tr>"
+
+    def iter_tds(s: str) -> str:
+        # -- take care of things like '<' and '>' in the text --
+        s = html.escape(s)
+        # -- substitute <br/> elements for line-feeds in the text --
+        s = "<br/>".join(s.split("\n"))
+        # -- strip leading and trailing whitespace, wrap it up and go --
+        return f"<td>{s.strip()}</td>"
 
     return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""