spike: fix short-row DOCX table

Unstructured-IO · Apr 29, 2024 · 0db3eeb · 0db3eeb
1 parent 7720e72
commit 0db3eeb
Show file tree

Hide file tree

Showing 7 changed files with 294 additions and 18 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.13.6-dev0
+
+### Enhancements
+
+### Features
+
+### Fixes
+
+* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. Legitimate uses of this capability are relatively rare in practice, but it can also arise unintentionally when adjusting cell borders with the mouse. `partition_docx()` now accommodates this case and generates accurate `.text` and `.metadata.text_as_html` for these tables.
+
 ## 0.13.5
 
 ### Enhancements

diff --git a/example-docs/tables-with-incomplete-rows.docx b/example-docs/tables-with-incomplete-rows.docx
diff --git a/test_unstructured/partition/docx/test_docx.py b/test_unstructured/partition/docx/test_docx.py
@@ -16,7 +16,6 @@
     Address,
     CompositeElement,
     Element,
-    ElementType,
     Footer,
     Header,
     ListItem,
@@ -132,6 +131,133 @@ def but_the_text_of_a_merged_cell_appears_only_once(self):
         table = docx.Document(example_doc_path("docx-tables.docx")).tables[2]
         assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e"
 
+    def it_can_partition_tables_with_incomplete_rows(self):
+        """DOCX permits table rows to start late and end early.
+
+        It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that
+        they allow a row to start late (e.g. its first cell is in column 3) and to end early
+        (e.g. its last cell is in column 5 of a 7-column table).
+
+        A practical example might look like this:
+
+                       +------+------+
+                       | East | West |
+            +----------+------+------+
+            | Started  |  25  |  32  |
+            +----------+------+------+
+            | Finished |  17  |  21  |
+            +----------+------+------+
+        """
+        elements = iter(partition_docx(example_doc_path("tables-with-incomplete-rows.docx")))
+
+        e = next(elements)
+        assert e.text.startswith("Example of DOCX table ")
+        # -- baseline: complete 2 x 2 table with no short rows --
+        # ┌───┬───┐
+        # │ a │ b │
+        # ├───┼───┤
+        # │ c │ d │
+        # └───┴───┘
+        e = next(elements)
+        assert type(e).__name__ == "Table"
+        assert e.text == "a b c d"
+        assert e.metadata.text_as_html == (
+            "<table>\n"
+            "<thead>\n<tr><th>a  </th><th>b  </th></tr>\n</thead>\n"
+            "<tbody>\n<tr><td>c  </td><td>d  </td></tr>\n</tbody>\n"
+            "</table>"
+        )
+        # -- first row ends early (its second cell is omitted) --
+        # ┌───┐
+        # │ a │
+        # ├───┼───┐
+        # │ b │ c │
+        # └───┴───┘
+        e = next(elements)
+        assert type(e).__name__ == "Table"
+        assert e.text == "a b c", f"actual {e.text=}"
+        assert e.metadata.text_as_html == (
+            "<table>\n"
+            "<thead>\n<tr><th>a  </th><th>  </th></tr>\n</thead>\n"
+            "<tbody>\n<tr><td>b  </td><td>c </td></tr>\n</tbody>\n"
+            "</table>"
+        ), f"actual {e.metadata.text_as_html=}"
+        # -- first row ends early after a two-column horizontal span --
+        # ┌───────┐
+        # │   a   │
+        # ├───┬───┼───┐
+        # │ b │ c │ d │
+        # └───┴───┴───┘
+        e = next(elements)
+        assert type(e).__name__ == "Table"
+        assert e.text == "a b c d", f"actual {e.text=}"
+        assert e.metadata.text_as_html == (
+            "<table>\n"
+            "<thead>\n<tr><th>a  </th><th>a  </th><th>  </th></tr>\n</thead>\n"
+            "<tbody>\n<tr><td>b  </td><td>c  </td><td>d </td></tr>\n</tbody>\n"
+            "</table>"
+        ), f"actual {e.metadata.text_as_html=}"
+        # -- first row ends early beside a two-row vertical span --
+        # ┌───┬───┐
+        # │   │ b │
+        # │ a ├───┼───┐
+        # │   │ c │ d │
+        # └───┴───┴───┘
+        e = next(elements)
+        assert type(e).__name__ == "Table"
+        assert e.text == "a b c d", f"actual {e.text=}"
+        assert e.metadata.text_as_html == (
+            "<table>\n"
+            "<thead>\n<tr><th>a  </th><th>b  </th><th>  </th></tr>\n</thead>\n"
+            "<tbody>\n<tr><td>a  </td><td>c  </td><td>d </td></tr>\n</tbody>\n"
+            "</table>"
+        ), f"actual {e.metadata.text_as_html=}"
+        # -- late-start, early-end, and >2 rows vertical span --
+        # ┌───────┬───┬───┐
+        # │   a   │ b │ c │
+        # └───┬───┴───┼───┘
+        #     │   d   │
+        # ┌───┤       ├───┐
+        # │ e │       │ f │
+        # └───┤       ├───┘
+        #     │       │
+        #     └───────┘
+        e = next(elements)
+        assert type(e).__name__ == "Table"
+        assert e.text == "a b c d e f", f"actual {e.text=}"
+        assert e.metadata.text_as_html == (
+            "<table>\n"
+            "<thead>\n"
+            "<tr><th>a  </th><th>a  </th><th>b  </th><th>c  </th></tr>\n"
+            "</thead>\n<tbody>\n"
+            "<tr><td>   </td><td>d  </td><td>d  </td><td>   </td></tr>\n"
+            "<tr><td>e  </td><td>d  </td><td>d  </td><td>f  </td></tr>\n"
+            "<tr><td>   </td><td>d  </td><td>d  </td><td>   </td></tr>\n"
+            "</tbody>\n"
+            "</table>"
+        ), f"actual {e.metadata.text_as_html=}"
+        # --
+        # -- The table from the specimen file we received with the bug report. --
+        e = next(elements)
+        assert type(e).__name__ == "Table"
+        assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
+        assert e.metadata.text_as_html == (
+            "<table>\n"
+            "<thead>\n"
+            "<tr><th>Data   </th><th>Data   </th><th>      </th></tr>\n"
+            "</thead>\n"
+            "<tbody>\n"
+            "<tr><td>Data   </td><td>Data   </td><td>      </td></tr>\n"
+            "<tr><td>Data   </td><td>Data   </td><td>      </td></tr>\n"
+            "<tr><td>       </td><td>More   </td><td>      </td></tr>\n"
+            "<tr><td>Dato   </td><td>       </td><td>      </td></tr>\n"
+            "<tr><td>WTF?   </td><td>WTF?   </td><td>      </td></tr>\n"
+            "<tr><td>Strange</td><td>Strange</td><td>      </td></tr>\n"
+            "<tr><td>       </td><td>Format </td><td>Format</td></tr>\n"
+            "</tbody>\n"
+            "</table>"
+        ), f"actual {e.metadata.text_as_html=}"
+
     # -- page-break behaviors --------------------------------------------------------------------
 
     def it_places_page_breaks_precisely_where_they_occur(self):
@@ -299,11 +425,7 @@ def test_parition_docx_from_team_chat():
         "0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
         "saved-by Dennis Forsythe",
     ]
-    assert [e.category for e in elements] == [
-        ElementType.UNCATEGORIZED_TEXT,
-        ElementType.UNCATEGORIZED_TEXT,
-        ElementType.TABLE,
-    ]
+    assert [type(e) for e in elements] == [Text, Text, Table]
 
 
 @pytest.mark.parametrize("infer_table_structure", [True, False])
@@ -687,7 +809,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():
         filename = "example-docs/handbook-1p.docx"
         partition_docx(
             filename=filename,
-            languages="eng",  # pyright: ignore[reportGeneralTypeIssues]
+            languages="eng",  # pyright: ignore[reportArgumentType]
         )
 
 

diff --git a/typings/docx/oxml/table.pyi b/typings/docx/oxml/table.pyi
@@ -2,15 +2,23 @@
 
 from __future__ import annotations
 
-from typing import List
+from typing import Iterator
 
 from docx.oxml.xmlchemy import BaseOxmlElement
 
 class CT_Row(BaseOxmlElement):
-    tc_lst: List[CT_Tc]
+    tc_lst: list[CT_Tc]
+
+class CT_Tbl(BaseOxmlElement):
+    tr_lst: list[CT_Row]
+    def iter_tcs(self) -> Iterator[CT_Tc]: ...
 
 class CT_Tc(BaseOxmlElement):
+    @property
+    def grid_span(self) -> int: ...
     @property
     def vMerge(self) -> str | None: ...
-
-class CT_Tbl(BaseOxmlElement): ...
+    @property
+    def _tr(self) -> CT_Row: ...
+    @property
+    def _tr_above(self) -> CT_Row: ...
diff --git a/typings/docx/table.pyi b/typings/docx/table.pyi
@@ -18,10 +18,18 @@ class _Row(Parented):
     _tr: CT_Row
     @property
     def cells(self) -> Sequence[_Cell]: ...
+    @property
+    def table(self) -> Table: ...
+    @property
+    def _index(self) -> int: ...
 
 class _Rows(Sequence[_Row]): ...
 
 class Table(Parented):
+    _tbl: CT_Tbl
     def __init__(self, tbl: CT_Tbl, parent: BlockItemContainer) -> None: ...
+    def row_cells(self, row_idx: int) -> list[_Cell]: ...
     @property
     def rows(self) -> _Rows: ...
+    @property
+    def _column_count(self) -> int: ...
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.5"  # pragma: no cover
+__version__ = "0.13.6-dev0"  # pragma: no cover