Skip to content

Commit

Permalink
fix(docx): fix short-row DOCX table (#2943)
Browse files Browse the repository at this point in the history
**Summary**
The DOCX format allows a table row to start late and/or end early,
meaning cells at the beginning or end of a row can be omitted. While
there are legitimate uses for this capability, using it in practice is
relatively rare. However, it can happen unintentionally when adjusting
cell borders with the mouse. Accommodate this case and generate accurate
`.text` and `.metadata.text_as_html` for these tables.
  • Loading branch information
scanny committed May 2, 2024
1 parent eff84af commit 601594d
Show file tree
Hide file tree
Showing 32 changed files with 157 additions and 340 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.7-dev1
## 0.13.7-dev2

### Enhancements

Expand All @@ -8,6 +8,8 @@

### Fixes

* **`partition_docx()` handles short table rows.** The DOCX format allows a table row to start late and/or end early, meaning cells at the beginning or end of a row can be omitted. While there are legitimate uses for this capability, using it in practice is relatively rare. However, it can happen unintentionally when adjusting cell borders with the mouse. Accommodate this case and generate accurate `.text` and `.metadata.text_as_html` for these tables.

## 0.13.6

### Enhancements
Expand Down
Binary file added example-docs/tables-with-incomplete-rows.docx
Binary file not shown.
136 changes: 129 additions & 7 deletions test_unstructured/partition/docx/test_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
Address,
CompositeElement,
Element,
ElementType,
Footer,
Header,
ListItem,
Expand Down Expand Up @@ -132,6 +131,133 @@ def but_the_text_of_a_merged_cell_appears_only_once(self):
table = docx.Document(example_doc_path("docx-tables.docx")).tables[2]
assert " ".join(_DocxPartitioner()._iter_table_texts(table)) == "a b c d e"

def it_can_partition_tables_with_incomplete_rows(self):
    """DOCX permits table rows to start late and end early.

    It is relatively rare in the wild, but DOCX tables are unique (as far as I know) in that
    they allow rows to start late, like in column 3, and end early, like the last cell is in
    column 5 of a 7 column table.

    A practical example might look like this:

                   +------+------+
                   | East | West |
        +----------+------+------+
        | Started  |  25  |  32  |
        +----------+------+------+
        | Finished |  17  |  21  |
        +----------+------+------+

    Each table in the example document is checked in document order for both its `.text`
    and its `.metadata.text_as_html`.
    """
    # NOTE(review): the cell padding inside the expected HTML literals below looks
    # whitespace-collapsed (cells in the same column differ in width, e.g. "Data " vs
    # "Strange") -- confirm against the partitioner's actual output before relying on them.

    # -- elements are consumed strictly in order, so each `next()` advances to the next case --
    elements = iter(partition_docx(example_doc_path("tables-with-incomplete-rows.docx")))

    # -- a leading text element precedes the tables --
    e = next(elements)
    assert e.text.startswith("Example of DOCX table ")
    # -- baseline: every row is complete --
    # ┌───┬───┐
    # │ a │ b │
    # ├───┼───┤
    # │ c │ d │
    # └───┴───┘
    e = next(elements)
    assert type(e).__name__ == "Table"
    assert e.text == "a b c d"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n<tr><th>a </th><th>b </th></tr>\n</thead>\n"
        "<tbody>\n<tr><td>c </td><td>d </td></tr>\n</tbody>\n"
        "</table>"
    )
    # -- first row ends early; the omitted cell is expected as an empty cell in the HTML --
    # ┌───┐
    # │ a │
    # ├───┼───┐
    # │ b │ c │
    # └───┴───┘
    e = next(elements)
    assert type(e).__name__ == "Table"
    assert e.text == "a b c", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n<tr><th>a </th><th> </th></tr>\n</thead>\n"
        "<tbody>\n<tr><td>b </td><td>c </td></tr>\n</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"
    # -- first row is a single wide cell that ends early; `a` is expected once in `.text` but
    # -- repeated for each column it covers in the HTML
    # ┌───────┐
    # │   a   │
    # ├───┬───┼───┐
    # │ b │ c │ d │
    # └───┴───┴───┘
    e = next(elements)
    assert type(e).__name__ == "Table"
    assert e.text == "a b c d", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n<tr><th>a </th><th>a </th><th> </th></tr>\n</thead>\n"
        "<tbody>\n<tr><td>b </td><td>c </td><td>d </td></tr>\n</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"
    # -- tall cell `a` covers both rows while the first row ends early --
    # ┌───┬───┐
    # │   │ b │
    # │ a ├───┼───┐
    # │   │ c │ d │
    # └───┴───┴───┘
    e = next(elements)
    assert type(e).__name__ == "Table"
    assert e.text == "a b c d", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n<tr><th>a </th><th>b </th><th> </th></tr>\n</thead>\n"
        "<tbody>\n<tr><td>a </td><td>c </td><td>d </td></tr>\n</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"
    # -- late-start, early-end, and >2 rows vertical span --
    # ┌───────┬───┬───┐
    # │   a   │ b │ c │
    # └───┬───┴───┼───┘
    #     │   d   │
    # ┌───┤       ├───┐
    # │ e │       │ f │
    # └───┤       ├───┘
    #     │       │
    #     └───────┘
    e = next(elements)
    assert type(e).__name__ == "Table"
    assert e.text == "a b c d e f", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n"
        "<tr><th>a </th><th>a </th><th>b </th><th>c </th></tr>\n"
        "</thead>\n<tbody>\n"
        "<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
        "<tr><td>e </td><td>d </td><td>d </td><td>f </td></tr>\n"
        "<tr><td> </td><td>d </td><td>d </td><td> </td></tr>\n"
        "</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"
    # --
    # -- The table from the specimen file we received with the bug report. --
    e = next(elements)
    assert type(e).__name__ == "Table"
    assert e.text == "Data More Dato WTF? Strange Format", f"actual {e.text=}"
    assert e.metadata.text_as_html == (
        "<table>\n"
        "<thead>\n"
        "<tr><th>Data </th><th>Data </th><th> </th></tr>\n"
        "</thead>\n"
        "<tbody>\n"
        "<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
        "<tr><td>Data </td><td>Data </td><td> </td></tr>\n"
        "<tr><td> </td><td>More </td><td> </td></tr>\n"
        "<tr><td>Dato </td><td> </td><td> </td></tr>\n"
        "<tr><td>WTF? </td><td>WTF? </td><td> </td></tr>\n"
        "<tr><td>Strange</td><td>Strange</td><td> </td></tr>\n"
        "<tr><td> </td><td>Format </td><td>Format</td></tr>\n"
        "</tbody>\n"
        "</table>"
    ), f"actual {e.metadata.text_as_html=}"

# -- page-break behaviors --------------------------------------------------------------------

def it_places_page_breaks_precisely_where_they_occur(self):
Expand Down Expand Up @@ -299,11 +425,7 @@ def test_parition_docx_from_team_chat():
"0:0:3.270 --> 0:0:4.250\nJames Bond\nUmm.",
"saved-by Dennis Forsythe",
]
assert [e.category for e in elements] == [
ElementType.UNCATEGORIZED_TEXT,
ElementType.UNCATEGORIZED_TEXT,
ElementType.TABLE,
]
assert [type(e) for e in elements] == [Text, Text, Table]


@pytest.mark.parametrize("infer_table_structure", [True, False])
Expand Down Expand Up @@ -687,7 +809,7 @@ def test_partition_docx_raises_TypeError_for_invalid_languages():
filename = "example-docs/handbook-1p.docx"
partition_docx(
filename=filename,
languages="eng", # pyright: ignore[reportGeneralTypeIssues]
languages="eng", # pyright: ignore[reportArgumentType]
)


Expand Down
3 changes: 0 additions & 3 deletions typings/docx/__init__.pyi

This file was deleted.

5 changes: 0 additions & 5 deletions typings/docx/api.pyi

This file was deleted.

13 changes: 0 additions & 13 deletions typings/docx/blkcntnr.pyi

This file was deleted.

28 changes: 0 additions & 28 deletions typings/docx/document.pyi

This file was deleted.

1 change: 0 additions & 1 deletion typings/docx/drawing.pyi

This file was deleted.

11 changes: 0 additions & 11 deletions typings/docx/enum/section.pyi

This file was deleted.

7 changes: 0 additions & 7 deletions typings/docx/oxml/__init__.pyi

This file was deleted.

10 changes: 0 additions & 10 deletions typings/docx/oxml/document.pyi

This file was deleted.

5 changes: 0 additions & 5 deletions typings/docx/oxml/ns.pyi

This file was deleted.

7 changes: 0 additions & 7 deletions typings/docx/oxml/section.pyi

This file was deleted.

16 changes: 0 additions & 16 deletions typings/docx/oxml/table.pyi

This file was deleted.

9 changes: 0 additions & 9 deletions typings/docx/oxml/text/hyperlink.pyi

This file was deleted.

3 changes: 0 additions & 3 deletions typings/docx/oxml/text/pagebreak.pyi

This file was deleted.

3 changes: 0 additions & 3 deletions typings/docx/oxml/text/paragraph.pyi

This file was deleted.

4 changes: 0 additions & 4 deletions typings/docx/oxml/text/parfmt.pyi

This file was deleted.

30 changes: 0 additions & 30 deletions typings/docx/oxml/text/run.pyi

This file was deleted.

17 changes: 0 additions & 17 deletions typings/docx/oxml/xmlchemy.pyi

This file was deleted.

0 comments on commit 601594d

Please sign in to comment.