Skip to content

Commit

Permalink
Add caption markup to table.
Browse files Browse the repository at this point in the history
  • Loading branch information
Asad Hasan committed Mar 11, 2024
1 parent 8c6d2c7 commit 681740b
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 4 deletions.
20 changes: 18 additions & 2 deletions unstructured/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
LIST_TAGS: Final[List[str]] = ["ul", "ol", "dl"]
HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr"]
TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr", CAPTION_TAG]
TEXTBREAK_TAGS: Final[List[str]] = ["br"]
PAGEBREAK_TAGS: Final[List[str]] = ["hr"]
EMPTY_TAGS: Final[List[str]] = PAGEBREAK_TAGS + TEXTBREAK_TAGS
Expand Down Expand Up @@ -340,13 +340,28 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
# -- for the _cell_ containing the table (and this is recursive, so a table nested within a
# -- cell within the table within the cell too.)

table = cast(
List[etree._Element], table_elem.xpath(".")
)

trs = cast(
List[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
)

if not trs:
return None

def iter_caption_texts(table: etree._Element) -> Iterator[str]:
"""Generate the caption texts of the table."""
# -- select only `<caption>` elements that are direct children of `table`; the relative
# -- XPath "./caption" does not descend into cells or nested tables --
captions = cast(List[etree._Element], table.xpath("./caption"))
for caption in captions:
# -- a caption can contain other elements like spans etc. so we can't count on the text
# -- being directly below the `<caption>` element. `.itertext()` gets all of it recursively.
# -- Strip each text node and drop the ones left empty (whitespace that results from HTML
# -- formatting), then join the remainder with single spaces. One string is yielded per
# -- caption; a caption with no non-whitespace text yields "".
stripped_text_nodes = (t.strip() for t in cast(Iterator[str], caption.itertext()))
yield " ".join(t for t in stripped_text_nodes if t)

def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
"""Generate the text of each cell in `tr`."""
# -- a cell can be either a "data" cell (td) or a "heading" cell (th) --
Expand All @@ -359,7 +374,8 @@ def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
yield " ".join(t for t in stripped_text_nodes if t)

table_data = [list(iter_cell_texts(tr)) for tr in trs]
html_table = htmlify_matrix_of_cell_texts(table_data)
caption_data = [list(iter_caption_texts(t)) for t in table]
html_table = htmlify_matrix_of_cell_texts(table_data, caption_data)
table_text = " ".join(" ".join(t for t in row if t) for row in table_data).strip()

if table_text == "":
Expand Down
11 changes: 9 additions & 2 deletions unstructured/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
_P = ParamSpec("_P")


def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]], table_captions: Sequence[Sequence[str]]) -> str:
"""Form an HTML table from "rows" and "columns" of `matrix`.
Character overhead is minimized:
Expand All @@ -50,6 +50,13 @@ def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
would be misleading.
"""

def iter_captions(table_captions: Sequence[Sequence[str]]) -> Iterator[str]:
    """Generate one `<caption>...</caption>` string per caption text.

    `table_captions` is a sequence of caption-text sequences. Every text in every inner
    sequence produces exactly one caption element, in the order encountered; an empty outer
    or inner sequence produces nothing.
    """
    for caption_texts in table_captions:
        for caption_text in caption_texts:
            # -- escape first so markup characters in the text cannot break the table HTML --
            escaped = html.escape(caption_text)
            # -- line-breaks are inserted after escaping so `<br/>` survives as real markup --
            with_breaks = "<br/>".join(escaped.split("\n"))
            # -- strip leading and trailing whitespace, wrap it up and go --
            yield f"<caption>{with_breaks.strip()}</caption>"

def iter_trs(rows_of_cell_strs: Sequence[Sequence[str]]) -> Iterator[str]:
for row_cell_strs in rows_of_cell_strs:
# -- suppress emission of rows with no cells --
Expand All @@ -66,7 +73,7 @@ def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]:
# -- strip leading and trailing whitespace, wrap it up and go --
yield f"<td>{s.strip()}</td>"

return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""
return f"<table>{''.join(iter_captions(table_captions))}{''.join(iter_trs(matrix))}</table>" if matrix else ""


class lazyproperty(Generic[_T]):
Expand Down

0 comments on commit 681740b

Please sign in to comment.