Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for table caption #2611

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
32 changes: 19 additions & 13 deletions unstructured/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
NarrativeText,
Table,
Text,
Title,
Title
)
from unstructured.documents.xml import VALID_PARSERS, XMLDocument
from unstructured.logger import logger
Expand All @@ -37,11 +37,14 @@
)
from unstructured.utils import htmlify_matrix_of_cell_texts

# -- tag name of a table caption and the open/close markup built from it --
CAPTION_TAG: Final[str] = "caption"
CAPTION_TAG_START: Final[str] = f"<{CAPTION_TAG}>"
CAPTION_TAG_END: Final[str] = f"</{CAPTION_TAG}>"

TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font"]
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
LIST_TAGS: Final[List[str]] = ["ul", "ol", "dl"]
HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
# -- defined exactly once (a `Final` name must not be rebound); includes the caption tag --
TABLE_TAGS: Final[List[str]] = ["table", "tbody", "td", "tr", CAPTION_TAG]
TEXTBREAK_TAGS: Final[List[str]] = ["br"]
PAGEBREAK_TAGS: Final[List[str]] = ["hr"]
EMPTY_TAGS: Final[List[str]] = PAGEBREAK_TAGS + TEXTBREAK_TAGS
Expand Down Expand Up @@ -103,7 +106,6 @@ class HTMLListItem(TagsMixin, ListItem):
class HTMLTable(TagsMixin, Table):
    """Table with tag information"""


def has_table_ancestor(element: TagsMixin) -> bool:
"""Checks to see if an element has ancestors that are table elements. If so, we consider
it to be a table element rather than a section of narrative text."""
Expand Down Expand Up @@ -337,7 +339,7 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
# -- cell within the table within the cell too.)

trs = cast(
List[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr")
List[etree._Element], table_elem.xpath("./tr | ./thead/tr | ./tbody/tr | ./tfoot/tr | ./caption")
)

if not trs:
Expand All @@ -346,17 +348,21 @@ def _parse_HTMLTable_from_table_elem(table_elem: etree._Element) -> Optional[Ele
def iter_cell_texts(tr: etree._Element) -> Iterator[str]:
    """Generate the text of each cell in `tr`.

    `tr` is normally a table row, in which case one string is generated per `<td>`/`<th>` cell.
    When `tr` is a `<caption>` element, at most one string is generated: the caption's full
    text wrapped in `CAPTION_TAG_START`/`CAPTION_TAG_END`. A caption with no text generates
    nothing, so it does not produce an empty pseudo-cell.
    """
    if tr.tag == CAPTION_TAG:
        # -- a caption can contain nested elements, so gather all of its text recursively and
        # -- wrap the joined result once; wrapping each text node separately would emit
        # -- several caption elements for a single caption.
        stripped_text_nodes = (t.strip() for t in cast(Iterator[str], tr.itertext()))
        caption_text = " ".join(t for t in stripped_text_nodes if t)
        if caption_text:
            yield f"{CAPTION_TAG_START}{caption_text}{CAPTION_TAG_END}"
        return

    # -- a cell can be either a "data" cell (td) or a "heading" cell (th) --
    tds = cast(List[etree._Element], tr.xpath("./td | ./th"))
    for td in tds:
        # -- a cell can contain other elements like spans etc. so we can't count on the text
        # -- being directly below the `<td>` element. `.itertext()` gets all of it recursively.
        # -- Filter out whitespace text nodes that result from HTML formatting.
        stripped_text_nodes = (t.strip() for t in cast(Iterator[str], td.itertext()))
        yield " ".join(t for t in stripped_text_nodes if t)

table_data = [list(iter_cell_texts(tr)) for tr in trs]
html_table = htmlify_matrix_of_cell_texts(table_data)
table_text = " ".join(" ".join(t for t in row if t) for row in table_data).strip()
html_table = htmlify_matrix_of_cell_texts(table_data, CAPTION_TAG_START)
table_text = " ".join(" ".join(t.replace(CAPTION_TAG_START, "\n").replace(CAPTION_TAG_END, "\n") for t in row if t) for row in table_data).strip()

if table_text == "":
return None
Expand Down
28 changes: 17 additions & 11 deletions unstructured/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
_P = ParamSpec("_P")


def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]]) -> str:
def htmlify_matrix_of_cell_texts(matrix: Sequence[Sequence[str]], CAPTION_TAG_START: str) -> str:
"""Form an HTML table from "rows" and "columns" of `matrix`.
Character overhead is minimized:
Expand All @@ -55,16 +55,22 @@ def iter_trs(rows_of_cell_strs: Sequence[Sequence[str]]) -> Iterator[str]:
# -- suppress emission of rows with no cells --
if not row_cell_strs:
continue
yield f"<tr>{''.join(iter_tds(row_cell_strs))}</tr>"

def iter_tds(row_cell_strs: Sequence[str]) -> Iterator[str]:
for s in row_cell_strs:
# -- take care of things like '<' and '>' in the text --
s = html.escape(s)
# -- substitute <br/> elements for line-feeds in the text --
s = "<br/>".join(s.split("\n"))
# -- strip leading and trailing whitespace, wrap it up and go --
yield f"<td>{s.strip()}</td>"
tds = ""
for s in row_cell_strs:
if s.find(CAPTION_TAG_START) == 0:
yield s
else:
tds += iter_tds(s)
if tds != "":
yield f"<tr>{tds}</tr>"

def iter_tds(s: str) -> str:
    """Return `s` rendered as a single `<td>` element.

    The text is HTML-escaped, each line-feed is replaced by a `<br/>` element, and leading and
    trailing whitespace is stripped before the text is wrapped in `<td>...</td>`.
    """
    # -- escape first so the `<br/>` markup inserted below is not itself escaped --
    escaped = html.escape(s)
    with_breaks = escaped.replace("\n", "<br/>")
    cell_text = with_breaks.strip()
    return f"<td>{cell_text}</td>"

return f"<table>{''.join(iter_trs(matrix))}</table>" if matrix else ""

Expand Down