Skip to content

Commit

Permalink
Add table caption support.
Browse files (browse the repository at this point in the history)
  • Loading branch information
Asad Hasan committed Mar 4, 2024
1 parent 3783b44 commit f522730
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions unstructured/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
)
from unstructured.utils import htmlify_matrix_of_cell_texts

- TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font"]
+ TEXT_TAGS: Final[List[str]] = ["p", "a", "td", "span", "font", "caption"]
LIST_ITEM_TAGS: Final[List[str]] = ["li", "dd"]
LIST_TAGS: Final[List[str]] = ["ul", "ol", "dl"]
HEADING_TAGS: Final[List[str]] = ["h1", "h2", "h3", "h4", "h5", "h6"]
Expand Down Expand Up @@ -159,7 +159,7 @@ def _parse_pages_from_element_tree(self) -> List[Page]:
for article in articles:
descendanttag_elems: Tuple[etree._Element, ...] = ()
for tag_elem in article.iter():
- if tag_elem in descendanttag_elems:
+ if tag_elem in descendanttag_elems and tag_elem.tag != "caption":
# Prevent repeating something that's been flagged as text as we chase it
# down a chain
continue
Expand Down

0 comments on commit f522730

Please sign in to comment.