Skip to content

Commit

Permalink
Mark caption tags of Caption type as opposed to NarrativeText
Browse files Browse the repository at this point in the history
  • Loading branch information
Asad Hasan committed Mar 5, 2024
1 parent e915d49 commit ae81819
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 0 deletions.
5 changes: 5 additions & 0 deletions unstructured/documents/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,11 @@ class Table(Text):

category = "Table"

class Caption(Text):
    """A Text element that represents a caption.

    The class-level ``category`` is the string "Caption".
    """

    category = "Caption"


class TableChunk(Table):
"""An element for capturing chunks of tables."""
Expand Down
12 changes: 12 additions & 0 deletions unstructured/documents/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
Table,
Text,
Title,
Caption
)
from unstructured.documents.xml import VALID_PARSERS, XMLDocument
from unstructured.logger import logger
Expand Down Expand Up @@ -104,6 +105,8 @@ class HTMLListItem(TagsMixin, ListItem):
class HTMLTable(TagsMixin, Table):
    """Table with tag information"""

class HTMLCaption(TagsMixin, Caption):
    """Caption with tag information.

    Returned for text whose HTML tag equals ``CAPTION_TAG``, so that such text is
    typed as a Caption rather than as NarrativeText.
    """

def has_table_ancestor(element: TagsMixin) -> bool:
"""Checks to see if an element has ancestors that are table elements. If so, we consider
Expand Down Expand Up @@ -466,6 +469,15 @@ def _text_to_element(
links=links,
emphasized_texts=emphasized_texts,
)

if tag == CAPTION_TAG:
return HTMLCaption(
text,
tag=tag,
ancestortags=ancestortags,
links=links,
emphasized_texts=emphasized_texts,
)

if len(text) < 2:
return None
Expand Down

0 comments on commit ae81819

Please sign in to comment.