Skip to content

Commit

Permalink
Add calculation of table related metrics based on table_as_cells (#2898)
Browse files Browse the repository at this point in the history
This pull request adds metrics that are calculated based on
table_as_cells instead of text_as_html. This change is required for
comprehensive metrics calculation, as previously every predicted colspan
or rowspan was considered an incorrect prediction (even if it was a
correct prediction).

This change has to be merged after
#2892, which
introduces the table_as_cells field.
  • Loading branch information
plutasnyy committed May 7, 2024
1 parent 0cd07d7 commit 4397dd6
Show file tree
Hide file tree
Showing 9 changed files with 296 additions and 34 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
## 0.13.7-dev7
## 0.13.7-dev8

### Enhancements

* **Remove `page_number` metadata fields** for HTML partition until we have a better strategy to decide page counting.
* **Extract OCRAgent.get_agent().** Generalize access to the configured OCRAgent instance beyond its use for PDFs.
* **Add calculation of table-related metrics which take into account colspans and rowspans**

### Features

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,8 @@
"eng"
],
"page_number": 2,
"text_as_html": "<table><thead><th>1</th><th>Marketplace identifier</th><th>2 Marketplace-assigned policy number</th><th colspan=\"3\">3 Policy issuer’s name</th></thead><thead><th>4</th><th>Recipient’s name</th><th></th><th>5 Recipient’s SSN</th><th>6</th><th>Recipient’s date of birth</th></thead><tr><td>7</td><td>Recipient’s spouse’s name</td><td></td><td>8 Recipient’s spouse’s SSN</td><td>9</td><td>Recipient’s spouse’s date of birth</td></tr><tr><td>10</td><td>Policy start date</td><td>11 Policy termination date</td><td colspan=\"3\">12 Street address (including apartment no.)</td></tr><tr><td>13</td><td>City or town</td><td>14 State or province</td><td colspan=\"3\">15 Country and ZIP or foreign postal code</td></tr></table>"
"text_as_html": "<table><thead><th>1</th><th>Marketplace identifier</th><th>2 Marketplace-assigned policy number</th><th colspan=\"3\">3 Policy issuer’s name</th></thead><thead><th>4</th><th>Recipient’s name</th><th></th><th>5 Recipient’s SSN</th><th>6</th><th>Recipient’s date of birth</th></thead><tr><td>7</td><td>Recipient’s spouse’s name</td><td></td><td>8 Recipient’s spouse’s SSN</td><td>9</td><td>Recipient’s spouse’s date of birth</td></tr><tr><td>10</td><td>Policy start date</td><td>11 Policy termination date</td><td colspan=\"3\">12 Street address (including apartment no.)</td></tr><tr><td>13</td><td>City or town</td><td>14 State or province</td><td colspan=\"3\">15 Country and ZIP or foreign postal code</td></tr></table>",
"table_as_cells": [{"x": 0, "y": 0, "w": 1, "h": 1, "content": "1"}, {"x": 0, "y": 1, "w": 1, "h": 1, "content": "4"}, {"x": 0, "y": 2, "w": 1, "h": 1, "content": "7"}, {"x": 0, "y": 3, "w": 1, "h": 1, "content": "10"}, {"x": 0, "y": 4, "w": 1, "h": 1, "content": "13"}, {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Marketplace identifier"}, {"x": 1, "y": 1, "w": 1, "h": 1, "content": "Recipient\u2019s name"}, {"x": 1, "y": 2, "w": 1, "h": 1, "content": "Recipient\u2019s spouse\u2019s name"}, {"x": 1, "y": 3, "w": 1, "h": 1, "content": "Policy start date"}, {"x": 1, "y": 4, "w": 1, "h": 1, "content": "City or town"}, {"x": 2, "y": 0, "w": 1, "h": 1, "content": "2 Marketplace-assigned policy number"}, {"x": 2, "y": 1, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 2, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 3, "w": 1, "h": 1, "content": "11 Policy termination date"}, {"x": 2, "y": 4, "w": 1, "h": 1, "content": "14 State or province"}, {"x": 3, "y": 1, "w": 1, "h": 1, "content": "5 Recipient\u2019s SSN"}, {"x": 3, "y": 2, "w": 1, "h": 1, "content": "8 Recipient\u2019s spouse\u2019s SSN"}, {"x": 4, "y": 1, "w": 1, "h": 1, "content": "6"}, {"x": 4, "y": 2, "w": 1, "h": 1, "content": "9"}, {"x": 5, "y": 1, "w": 1, "h": 1, "content": "Recipient\u2019s date of birth"}, {"x": 5, "y": 2, "w": 1, "h": 1, "content": "Recipient\u2019s spouse\u2019s date of birth"}, {"x": 3, "y": 3, "w": 3, "h": 1, "content": "12 Street address (including apartment no.)"}, {"x": 3, "y": 4, "w": 3, "h": 1, "content": "15 Country and ZIP or foreign postal code"}, {"x": 3, "y": 0, "w": 3, "h": 1, "content": "3 Policy issuer\u2019s name"}]
},
"text": "1 Marketplace identifier 2 Marketplace-assigned policy number 3 Policy issuer’s name 4 Recipient’s name 5 Recipient’s SSN 6 Recipient’s date of birth 7 Recipient’s spouse’s name 8 Recipient’s spouse’s SSN 9 Recipient’s spouse’s date of birth 10 Policy start date 11 Policy termination date 12 Street address (including apartment no.) 13 City or town 14 State or province 15 Country and ZIP or foreign postal code",
"type": "Table"
Expand Down Expand Up @@ -304,7 +305,8 @@
"eng"
],
"page_number": 2,
"text_as_html": "<table><thead><th>A. Covered individual name</th><th>B. Covered individual SSN</th><th>C. Covered individual date of birth</th><th>D. Coverage start date</th><th>E. Coverage termination date</th></thead><tr><td colspan=\"5\">16</td></tr><tr><td colspan=\"5\">17</td></tr><tr><td>18</td><td></td><td></td><td></td><td></td></tr><tr><td colspan=\"5\">19</td></tr><tr><td>20</td><td></td><td></td><td></td><td></td></tr></table>"
"text_as_html": "<table><thead><th>A. Covered individual name</th><th>B. Covered individual SSN</th><th>C. Covered individual date of birth</th><th>D. Coverage start date</th><th>E. Coverage termination date</th></thead><tr><td colspan=\"5\">16</td></tr><tr><td colspan=\"5\">17</td></tr><tr><td>18</td><td></td><td></td><td></td><td></td></tr><tr><td colspan=\"5\">19</td></tr><tr><td>20</td><td></td><td></td><td></td><td></td></tr></table>",
"table_as_cells": [{"x": 0, "y": 0, "w": 1, "h": 1, "content": "A. Covered individual name"}, {"x": 0, "y": 3, "w": 1, "h": 1, "content": "18"}, {"x": 0, "y": 5, "w": 1, "h": 1, "content": "20"}, {"x": 1, "y": 0, "w": 1, "h": 1, "content": "B. Covered individual SSN"}, {"x": 1, "y": 3, "w": 1, "h": 1, "content": ""}, {"x": 1, "y": 5, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 0, "w": 1, "h": 1, "content": "C. Covered individual date of birth"}, {"x": 2, "y": 3, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 5, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 0, "w": 1, "h": 1, "content": "D. Coverage start date"}, {"x": 3, "y": 3, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 5, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 0, "w": 1, "h": 1, "content": "E. Coverage termination date"}, {"x": 4, "y": 3, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 5, "w": 1, "h": 1, "content": ""}, {"x": 0, "y": 1, "w": 5, "h": 1, "content": "16"}, {"x": 0, "y": 4, "w": 5, "h": 1, "content": "19"}, {"x": 0, "y": 2, "w": 5, "h": 1, "content": "17"}]
},
"text": "A. Covered individual name B. Covered individual SSN C. Covered individual date of birth D. Coverage start date E. Coverage termination date 16 17 18 19 20",
"type": "Table"
Expand Down Expand Up @@ -349,7 +351,8 @@
"eng"
],
"page_number": 2,
"text_as_html": "<table><thead><th></th><th>Month A.</th><th>Monthly enrollment</th><th>premiums B. Monthly second lowest cost silver plan (SLCSP) premium</th><th>C. Monthly advance payment of premium tax credit</th></thead><tr><td>22</td><td>February</td><td></td><td></td><td></td></tr><tr><td>23</td><td>March</td><td></td><td></td><td></td></tr><tr><td>24</td><td>April</td><td></td><td></td><td></td></tr><tr><td>25</td><td>May</td><td></td><td></td><td></td></tr><tr><td>26</td><td>June</td><td></td><td></td><td></td></tr><tr><td>27</td><td>July</td><td></td><td></td><td></td></tr><tr><td>28</td><td>August</td><td></td><td></td><td></td></tr><tr><td>29</td><td>September</td><td></td><td></td><td></td></tr><tr><td>30</td><td>October</td><td></td><td></td><td></td></tr><tr><td>31</td><td>November</td><td></td><td></td><td></td></tr><tr><td>32</td><td>December</td><td></td><td></td><td></td></tr></table>"
"text_as_html": "<table><thead><th></th><th>Month A.</th><th>Monthly enrollment</th><th>premiums B. Monthly second lowest cost silver plan (SLCSP) premium</th><th>C. Monthly advance payment of premium tax credit</th></thead><tr><td>22</td><td>February</td><td></td><td></td><td></td></tr><tr><td>23</td><td>March</td><td></td><td></td><td></td></tr><tr><td>24</td><td>April</td><td></td><td></td><td></td></tr><tr><td>25</td><td>May</td><td></td><td></td><td></td></tr><tr><td>26</td><td>June</td><td></td><td></td><td></td></tr><tr><td>27</td><td>July</td><td></td><td></td><td></td></tr><tr><td>28</td><td>August</td><td></td><td></td><td></td></tr><tr><td>29</td><td>September</td><td></td><td></td><td></td></tr><tr><td>30</td><td>October</td><td></td><td></td><td></td></tr><tr><td>31</td><td>November</td><td></td><td></td><td></td></tr><tr><td>32</td><td>December</td><td></td><td></td><td></td></tr></table>",
"table_as_cells": [{"x": 0, "y": 0, "w": 1, "h": 1, "content": ""}, {"x": 0, "y": 1, "w": 1, "h": 1, "content": "22"}, {"x": 0, "y": 2, "w": 1, "h": 1, "content": "23"}, {"x": 0, "y": 3, "w": 1, "h": 1, "content": "24"}, {"x": 0, "y": 4, "w": 1, "h": 1, "content": "25"}, {"x": 0, "y": 5, "w": 1, "h": 1, "content": "26"}, {"x": 0, "y": 6, "w": 1, "h": 1, "content": "27"}, {"x": 0, "y": 7, "w": 1, "h": 1, "content": "28"}, {"x": 0, "y": 8, "w": 1, "h": 1, "content": "29"}, {"x": 0, "y": 9, "w": 1, "h": 1, "content": "30"}, {"x": 0, "y": 10, "w": 1, "h": 1, "content": "31"}, {"x": 0, "y": 11, "w": 1, "h": 1, "content": "32"}, {"x": 1, "y": 0, "w": 1, "h": 1, "content": "Month A."}, {"x": 1, "y": 1, "w": 1, "h": 1, "content": "February"}, {"x": 1, "y": 2, "w": 1, "h": 1, "content": "March"}, {"x": 1, "y": 3, "w": 1, "h": 1, "content": "April"}, {"x": 1, "y": 4, "w": 1, "h": 1, "content": "May"}, {"x": 1, "y": 5, "w": 1, "h": 1, "content": "June"}, {"x": 1, "y": 6, "w": 1, "h": 1, "content": "July"}, {"x": 1, "y": 7, "w": 1, "h": 1, "content": "August"}, {"x": 1, "y": 8, "w": 1, "h": 1, "content": "September"}, {"x": 1, "y": 9, "w": 1, "h": 1, "content": "October"}, {"x": 1, "y": 10, "w": 1, "h": 1, "content": "November"}, {"x": 1, "y": 11, "w": 1, "h": 1, "content": "December"}, {"x": 2, "y": 0, "w": 1, "h": 1, "content": "Monthly enrollment"}, {"x": 2, "y": 1, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 2, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 3, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 4, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 5, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 6, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 7, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 8, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 9, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 10, "w": 1, "h": 1, "content": ""}, {"x": 2, "y": 11, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 0, "w": 1, "h": 1, "content": "premiums B. 
Monthly second lowest cost silver plan (SLCSP) premium"}, {"x": 3, "y": 1, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 2, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 3, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 4, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 5, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 6, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 7, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 8, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 9, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 10, "w": 1, "h": 1, "content": ""}, {"x": 3, "y": 11, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 0, "w": 1, "h": 1, "content": "C. Monthly advance payment of premium tax credit"}, {"x": 4, "y": 1, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 2, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 3, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 4, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 5, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 6, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 7, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 8, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 9, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 10, "w": 1, "h": 1, "content": ""}, {"x": 4, "y": 11, "w": 1, "h": 1, "content": ""}]
},
"text": "Month A. Monthly enrollment premiums B. Monthly second lowest cost silver plan (SLCSP) premium C. Monthly advance payment of premium tax credit 21 January 22 February 23 March 24 April 25 May 26 June 27 July 28 August 29 September 30 October 31 November 32 December",
"type": "Table"
Expand Down
2 changes: 1 addition & 1 deletion test_unstructured/metrics/test_evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,7 @@ def test_table_structure_evaluation():
assert os.path.isfile(os.path.join(export_dir, "aggregate-table-structure-accuracy.tsv"))
df = pd.read_csv(os.path.join(export_dir, "all-docs-table-structure-accuracy.tsv"), sep="\t")
assert len(df) == 1
assert len(df.columns) == 10
assert len(df.columns) == 17
assert df.iloc[0].filename == "IRS-2023-Form-1095-A.pdf"


Expand Down
109 changes: 109 additions & 0 deletions test_unstructured/metrics/test_table_structure.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,115 @@ def test_table_eval_processor_simple():
assert result.element_col_level_content_acc == 1.0


def test_table_eval_processor_simple_when_input_as_cells():
    """Check that a 2x2 table supplied as cells scores 1.0 on every accuracy metric.

    The predicted cells are listed in a different order than the ground-truth
    cells; both describe the same four single-span cells.
    """

    def unit_cell(x, y, content, cell_id=None):
        # Build a 1x1 cell dict; ground-truth cells additionally carry an "id".
        cell = {} if cell_id is None else {"id": cell_id}
        cell.update({"x": x, "y": y, "w": 1, "h": 1, "content": content})
        return cell

    predicted_cells = [
        unit_cell(1, 1, "r2c2"),
        unit_cell(0, 0, "r1c1"),
        unit_cell(0, 1, "r2c1"),
        unit_cell(1, 0, "r1c2"),
    ]
    expected_cells = [
        unit_cell(0, 0, "r1c1", "ee862c7a-d27e-4484-92de-4faa42a63f3b"),
        unit_cell(0, 1, "r2c1", "6237ac7b-bfc8-40d2-92f2-d138277205e2"),
        unit_cell(1, 0, "r1c2", "9d0933a9-5984-4cad-80d9-6752bf9bc4df"),
        unit_cell(1, 1, "r2c2", "1152d043-5ead-4ab8-8b88-888d48831ac2"),
    ]

    predicted = [{"type": "Table", "metadata": {"table_as_cells": predicted_cells}}]
    expected = [{"type": "Table", "text": expected_cells}]

    report = TableEvalProcessor(predicted, expected, source_type="cells").process_file()

    assert report.total_tables == 1
    for metric_name in (
        "table_level_acc",
        "element_row_level_index_acc",
        "element_col_level_index_acc",
        "element_row_level_content_acc",
        "element_col_level_content_acc",
    ):
        assert getattr(report, metric_name) == 1.0


def test_table_eval_processor_when_wrong_source_type():
    """Check that processing with an unsupported ``source_type`` raises ``ValueError``."""
    predicted = [{"type": "Table", "metadata": {"table_as_cells": []}}]
    expected = [{"type": "Table", "text": []}]

    # The processor is built outside the ``raises`` context, so only
    # ``process_file`` is allowed to produce the expected error.
    processor = TableEvalProcessor(predicted, expected, source_type="wrong_type")

    with pytest.raises(ValueError):
        processor.process_file()


@pytest.mark.parametrize(
"text_as_html",
[
Expand Down
32 changes: 31 additions & 1 deletion test_unstructured/metrics/test_text_extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,10 @@
import pytest

from unstructured.metrics import text_extraction
from unstructured.metrics.table.table_extraction import (
extract_cells_from_table_as_cells,
extract_cells_from_text_as_html,
)
from unstructured.partition.auto import partition


Expand Down Expand Up @@ -155,7 +159,7 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
),
(
"""Sometimes sentences have a dash - like this one!
A hyphen connects 2 words with no gap: easy-peasy.""",
A hyphen connects 2 words with no gap: easy-peasy.""",
{
"sometimes": 1,
"sentences": 1,
Expand Down Expand Up @@ -216,3 +220,29 @@ def test_calculate_percent_missing_text(output_text, source_text, expected_perce
text_extraction.calculate_percent_missing_text(output_text, source_text)
== expected_percentage
)


def test_cells_extraction_from_prediction_when_simple_example():
    """Check that both extractors yield the same indexed cells for a one-column table."""
    column_contents = ["Month A.", "22"]
    element = {
        "type": "Table",
        "metadata": {
            "text_as_html": "<table><thead><th>Month A.</th></thead><tr><td>22</td></tr></table>",
            "table_as_cells": [
                {"x": 0, "y": row, "w": 1, "h": 1, "content": text}
                for row, text in enumerate(column_contents)
            ],
        },
    }
    expected_cells = [
        {"row_index": row, "col_index": 0, "content": text}
        for row, text in enumerate(column_contents)
    ]

    # The HTML-based extractor is exercised first, then the cells-based one.
    for extract in (extract_cells_from_text_as_html, extract_cells_from_table_as_cells):
        assert extract(element) == expected_cells


def test_cells_extraction_from_prediction_when_missing_prediction():
    """Check that both extractors return ``None`` for empty HTML and an empty cell list."""
    empty_metadata = {"text_as_html": "", "table_as_cells": []}
    element = {"type": "Table", "metadata": empty_metadata}

    for extract in (extract_cells_from_text_as_html, extract_cells_from_table_as_cells):
        assert extract(element) is None
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.7-dev7" # pragma: no cover
__version__ = "0.13.7-dev8" # pragma: no cover
25 changes: 20 additions & 5 deletions unstructured/metrics/evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,26 +331,41 @@ def measure_table_structure_accuracy(
logger.warning(f"Ground truth file {ground_truth_file} does not exist, skipping")
continue

processor = TableEvalProcessor.from_json_files(
processor_from_text_as_html = TableEvalProcessor.from_json_files(
prediction_file=prediction_file,
ground_truth_file=ground_truth_file,
cutoff=cutoff,
source_type="html",
)
report = processor.process_file()
report_from_html = processor_from_text_as_html.process_file()

processor_from_table_as_cells = TableEvalProcessor.from_json_files(
prediction_file=prediction_file,
ground_truth_file=ground_truth_file,
cutoff=cutoff,
source_type="cells",
)
report_from_cells = processor_from_table_as_cells.process_file()

rows.append(
[
out_filename,
doctype,
connector,
]
+ [getattr(report, metric) for metric in table_eval_metrics]
+ [getattr(report_from_html, metric) for metric in table_eval_metrics]
+ [getattr(report_from_cells, metric) for metric in table_eval_metrics]
)

suffixed_table_eval_metrics = [f"{metric}_with_spans" for metric in table_eval_metrics]
combined_table_metrics = table_eval_metrics + suffixed_table_eval_metrics

headers = [
"filename",
"doctype",
"connector",
] + table_eval_metrics
] + combined_table_metrics

df = pd.DataFrame(rows, columns=headers)
has_tables_df = df[df["total_tables"] > 0]

Expand All @@ -360,7 +375,7 @@ def measure_table_structure_accuracy(
).reset_index()
else:
element_metrics_results = {}
for metric in table_eval_metrics:
for metric in combined_table_metrics:
metric_df = has_tables_df[has_tables_df[metric].notnull()]
agg_metric = metric_df[metric].agg([_mean, _stdev, _pstdev, _count]).transpose()
if agg_metric.empty:
Expand Down

0 comments on commit 4397dd6

Please sign in to comment.