add read and combine json #2691

Closed
wants to merge 12 commits
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -5,6 +5,7 @@
* **Add `.metadata.is_continuation` to text-split chunks.** `.metadata.is_continuation=True` is added to second-and-later chunks formed by text-splitting an oversized `Table` element but not to their counterpart `Text` element splits. Add this indicator for `CompositeElement` to allow text-split continuation chunks to be identified for downstream processes that may wish to skip intentionally redundant metadata values in continuation chunks.
* **Add `compound_structure_acc` metric to table eval.** Add a new property to `unstructured.metrics.table_eval.TableEvaluation`: `composite_structure_acc`, which is computed from the element level row and column index and content accuracy scores
* **Add `.metadata.orig_elements` to chunks.** `.metadata.orig_elements: list[Element]` is added to chunks during the chunking process (when requested) to allow access to information from the elements each chunk was formed from. This is useful for example to recover metadata fields that cannot be consolidated to a single value for a chunk, like `page_number`, `coordinates`, and `image_base64`.
* **Add `read_and_combine_json` function.** This function reads and deserializes JSON data from multiple files in a directory and combines them into a single list of element dictionaries.
* **Add `--include_orig_elements` option to Ingest CLI.** By default, when chunking, the original elements used to form each chunk are added to `chunk.metadata.orig_elements` for each chunk. The `include_orig_elements` parameter allows the user to turn off this behavior to produce a smaller payload when they don't need this metadata.
* **Add Google VertexAI embedder.** Adds VertexAI embeddings to support embedding via Google Vertex AI.

36 changes: 35 additions & 1 deletion docs/source/core/staging.rst
@@ -25,7 +25,7 @@ We can take this data and directly upload it into LabelStudio to quickly get sta


``convert_to_csv``
----------------------
--------------------

Converts outputs to the initial structured data (ISD) format as a CSV string.

@@ -104,6 +104,40 @@ Examples:
For more information about the ``dict_to_elements`` function, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/staging/base.py>`__.


``read_and_combine_json``
--------------------------

Reads, deserializes, and combines JSON data from all ``.json`` files in a directory into a single list of element dictionaries.

Examples:

.. code:: python

    from unstructured.chunking.title import chunk_by_title
    from unstructured.staging.base import dict_to_elements, read_and_combine_json

    directory_path = "path/to/json/files"
    combined_json_data = read_and_combine_json(directory_path)
    elements = dict_to_elements(combined_json_data)
    chunks = chunk_by_title(elements)


The directory is expected to contain JSON files whose contents follow this format:

.. code::

sample_data_1 = [
{"text": "Element 1 from File 1", "type": "NarrativeText"},
{"text": "Element 2 from File 1", "type": "Title"},
]

sample_data_2 = [
{"text": "Element 1 from File 2", "type": "NarrativeText"},
{"text": "Element 2 from File 2", "type": "ListItem"},
]
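
Combining these two files yields a single flat list of element dictionaries, roughly like the following (file ordering follows ``os.listdir``, so it is not guaranteed):

.. code::

    combined_json_data = [
        {"text": "Element 1 from File 1", "type": "NarrativeText"},
        {"text": "Element 2 from File 1", "type": "Title"},
        {"text": "Element 1 from File 2", "type": "NarrativeText"},
        {"text": "Element 2 from File 2", "type": "ListItem"},
    ]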

For more information about the ``read_and_combine_json`` function, you can check the `source code here <https://github.com/Unstructured-IO/unstructured/blob/a583d47b841bdd426b9058b7c34f6aa3ed8de152/unstructured/staging/base.py>`__.


``stage_csv_for_prodigy``
--------------------------

6 changes: 3 additions & 3 deletions docs/source/examples/dict_to_elements.rst
@@ -51,8 +51,8 @@ Set up the API key and S3 URL for accessing the data.
.. code-block:: python

UNSTRUCTURED_API_KEY = os.getenv('UNSTRUCTURED_API_KEY')
S3_URL = "s3://rh-financial-reports/world-development-bank-2023/"

S3_URL = "s3://rh-financial-reports/world-development-bank-2023/"

Step 4: Python Runner
---------------------
@@ -90,8 +90,9 @@ Combine JSON files into a single dataset for further processing.

.. code-block:: python

combined_json_data = read_and_combine_json("Connector-Output/world-development-bank-2023")
from unstructured.staging.base import read_and_combine_json

combined_json_data = read_and_combine_json("Connector-Output/world-development-bank-2023")

Step 6: Convert into Unstructured Elements for Chunking
-------------------------------------------------------
@@ -103,7 +104,6 @@ Convert the combined JSON data into Unstructured Elements and apply chunking by
elements = dict_to_elements(combined_json_data)
chunks = chunk_by_title(elements)
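
    # A quick, hypothetical sanity check (output depends entirely on your input documents):
    # print each chunk's element type and the start of its text.
    for chunk in chunks[:3]:
        print(type(chunk).__name__, chunk.text[:80])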


Conclusion
**********

38 changes: 38 additions & 0 deletions test_unstructured/staging/test_base.py
@@ -82,6 +82,44 @@ def test_elements_from_dicts():
]


def test_read_and_combine_json(tmp_path: str):
sample_data_1 = [
{"text": "Element 1 from File 1", "type": "NarrativeText"},
{"text": "Element 2 from File 1", "type": "Title"},
]
sample_data_2 = [
{"text": "Element 1 from File 2", "type": "NarrativeText"},
{"text": "Element 2 from File 2", "type": "ListItem"},
]

file_path_1 = tmp_path / "sample_1.json"
file_path_2 = tmp_path / "sample_2.json"

with open(file_path_1, "w", encoding="utf-8") as f:
json.dump(sample_data_1, f)
with open(file_path_2, "w", encoding="utf-8") as f:
json.dump(sample_data_2, f)

combined_elements = base.read_and_combine_json(str(tmp_path))
assert len(combined_elements) == 4, "Expected 4 combined elements"

expected_texts = [
"Element 1 from File 1",
"Element 2 from File 1",
"Element 1 from File 2",
"Element 2 from File 2",
]

combined_texts = [element["text"] for element in combined_elements]
for expected_text in expected_texts:
assert (
expected_text in combined_texts
), f"Expected text '{expected_text}' in combined elements"


def test_convert_to_csv(tmp_path: str):
output_csv_path = os.path.join(tmp_path, "isd_data.csv")
elements = [Title(text="Title 1"), NarrativeText(text="Narrative 1")]
20 changes: 20 additions & 0 deletions unstructured/staging/base.py
@@ -4,6 +4,7 @@
import csv
import io
import json
import os
import zlib
from copy import deepcopy
from datetime import datetime
@@ -94,6 +95,25 @@ def elements_from_json(
return elements_from_dicts(element_dicts)


def read_and_combine_json(
    directory_path: str, encoding: str = "utf-8"
) -> Iterable[dict[str, Any]]:
    """Read all JSON files in `directory_path` and combine their contents into a single list.

    Each file is expected to contain a JSON array of element dictionaries; the arrays from all
    files are concatenated in the order the files are listed by `os.listdir`.
    """
    combined_elements: list[dict[str, Any]] = []

    for filename in os.listdir(directory_path):
        if filename.endswith(".json"):
            full_path = os.path.join(directory_path, filename)
            try:
                with open(full_path, encoding=encoding) as file:
                    element_dicts = json.load(file)
                    combined_elements.extend(element_dicts)
            except Exception as e:
                print(f"Error reading or parsing file {full_path}: {e}")
    return combined_elements
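
# Usage sketch (the directory path below is hypothetical): read the JSON output of several
# partitioned documents and rehydrate the combined dictionaries into Element objects with
# `elements_from_dicts` from this module.
#
#     element_dicts = read_and_combine_json("path/to/json-output")
#     elements = elements_from_dicts(element_dicts)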


# == SERIALIZERS =================================

