Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Support Indexing options for Astra DB columns #2919

Closed
Closed
15 changes: 15 additions & 0 deletions unstructured/ingest/cli/cmds/astra.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,25 @@ def get_cli_options() -> t.List[click.Option]:
),
click.Option(
["--embedding-dimension"],
required=True,
default=384,
type=int,
help="The dimensionality of the embeddings",
),
click.Option(
["--namespace"],
required=False,
default=None,
type=str,
help="The Astra DB namespace to write into.",
),
click.Option(
["--requested-indexing-policy"],
required=False,
default=None,
type=str,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Import `Dict` at the top:
from unstructured.ingest.cli.interfaces import CliConfig, Dict

type=Dict(),
help="The indexing policy to use for the collection."
'example: \'{"blablabla":"blablabla"}\' ',

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated!

help="The indexing policy to use for the collection.",
),
]
return options

Expand Down
29 changes: 19 additions & 10 deletions unstructured/ingest/connector/astra.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,11 @@
)
from unstructured.ingest.logger import logger
from unstructured.ingest.utils.data_prep import chunk_generator
from unstructured.staging.base import flatten_dict
from unstructured.utils import requires_dependencies

if t.TYPE_CHECKING:
from astrapy.db import AstraDB, AstraDBCollection

NON_INDEXED_FIELDS = ["metadata._node_content", "content"]


@dataclass
class AstraAccessConfig(AccessConfig):
Expand All @@ -35,6 +32,8 @@ class SimpleAstraConfig(BaseConnectorConfig):
access_config: AstraAccessConfig
collection_name: str
embedding_dimension: int
namespace: t.Optional[str] = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want them exposed to the CLI, `namespace` and `requested_indexing_policy` need to be added to:

https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/cli/cmds/astra.py

That will also be helpful, since the options will then be documented there.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done! I tried to match the data model to the "required" option for the CLI; hopefully this looks good.

requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None


@dataclass
Expand Down Expand Up @@ -69,21 +68,33 @@ def astra_db_collection(self) -> "AstraDBCollection":
if self._astra_db_collection is None:
from astrapy.db import AstraDB

# Get the collection_name and embedding dimension
collection_name = self.connector_config.collection_name
embedding_dimension = self.connector_config.embedding_dimension
requested_indexing_policy = self.connector_config.requested_indexing_policy

# If the user has requested an indexing policy, pass it to the AstraDB
if requested_indexing_policy is not None:
_options = {"indexing": requested_indexing_policy}
else:
_options = None

# Build the Astra DB object.
# caller_name/version for AstraDB tracking
self._astra_db = AstraDB(
api_endpoint=self.connector_config.access_config.api_endpoint,
token=self.connector_config.access_config.token,
namespace=self.connector_config.namespace,
caller_name=integration_name,
caller_version=integration_version,
)

# Create and connect to the newly created collection
self._astra_db_collection = self._astra_db.create_collection(
collection_name=self.connector_config.collection_name,
dimension=self.connector_config.embedding_dimension,
options={"indexing": {"deny": NON_INDEXED_FIELDS}},
collection_name=collection_name,
dimension=embedding_dimension,
options=_options,
)

return self._astra_db_collection

@requires_dependencies(["astrapy"], extras="astra")
Expand Down Expand Up @@ -111,7 +122,5 @@ def normalize_dict(self, element_dict: dict) -> dict:
return {
"$vector": element_dict.pop("embeddings", None),
"content": element_dict.pop("text", None),
"metadata": flatten_dict(
element_dict, separator="-", flatten_lists=True, remove_none=True
),
"metadata": element_dict,
}