-
Notifications
You must be signed in to change notification settings - Fork 535
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Support Indexing options for Astra DB columns #2919
Changes from 11 commits
c814b55
35a84ea
527ea09
d194b08
7ed3c3a
ee4a5ed
3df6417
4976fb0
275674b
9c87fec
acc2edb
66bee62
78350a2
d302e7b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,14 +15,11 @@ | |
) | ||
from unstructured.ingest.logger import logger | ||
from unstructured.ingest.utils.data_prep import chunk_generator | ||
from unstructured.staging.base import flatten_dict | ||
from unstructured.utils import requires_dependencies | ||
|
||
if t.TYPE_CHECKING: | ||
from astrapy.db import AstraDB, AstraDBCollection | ||
|
||
NON_INDEXED_FIELDS = ["metadata._node_content", "content"] | ||
|
||
|
||
@dataclass | ||
class AstraAccessConfig(AccessConfig): | ||
|
@@ -35,6 +32,8 @@ class SimpleAstraConfig(BaseConnectorConfig): | |
access_config: AstraAccessConfig | ||
collection_name: str | ||
embedding_dimension: int | ||
namespace: t.Optional[str] = None | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you want them exposed to the CLI, `namespace` and `requested_indexing_policy` need to be added to https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/ingest/cli/cmds/astra.py — this will also be helpful since they will have some documentation there. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done! I tried to match the data model to the "required" option for the CLI; hopefully this looks good. |
||
requested_indexing_policy: t.Optional[t.Dict[str, t.Any]] = None | ||
|
||
|
||
@dataclass | ||
|
@@ -69,21 +68,33 @@ def astra_db_collection(self) -> "AstraDBCollection": | |
if self._astra_db_collection is None: | ||
from astrapy.db import AstraDB | ||
|
||
# Get the collection_name and embedding dimension | ||
collection_name = self.connector_config.collection_name | ||
embedding_dimension = self.connector_config.embedding_dimension | ||
requested_indexing_policy = self.connector_config.requested_indexing_policy | ||
|
||
# If the user has requested an indexing policy, pass it to the AstraDB | ||
if requested_indexing_policy is not None: | ||
_options = {"indexing": requested_indexing_policy} | ||
else: | ||
_options = None | ||
|
||
# Build the Astra DB object. | ||
# caller_name/version for AstraDB tracking | ||
self._astra_db = AstraDB( | ||
api_endpoint=self.connector_config.access_config.api_endpoint, | ||
token=self.connector_config.access_config.token, | ||
namespace=self.connector_config.namespace, | ||
caller_name=integration_name, | ||
caller_version=integration_version, | ||
) | ||
|
||
# Create and connect to the newly created collection | ||
self._astra_db_collection = self._astra_db.create_collection( | ||
collection_name=self.connector_config.collection_name, | ||
dimension=self.connector_config.embedding_dimension, | ||
options={"indexing": {"deny": NON_INDEXED_FIELDS}}, | ||
collection_name=collection_name, | ||
dimension=embedding_dimension, | ||
options=_options, | ||
) | ||
|
||
return self._astra_db_collection | ||
|
||
@requires_dependencies(["astrapy"], extras="astra") | ||
|
@@ -111,7 +122,5 @@ def normalize_dict(self, element_dict: dict) -> dict: | |
return { | ||
"$vector": element_dict.pop("embeddings", None), | ||
"content": element_dict.pop("text", None), | ||
"metadata": flatten_dict( | ||
element_dict, separator="-", flatten_lists=True, remove_none=True | ||
), | ||
"metadata": element_dict, | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Import `Dict` at the top:
from unstructured.ingest.cli.interfaces import CliConfig, Dict
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated!