Skip to content

Commit

Permalink
Support txtai as a vector store (#10240)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarouaneMaatouk committed Jan 24, 2024
1 parent b10d26e commit 773a2fd
Show file tree
Hide file tree
Showing 11 changed files with 738 additions and 0 deletions.
16 changes: 16 additions & 0 deletions docs/community/integrations/vector_stores.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ as the storage backend for `VectorStoreIndex`.
- Elasticsearch (`ElasticsearchStore`) [Installation](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html)
- Epsilla (`EpsillaVectorStore`) [Installation/Quickstart](https://epsilla-inc.gitbook.io/epsilladb/quick-start)
- Faiss (`FaissVectorStore`). [Installation](https://github.com/facebookresearch/faiss/blob/main/INSTALL.md).
- txtai (`TxtaiVectorStore`). [Installation](https://neuml.github.io/txtai/install/).
- Jaguar (`JaguarVectorStore`). [Installation](http://www.jaguardb.com/docsetup.html).
- Lantern (`LanternVectorStore`). [Quickstart](https://docs.lantern.dev/get-started/overview).
- Milvus (`MilvusVectorStore`). [Installation](https://milvus.io/docs)
Expand Down Expand Up @@ -285,6 +286,21 @@ vector_store = FaissVectorStore(faiss_index)
storage_context.persist()
```

**txtai**

```python
import txtai
from llama_index.vector_stores import TxtaiVectorStore

# create txtai index
txtai_index = txtai.ann.ANNFactory.create(
{"backend": "numpy", "dimension": 512}
)

# construct vector store
vector_store = TxtaiVectorStore(txtai_index)
```

**Jaguar**

```python
Expand Down
249 changes: 249 additions & 0 deletions docs/examples/vector_stores/TxtaiIndexDemo.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,249 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "d7e0d5b7",
"metadata": {},
"source": [
"<a href=\"https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/vector_stores/TxtaiIndexDemo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "307804a3-c02b-4a57-ac0d-172c30ddc851",
"metadata": {},
"source": [
"# txtai Vector Store"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "380a9254",
"metadata": {},
"source": [
"If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "375ec23d",
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-index"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "f7010b1d-d1bb-4f08-9309-a328bb4ea396",
"metadata": {},
"source": [
"#### Creating a txtai Index"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1b5e530",
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import sys\n",
"\n",
"logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n",
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0c9f4d21-145a-401e-95ff-ccb259e8ef84",
"metadata": {},
"outputs": [],
"source": [
"import txtai\n",
"\n",
"# Create txtai ann index\n",
"txtai_index = txtai.ann.ANNFactory.create({\"backend\": \"numpy\"})"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "8ee4473a-094f-4d0a-a825-e1213db07240",
"metadata": {},
"source": [
"#### Load documents, build the VectorStoreIndex"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0a2bcc07",
"metadata": {},
"outputs": [],
"source": [
"from llama_index import (\n",
" SimpleDirectoryReader,\n",
" load_index_from_storage,\n",
" VectorStoreIndex,\n",
" StorageContext,\n",
")\n",
"from llama_index.vector_stores.txtai import TxtaiVectorStore\n",
"from IPython.display import Markdown, display"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "9096dae7",
"metadata": {},
"source": [
"Download Data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "335923ad",
"metadata": {},
"outputs": [],
"source": [
"!mkdir -p 'data/paul_graham/'\n",
"!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68cbd239-880e-41a3-98d8-dbb3fab55431",
"metadata": {},
"outputs": [],
"source": [
"# load documents\n",
"documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ba1558b3",
"metadata": {},
"outputs": [],
"source": [
"vector_store = TxtaiVectorStore(txtai_index=txtai_index)\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
"index = VectorStoreIndex.from_documents(\n",
" documents, storage_context=storage_context\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c36cadc1",
"metadata": {},
"outputs": [],
"source": [
"# save index to disk\n",
"index.storage_context.persist()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70b372a7",
"metadata": {},
"outputs": [],
"source": [
"# load index from disk\n",
"vector_store = TxtaiVectorStore.from_persist_dir(\"./storage\")\n",
"storage_context = StorageContext.from_defaults(\n",
" vector_store=vector_store, persist_dir=\"./storage\"\n",
")\n",
"index = load_index_from_storage(storage_context=storage_context)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "04304299-fc3e-40a0-8600-f50c3292767e",
"metadata": {},
"source": [
"#### Query Index"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35369eda",
"metadata": {},
"outputs": [],
"source": [
"# set Logging to DEBUG for more detailed outputs\n",
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\"What did the author do growing up?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bedbb693-725f-478f-be26-fa7180ea38b2",
"metadata": {},
"outputs": [],
"source": [
"display(Markdown(f\"<b>{response}</b>\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99212d33",
"metadata": {},
"outputs": [],
"source": [
"# set Logging to DEBUG for more detailed outputs\n",
"query_engine = index.as_query_engine()\n",
"response = query_engine.query(\n",
" \"What did the author do after his time at Y Combinator?\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a720ad6",
"metadata": {},
"outputs": [],
"source": [
"display(Markdown(f\"<b>{response}</b>\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions docs/module_guides/storing/vector_stores.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ We are actively adding more integrations and improving feature coverage for each
| DynamoDB | cloud | | || | |
| Elasticsearch | self-hosted / cloud ||||||
| FAISS | in-memory | | | | | |
| txtai | in-memory | | | | | |
| Jaguar | self-hosted / cloud ||||| |
| LanceDB | cloud || ||| |
| Lantern | self-hosted / cloud ||||||
Expand Down
2 changes: 2 additions & 0 deletions llama_index/readers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
from llama_index.readers.steamship.file_reader import SteamshipFileReader
from llama_index.readers.string_iterable import StringIterableReader
from llama_index.readers.twitter import TwitterTweetReader
from llama_index.readers.txtai import TxtaiReader
from llama_index.readers.weaviate.reader import WeaviateReader
from llama_index.readers.web import (
BeautifulSoupWebReader,
Expand Down Expand Up @@ -77,6 +78,7 @@
"ChromaReader",
"DeepLakeReader",
"FaissReader",
"TxtaiReader",
"MyScaleReader",
"Document",
"StringIterableReader",
Expand Down
77 changes: 77 additions & 0 deletions llama_index/readers/txtai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""txtai reader."""

from typing import Any, Dict, List

import numpy as np

from llama_index.readers.base import BaseReader
from llama_index.schema import Document


class TxtaiReader(BaseReader):
    """txtai reader.

    Retrieves documents through an existing in-memory txtai index.
    These documents can then be used in a downstream LlamaIndex data structure.

    If you wish to use txtai itself as an index to organize documents,
    insert documents, and perform queries on them, please use VectorStoreIndex
    with TxtaiVectorStore.

    Args:
        index (txtai.ann.ANN): A txtai index object (required).
    """

    def __init__(self, index: Any) -> None:
        """Initialize with parameters.

        Raises:
            ImportError: If the `txtai` package is not installed.
        """
        import_err_msg = """
        `txtai` package not found. For instructions on
        how to install `txtai` please visit
        https://neuml.github.io/txtai/install/
        """
        # Only verify that txtai is importable; the caller supplies the
        # already-constructed index object.
        try:
            import txtai  # noqa
        except ImportError:
            raise ImportError(import_err_msg)

        # Underlying txtai ANN index used for nearest-neighbor search.
        self._index = index

    def load_data(
        self,
        query: np.ndarray,
        id_to_text_map: Dict[str, str],
        k: int = 4,
        separate_documents: bool = True,
    ) -> List[Document]:
        """Load data from txtai index.

        Args:
            query (np.ndarray): A 2D numpy array of query vectors.
            id_to_text_map (Dict[str, str]): A map from ID's to text.
            k (int): Number of nearest neighbors to retrieve. Defaults to 4.
            separate_documents (bool): Whether to return separate
                documents. Defaults to True.

        Raises:
            ValueError: If a returned document ID is missing from
                `id_to_text_map`.

        Returns:
            List[Document]: A list of documents.
        """
        # NOTE(review): assumes search returns, per query vector, a list of
        # (id, score) pairs — confirm against the txtai ANN `search` API.
        search_result = self._index.search(query, k)
        documents = []
        for query_result in search_result:
            for doc_id, _ in query_result:
                # IDs are normalized to strings to match id_to_text_map keys.
                doc_id = str(doc_id)
                if doc_id not in id_to_text_map:
                    raise ValueError(
                        f"Document ID {doc_id} not found in id_to_text_map."
                    )
                text = id_to_text_map[doc_id]
                documents.append(Document(text=text))

        if not separate_documents:
            # join all documents into one
            text_list = [doc.get_content() for doc in documents]
            text = "\n\n".join(text_list)
            documents = [Document(text=text)]

        return documents
2 changes: 2 additions & 0 deletions llama_index/vector_stores/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from llama_index.vector_stores.tair import TairVectorStore
from llama_index.vector_stores.tencentvectordb import TencentVectorDB
from llama_index.vector_stores.timescalevector import TimescaleVectorStore
from llama_index.vector_stores.txtai import TxtaiVectorStore
from llama_index.vector_stores.types import (
ExactMatchFilter,
FilterCondition,
Expand All @@ -61,6 +62,7 @@
"RedisVectorStore",
"RocksetVectorStore",
"FaissVectorStore",
"TxtaiVectorStore",
"PineconeVectorStore",
"WeaviateVectorStore",
"QdrantVectorStore",
Expand Down

0 comments on commit 773a2fd

Please sign in to comment.