-
Notifications
You must be signed in to change notification settings - Fork 4.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support txtai as a vector store (#10240)
- Loading branch information
1 parent
b10d26e
commit 773a2fd
Showing
11 changed files
with
738 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,249 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"id": "d7e0d5b7", | ||
"metadata": {}, | ||
"source": [ | ||
"<a href=\"https://colab.research.google.com/github/run-llama/llama_index/blob/main/docs/examples/vector_stores/FaissIndexDemo.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"id": "307804a3-c02b-4a57-ac0d-172c30ddc851", | ||
"metadata": {}, | ||
"source": [ | ||
"# txtai Vector Store" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"id": "380a9254", | ||
"metadata": {}, | ||
"source": [ | ||
"If you're opening this Notebook on colab, you will probably need to install LlamaIndex 🦙." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "375ec23d", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!pip install llama-index" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"id": "f7010b1d-d1bb-4f08-9309-a328bb4ea396", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Creating a Faiss Index" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a1b5e530", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import logging\n", | ||
"import sys\n", | ||
"\n", | ||
"logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", | ||
"logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "0c9f4d21-145a-401e-95ff-ccb259e8ef84", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import txtai\n", | ||
"\n", | ||
"# Create txtai ann index\n", | ||
"txtai_index = txtai.ann.ANNFactory.create({\"backend\": \"numpy\"})" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"id": "8ee4473a-094f-4d0a-a825-e1213db07240", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Load documents, build the VectorStoreIndex" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "0a2bcc07", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from llama_index import (\n", | ||
" SimpleDirectoryReader,\n", | ||
" load_index_from_storage,\n", | ||
" VectorStoreIndex,\n", | ||
" StorageContext,\n", | ||
")\n", | ||
"from llama_index.vector_stores.txtai import TxtaiVectorStore\n", | ||
"from IPython.display import Markdown, display" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"id": "9096dae7", | ||
"metadata": {}, | ||
"source": [ | ||
"Download Data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "335923ad", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"!mkdir -p 'data/paul_graham/'\n", | ||
"!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "68cbd239-880e-41a3-98d8-dbb3fab55431", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# load documents\n", | ||
"documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ba1558b3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"vector_store = TxtaiVectorStore(txtai_index=txtai_index)\n", | ||
"storage_context = StorageContext.from_defaults(vector_store=vector_store)\n", | ||
"index = VectorStoreIndex.from_documents(\n", | ||
" documents, storage_context=storage_context\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c36cadc1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# save index to disk\n", | ||
"index.storage_context.persist()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "70b372a7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# load index from disk\n", | ||
"vector_store = TxtaiVectorStore.from_persist_dir(\"./storage\")\n", | ||
"storage_context = StorageContext.from_defaults(\n", | ||
" vector_store=vector_store, persist_dir=\"./storage\"\n", | ||
")\n", | ||
"index = load_index_from_storage(storage_context=storage_context)" | ||
] | ||
}, | ||
{ | ||
"attachments": {}, | ||
"cell_type": "markdown", | ||
"id": "04304299-fc3e-40a0-8600-f50c3292767e", | ||
"metadata": {}, | ||
"source": [ | ||
"#### Query Index" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "35369eda", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# set Logging to DEBUG for more detailed outputs\n", | ||
"query_engine = index.as_query_engine()\n", | ||
"response = query_engine.query(\"What did the author do growing up?\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "bedbb693-725f-478f-be26-fa7180ea38b2", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"display(Markdown(f\"<b>{response}</b>\"))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "99212d33", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# set Logging to DEBUG for more detailed outputs\n", | ||
"query_engine = index.as_query_engine()\n", | ||
"response = query_engine.query(\n", | ||
" \"What did the author do after his time at Y Combinator?\"\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1a720ad6", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"display(Markdown(f\"<b>{response}</b>\"))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
"""txtai reader.""" | ||
|
||
from typing import Any, Dict, List | ||
|
||
import numpy as np | ||
|
||
from llama_index.readers.base import BaseReader | ||
from llama_index.schema import Document | ||
|
||
|
||
class TxtaiReader(BaseReader):
    """txtai reader.

    Retrieves documents through an existing in-memory txtai index.
    These documents can then be used in a downstream LlamaIndex data structure.

    If you wish to use txtai itself as an index to organize documents,
    insert documents, and perform queries on them, please use VectorStoreIndex
    with TxtaiVectorStore.

    Args:
        index (txtai.ann.ANN): A txtai Index object (required)
    """

    def __init__(self, index: Any):
        """Initialize with parameters.

        Raises:
            ImportError: If the `txtai` package is not installed.
        """
        import_err_msg = """
            `txtai` package not found. For instructions on
            how to install `txtai` please visit
            https://neuml.github.io/txtai/install/
        """
        try:
            import txtai  # noqa
        except ImportError:
            raise ImportError(import_err_msg)

        self._index = index

    def load_data(
        self,
        query: np.ndarray,
        id_to_text_map: Dict[str, str],
        k: int = 4,
        separate_documents: bool = True,
    ) -> List[Document]:
        """Load data from txtai index.

        Args:
            query (np.ndarray): A 2D numpy array of query vectors.
            id_to_text_map (Dict[str, str]): A map from ID's to text.
            k (int): Number of nearest neighbors to retrieve. Defaults to 4.
            separate_documents (Optional[bool]): Whether to return separate
                documents. Defaults to True.

        Returns:
            List[Document]: A list of documents.

        Raises:
            ValueError: If a returned document ID is missing from
                `id_to_text_map`.
        """
        search_result = self._index.search(query, k)
        documents = []
        for query_result in search_result:
            for doc_id, _ in query_result:
                # txtai returns integer IDs; the map is keyed by string.
                doc_id = str(doc_id)
                if doc_id not in id_to_text_map:
                    raise ValueError(
                        f"Document ID {doc_id} not found in id_to_text_map."
                    )
                documents.append(Document(text=id_to_text_map[doc_id]))

        if not separate_documents:
            # Join all retrieved documents into a single document.
            text_list = [doc.get_content() for doc in documents]
            documents = [Document(text="\n\n".join(text_list))]

        return documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.