Add InMemory and HnswLib vector stores #1

Open · wants to merge 6 commits into master
4 changes: 4 additions & 0 deletions langchain/vectorstores/__init__.py
@@ -7,6 +7,8 @@
from langchain.vectorstores.deeplake import DeepLake
from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores.hnsw_lib import HnswLib
from langchain.vectorstores.in_memory_exact_search import InMemoryExactSearch
from langchain.vectorstores.milvus import Milvus
from langchain.vectorstores.myscale import MyScale, MyScaleSettings
from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch
@@ -34,4 +36,6 @@
"MyScaleSettings",
"SupabaseVectorStore",
"AnalyticDB",
"HnswLib",
"InMemoryExactSearch",
]
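
With these two re-exports in place, both new stores are importable straight from `langchain.vectorstores`. A minimal sketch of the resulting import surface (only names introduced by this PR are used; everything else is standard):

# Quick import check for the two classes added to __all__ above.
from langchain.vectorstores import HnswLib, InMemoryExactSearch

print(HnswLib.__name__, InMemoryExactSearch.__name__)
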
141 changes: 141 additions & 0 deletions langchain/vectorstores/hnsw_lib.py
@@ -0,0 +1,141 @@
"""Wrapper around HnswLib store."""
from __future__ import annotations

from typing import List, Optional, Type

from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VST
from langchain.vectorstores.vector_store_from_doc_index import (
VecStoreFromDocIndex,
_check_docarray_import,
)


class HnswLib(VecStoreFromDocIndex):
"""Wrapper around HnswLib storage.

To use it, you should have the ``docarray[hnswlib]`` package with version >=0.31.0 installed.
You can install it with `pip install "langchain[hnswlib]"`.
"""

def __init__(
self,
embedding: Embeddings,
work_dir: str,
n_dim: int,
dist_metric: str = "cosine",
max_elements: int = 1024,
index: bool = True,
ef_construction: int = 200,
ef: int = 10,
M: int = 16,
allow_replace_deleted: bool = True,
num_threads: int = 1,
) -> None:
"""Initialize HnswLib store.

Args:
embedding (Embeddings): Embedding function.
work_dir (str): path to the location where all the data will be stored.
n_dim (int): dimension of an embedding.
dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
"ip", and "l2". Defaults to "cosine".
max_elements (int): Maximum number of vectors that can be stored.
Defaults to 1024.
index (bool): Whether an index should be built for this field.
Defaults to True.
ef_construction (int): defines a construction time/accuracy trade-off.
Defaults to 200.
ef (int): parameter controlling query time/accuracy trade-off.
Defaults to 10.
M (int): parameter that defines the maximum number of outgoing
connections in the graph. Defaults to 16.
allow_replace_deleted (bool): Enables replacing of deleted elements
with new added ones. Defaults to True.
num_threads (int): Sets the number of cpu threads to use. Defaults to 1.
"""
_check_docarray_import()
from docarray.index import HnswDocumentIndex

        try:
            # Check that protobuf, required alongside the HNSW backend, is available.
            import google.protobuf  # noqa: F401
        except ImportError:
            raise ImportError(
                "Could not import all required packages. "
                "Please install them with `pip install \"langchain[hnswlib]\"`."
            )

doc_cls = self._get_doc_cls(
{
"dim": n_dim,
"space": dist_metric,
"max_elements": max_elements,
"index": index,
"ef_construction": ef_construction,
"ef": ef,
"M": M,
"allow_replace_deleted": allow_replace_deleted,
"num_threads": num_threads,
}
)
doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir)
super().__init__(doc_index, embedding)

@classmethod
def from_texts(
cls: Type[VST],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
        work_dir: Optional[str] = None,
        n_dim: Optional[int] = None,
dist_metric: str = "l2",
max_elements: int = 1024,
index: bool = True,
ef_construction: int = 200,
ef: int = 10,
M: int = 16,
allow_replace_deleted: bool = True,
num_threads: int = 1,
) -> HnswLib:
"""Create an HnswLib store and insert data.

Args:
texts (List[str]): Text data.
embedding (Embeddings): Embedding function.
metadatas (Optional[List[dict]]): Metadata for each text if it exists.
Defaults to None.
work_dir (str): path to the location where all the data will be stored.
n_dim (int): dimension of an embedding.
dist_metric (str): Distance metric for HnswLib can be one of: "cosine",
"ip", and "l2". Defaults to "l2".
max_elements (int): Maximum number of vectors that can be stored.
Defaults to 1024.
index (bool): Whether an index should be built for this field.
Defaults to True.
ef_construction (int): defines a construction time/accuracy trade-off.
Defaults to 200.
ef (int): parameter controlling query time/accuracy trade-off.
Defaults to 10.
M (int): parameter that defines the maximum number of outgoing
connections in the graph. Defaults to 16.
allow_replace_deleted (bool): Enables replacing of deleted elements
with new added ones. Defaults to True.
num_threads (int): Sets the number of cpu threads to use. Defaults to 1.

Returns:
HnswLib Vector Store
"""
if work_dir is None:
            raise ValueError("`work_dir` parameter has not been set.")
if n_dim is None:
raise ValueError("`n_dim` parameter has not been set.")

        store = cls(
            embedding=embedding,
            work_dir=work_dir,
            n_dim=n_dim,
            dist_metric=dist_metric,
            max_elements=max_elements,
            index=index,
            ef_construction=ef_construction,
            ef=ef,
            M=M,
            allow_replace_deleted=allow_replace_deleted,
            num_threads=num_threads,
        )
        store.add_texts(texts=texts, metadatas=metadatas)
return store
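
For reviewers, a short usage sketch of the class above. It assumes this branch is installed with the `docarray[hnswlib]` extra, uses `FakeEmbeddings` purely as an illustrative stand-in for a real embedding function, and relies on the standard `similarity_search` method of the shared vector-store interface; the texts, dimension, and directory are made-up values, not part of this diff.

# Minimal sketch: build an HnswLib store from texts and query it.
# FakeEmbeddings, the 32-dim size, and the work_dir path are assumptions for illustration.
from langchain.embeddings.fake import FakeEmbeddings
from langchain.vectorstores import HnswLib

embeddings = FakeEmbeddings(size=32)  # any Embeddings implementation works here
store = HnswLib.from_texts(
    texts=["foo", "bar", "baz"],
    embedding=embeddings,
    work_dir="./hnswlib_index",  # required: where the index files are persisted
    n_dim=32,  # required: must match the embedding dimension
    dist_metric="cosine",
)
docs = store.similarity_search("foo", k=2)
print([doc.page_content for doc in docs])

Passing `dist_metric="cosine"` here mirrors the constructor default; note that `from_texts` itself defaults to "l2".
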
68 changes: 68 additions & 0 deletions langchain/vectorstores/in_memory_exact_search.py
@@ -0,0 +1,68 @@
"""Wrapper around in-memory storage."""
from __future__ import annotations

from typing import List, Optional, Type

from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VST
from langchain.vectorstores.vector_store_from_doc_index import (
VecStoreFromDocIndex,
_check_docarray_import,
)


class InMemoryExactSearch(VecStoreFromDocIndex):
"""Wrapper around in-memory storage for exact search.

To use it, you should have the ``docarray`` package with version >=0.31.0 installed.
You can install it with `pip install "langchain[in_memory_store]"`.
"""

def __init__(
self,
embedding: Embeddings,
metric: str = "cosine_sim",
) -> None:
"""Initialize InMemoryExactSearch store.

Args:
embedding (Embeddings): Embedding function.
metric (str): metric for exact nearest-neighbor search.
Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
Defaults to "cosine_sim".
"""
_check_docarray_import()
from docarray.index import InMemoryExactNNIndex

doc_cls = self._get_doc_cls({"space": metric})
doc_index = InMemoryExactNNIndex[doc_cls]()
super().__init__(doc_index, embedding)

@classmethod
def from_texts(
cls: Type[VST],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
metric: str = "cosine_sim",
) -> InMemoryExactSearch:
"""Create an InMemoryExactSearch store and insert data.

Args:
texts (List[str]): Text data.
embedding (Embeddings): Embedding function.
metadatas (Optional[List[dict]]): Metadata for each text if it exists.
Defaults to None.
metric (str): metric for exact nearest-neighbor search.
Can be one of: "cosine_sim", "euclidean_dist" and "sqeuclidean_dist".
Defaults to "cosine_sim".

Returns:
InMemoryExactSearch Vector Store
"""
store = cls(
embedding=embedding,
metric=metric,
)
store.add_texts(texts=texts, metadatas=metadatas)
return store
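
Likewise, a usage sketch for the in-memory store. Since nothing is persisted to disk, no `work_dir` or dimension argument is required; `FakeEmbeddings` is again only an illustrative stand-in for a real embedding function.

# Minimal sketch: exact (brute-force) nearest-neighbor search held entirely in memory.
from langchain.embeddings.fake import FakeEmbeddings
from langchain.vectorstores import InMemoryExactSearch

embeddings = FakeEmbeddings(size=32)
store = InMemoryExactSearch.from_texts(
    texts=["alpha", "beta", "gamma"],
    embedding=embeddings,
    metric="cosine_sim",  # or "euclidean_dist" / "sqeuclidean_dist"
)
docs = store.similarity_search("alpha", k=1)
print(docs[0].page_content)
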