defget_relevant_documents(
self,
query: str,
*,
callbacks: Callbacks = None,
tags: Optional[List[str]] = None,
metadata: Optional[Dict[str, Any]] = None,
run_name: Optional[str] = None,
**kwargs: Any,
) -> List[Document]:
"""Retrieve documents relevant to a query.
Users should favor using `.invoke` or `.batch` rather than
`get_relevant_documents directly`.
Args:
query: string to find relevant documents for
callbacks: Callback manager or list of callbacks
tags: Optional list of tags associated with the retriever. Defaults to None
These tags will be associated with each call to this retriever,
and passed as arguments to the handlers defined in `callbacks`.
metadata: Optional metadata associated with the retriever. Defaults to None
This metadata will be associated with each call to this retriever,
and passed as arguments to the handlers defined in `callbacks`.
run_name: Optional name for the run.
Returns:
List of relevant documents
"""
langchain_core.callbacks.manager CallbackManager
callback_manager = CallbackManager.configure(
callbacks,
,
verbose=kwargs.get(, ),
inheritable_tags=tags,
local_tags=.tags,
inheritable_metadata=metadata,
local_metadata=.metadata,
)
run_manager = callback_manager.on_retriever_start(
dumpd(),
query,
name=run_name,
run_id=kwargs.pop(, ),
)
:
_kwargs = kwargs ._expects_other_args {}
._new_arg_supported:
result = ._get_relevant_documents(
query, run_manager=run_manager, **_kwargs
)
:
result = ._get_relevant_documents(query, **_kwargs)
Exception e:
run_manager.on_retriever_error(e)
e
:
run_manager.on_retriever_end(
result,
)
result
classVectorStoreRetriever(BaseRetriever):
"""Base Retriever class for VectorStore."""
vectorstore: VectorStore
"""VectorStore to use for retrieval."""
search_type: str = "similarity""""Type of search to perform. Defaults to "similarity"."""
search_kwargs: dict = Field(default_factory=dict)
"""Keyword arguments to pass to the search function."""
allowed_search_types: ClassVar[Collection[str]] = (
"similarity",
"similarity_score_threshold",
"mmr",
)
defas_retriever(self, **kwargs: Any) -> VectorStoreRetriever:
"""Return VectorStoreRetriever initialized from this VectorStore.
Args:
search_type (Optional[str]): Defines the type of search that
the Retriever should perform.
Can be "similarity" (default), "mmr", or
"similarity_score_threshold".
search_kwargs (Optional[Dict]): Keyword arguments to pass to the
search function. Can include things like:
k: Amount of documents to return (Default: 4)
score_threshold: Minimum relevance threshold
for similarity_score_threshold
fetch_k: Amount of documents to pass to MMR algorithm (Default: 20)
lambda_mult: Diversity of results returned by MMR;
1 for minimum diversity and 0 for maximum. (Default: 0.5)
filter: Filter by document metadata
Returns:
VectorStoreRetriever: Retriever class for retrieval.
"""
tags = kwargs.pop("tags", None) or []
tags.extend(self._get_retriever_tags())
return VectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
defsimilarity_search(
self,
query: str,
k: int = 4,
filter: Optional[Union[Callable, Dict[str, Any]]] = None,
fetch_k: int = 20,
**kwargs: Any,
) -> List[Document]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
Defaults to 20.
Returns:
List of Documents most similar to the query text.
"""
docs_and_scores = self.similarity_search_with_score(
query, k, filter=filter, fetch_k=fetch_k, **kwargs
)
return [doc for doc, _ in docs_and_scores]
这里可以看到他是调用了 similarity_search_with_score() 方法,然后把结果中的 score 给略去了,这里不得不吐槽这个调用是不是脱裤子放屁,明明可以写在一个方法里面,传入一个 flag 标识是否要返回分数就可以解决,非要封装成两个方法。吐槽结束继续查看 similarity_search_with_score() 方法。
defsimilarity_search_with_score(
self,
query: str,
k: int = 4,
filter: Optional[Union[Callable, Dict[str, Any]]] = None,
fetch_k: int = 20,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
query: Text to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata.
Defaults to None. If a callable, it must take as input the
metadata dict of Document and return a bool.
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
Defaults to 20.
Returns:
List of documents most similar to the query text with
L2 distance in float. Lower score represents more similarity.
"""
embedding = self._embed_query(query)
docs = self.similarity_search_with_score_by_vector(
embedding,
k,
filter=filter,
fetch_k=fetch_k,
**kwargs,
)
return docs
defsimilarity_search_with_score_by_vector(
self,
embedding: List[float],
k: int = 4,
filter: Optional[Union[Callable, Dict[str, Any]]] = None,
fetch_k: int = 20,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs most similar to query.
Args:
embedding: Embedding vector to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
filter (Optional[Union[Callable, Dict[str, Any]]]): Filter by metadata.
Defaults to None. If a callable, it must take as input the
metadata dict of Document and return a bool.
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
Defaults to 20.
**kwargs: kwargs to be passed to similarity search. Can include:
score_threshold: Optional, a floating point value between 0 to 1 to
filter the resulting set of retrieved docs
Returns:
List of documents most similar to the query text and L2 distance
in float for each. Lower score represents more similarity.
"""
faiss = dependable_faiss_import()
vector = np.array([embedding], dtype=np.float32)
ifself._normalize_L2:
faiss.normalize_L2(vector)
scores, indices = self.index.search(vector, k iffilterisNoneelse fetch_k)
docs = []
iffilterisnotNone:
filter_func = self._create_filter_func(filter)
for j, i inenumerate(indices[0]):
if i == -1:
# This happens when not enough docs are returned.continue
_id = self.index_to_docstore_id[i]
doc = self.docstore.search(_id)
ifnotisinstance(doc, Document):
raise ValueError(f"Could not find document for id {_id}, got {doc}")
iffilterisnotNone:
if filter_func(doc.metadata):
docs.append((doc, scores[0][j]))
else:
docs.append((doc, scores[0][j]))
score_threshold = kwargs.get("score_threshold")
if score_threshold isnotNone:
cmp = (
operator.ge
ifself.distance_strategy
in (DistanceStrategy.MAX_INNER_PRODUCT, DistanceStrategy.JACCARD)
else operator.le
)
docs = [
(doc, similarity)
for doc, similarity in docs
if cmp(similarity, score_threshold)
]
return docs[:k]
defsimilarity_search_with_relevance_scores(
self,
query: str,
k: int = 4,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""Return docs and relevance scores in the range [0, 1].
0 is dissimilar, 1 is most similar.
Args:
query: input text
k: Number of Documents to return. Defaults to 4.
**kwargs: kwargs to be passed to similarity search. Should include:
score_threshold: Optional, a floating point value between 0 to 1 to
filter the resulting set of retrieved docs
Returns:
List of Tuples of (doc, similarity_score)
"""
score_threshold = kwargs.pop("score_threshold", None)
docs_and_similarities = self._similarity_search_with_relevance_scores(
query, k=k, **kwargs
)
ifany(
similarity < 0.0or similarity > 1.0for _, similarity in docs_and_similarities
):
warnings.warn(
"Relevance scores must be between"f" 0 and 1, got {docs_and_similarities}"
)
if score_threshold isnotNone:
docs_and_similarities = [
(doc, similarity)
for doc, similarity in docs_and_similarities
if similarity >= score_threshold
]
iflen(docs_and_similarities) == 0:
warnings.warn(
"No relevant docs were retrieved using the relevance score"f" threshold {score_threshold}"
)
return docs_and_similarities
def_similarity_search_with_relevance_scores(
self,
query: str,
k: int = 4,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
"""
Default similarity search with relevance scores. Modify if necessary
in subclass.
Return docs and relevance scores in the range [0, 1].
0 is dissimilar, 1 is most similar.
Args:
query: input text
k: Number of Documents to return. Defaults to 4.
**kwargs: kwargs to be passed to similarity search. Should include:
score_threshold: Optional, a floating point value between 0 to 1 to
filter the resulting set of retrieved docs
Returns:
List of Tuples of (doc, similarity_score)
"""
relevance_score_fn = self._select_relevance_score_fn()
docs_and_scores = self.similarity_search_with_score(query, k, **kwargs)
return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores]
def_select_relevance_score_fn(self) -> Callable[[float], float]:
"""
The 'correct' relevance function
may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
Vectorstores should define their own selection based method of relevance.
"""raise NotImplementedError
def_select_relevance_score_fn(self) -> Callable[[float], float]:
"""
The 'correct' relevance function
may differ depending on a few things, including:
- the distance / similarity metric used by the VectorStore
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
- embedding dimensionality
- etc.
"""ifself.override_relevance_score_fn isnotNone:
returnself.override_relevance_score_fn
# Default strategy is to rely on distance strategy provided in# vectorstore constructorifself.distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
returnself._max_inner_product_relevance_score_fn
elifself.distance_strategy == DistanceStrategy.EUCLIDEAN_DISTANCE:
# Default behavior is to use euclidean distance relevancyreturnself._euclidean_relevance_score_fn
elifself.distance_strategy == DistanceStrategy.COSINE:
returnself._cosine_relevance_score_fn
else:
raise ValueError(
"Unknown distance strategy, must be cosine, max_inner_product,"" or euclidean"
)
@staticmethoddef_max_inner_product_relevance_score_fn(distance: float) -> float:
"""Normalize the distance to a score on a scale [0, 1]."""if distance > 0:
return1.0 - distance
return -1.0 * distance
@staticmethoddef_euclidean_relevance_score_fn(distance: float) -> float:
"""Return a similarity score on a scale [0, 1]."""# The 'correct' relevance function# may differ depending on a few things, including:# - the distance / similarity metric used by the VectorStore# - the scale of your embeddings (OpenAI's are unit normed. Many# others are not!)# - embedding dimensionality# - etc.# This function converts the euclidean norm of normalized embeddings# (0 is most similar, sqrt(2) most dissimilar)# to a similarity function (0 to 1)return1.0 - distance / math.sqrt(2)
@staticmethoddef_cosine_relevance_score_fn(distance: float) -> float:
"""Normalize the distance to a score on a scale [0, 1]."""return1.0 - distance