add query analyzer with min and avg similarity
Files changed:
- document_qa/document_qa_engine.py +36 -156
- document_qa/langchain.py +141 -0
- requirements.txt +3 -1
document_qa/document_qa_engine.py
CHANGED
@@ -1,35 +1,23 @@
 import copy
 import os
 from pathlib import Path
-from typing import Union, Any,
+from typing import Union, Any, List
 
 import tiktoken
 from langchain.chains import create_extraction_chain
 from langchain.chains.question_answering import load_qa_chain, stuff_prompt, refine_prompts, map_reduce_prompt, \
     map_rerank_prompt
+from langchain.evaluation import PairwiseEmbeddingDistanceEvalChain, load_evaluator, EmbeddingDistance
 from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
 from langchain.retrievers import MultiQueryRetriever
 from langchain.schema import Document
-from langchain_community.vectorstores.chroma import Chroma
-from
-from langchain_core.callbacks import CallbackManagerForRetrieverRun
-from langchain_core.utils import xor_args
-from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
+from langchain_community.vectorstores.chroma import Chroma
+from langchain_core.vectorstores import VectorStore
 from tqdm import tqdm
 
+# from document_qa.embedding_visualiser import QueryVisualiser
 from document_qa.grobid_processors import GrobidProcessor
-
-
-def _results_to_docs_scores_and_embeddings(results: Any) -> List[Tuple[Document, float, List[float]]]:
-    return [
-        (Document(page_content=result[0], metadata=result[1] or {}), result[2], result[3])
-        for result in zip(
-            results["documents"][0],
-            results["metadatas"][0],
-            results["distances"][0],
-            results["embeddings"][0],
-        )
-    ]
+from document_qa.langchain import ChromaAdvancedRetrieval
 
 
 class TextMerger:
@@ -117,135 +105,6 @@ class BaseRetrieval:
         self.persist_directory = persist_directory
 
 
-class AdvancedVectorStoreRetriever(VectorStoreRetriever):
-    allowed_search_types: ClassVar[Collection[str]] = (
-        "similarity",
-        "similarity_score_threshold",
-        "mmr",
-        "similarity_with_embeddings"
-    )
-
-    def _get_relevant_documents(
-            self, query: str, *, run_manager: CallbackManagerForRetrieverRun
-    ) -> List[Document]:
-        if self.search_type == "similarity":
-            docs = self.vectorstore.similarity_search(query, **self.search_kwargs)
-        elif self.search_type == "similarity_score_threshold":
-            docs_and_similarities = (
-                self.vectorstore.similarity_search_with_relevance_scores(
-                    query, **self.search_kwargs
-                )
-            )
-            for doc, similarity in docs_and_similarities:
-                if '__similarity' not in doc.metadata.keys():
-                    doc.metadata['__similarity'] = similarity
-
-            docs = [doc for doc, _ in docs_and_similarities]
-        elif self.search_type == "mmr":
-            docs = self.vectorstore.max_marginal_relevance_search(
-                query, **self.search_kwargs
-            )
-        elif self.search_type == "similarity_with_embeddings":
-            docs_scores_and_embeddings = (
-                self.vectorstore.advanced_similarity_search(
-                    query, **self.search_kwargs
-                )
-            )
-
-            for doc, score, embeddings in docs_scores_and_embeddings:
-                if '__embeddings' not in doc.metadata.keys():
-                    doc.metadata['__embeddings'] = embeddings
-                if '__similarity' not in doc.metadata.keys():
-                    doc.metadata['__similarity'] = score
-
-            docs = [doc for doc, _, _ in docs_scores_and_embeddings]
-        else:
-            raise ValueError(f"search_type of {self.search_type} not allowed.")
-        return docs
-
-
-class AdvancedVectorStore(VectorStore):
-    def as_retriever(self, **kwargs: Any) -> AdvancedVectorStoreRetriever:
-        tags = kwargs.pop("tags", None) or []
-        tags.extend(self._get_retriever_tags())
-        return AdvancedVectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
-
-
-class ChromaAdvancedRetrieval(Chroma, AdvancedVectorStore):
-    def __init__(self, **kwargs):
-        super().__init__(**kwargs)
-
-    @xor_args(("query_texts", "query_embeddings"))
-    def __query_collection(
-            self,
-            query_texts: Optional[List[str]] = None,
-            query_embeddings: Optional[List[List[float]]] = None,
-            n_results: int = 4,
-            where: Optional[Dict[str, str]] = None,
-            where_document: Optional[Dict[str, str]] = None,
-            **kwargs: Any,
-    ) -> List[Document]:
-        """Query the chroma collection."""
-        try:
-            import chromadb  # noqa: F401
-        except ImportError:
-            raise ValueError(
-                "Could not import chromadb python package. "
-                "Please install it with `pip install chromadb`."
-            )
-        return self._collection.query(
-            query_texts=query_texts,
-            query_embeddings=query_embeddings,
-            n_results=n_results,
-            where=where,
-            where_document=where_document,
-            **kwargs,
-        )
-
-    def advanced_similarity_search(
-            self,
-            query: str,
-            k: int = DEFAULT_K,
-            filter: Optional[Dict[str, str]] = None,
-            **kwargs: Any,
-    ) -> [List[Document], float, List[float]]:
-        docs_scores_and_embeddings = self.similarity_search_with_scores_and_embeddings(query, k, filter=filter)
-        return docs_scores_and_embeddings
-
-    def similarity_search_with_scores_and_embeddings(
-            self,
-            query: str,
-            k: int = DEFAULT_K,
-            filter: Optional[Dict[str, str]] = None,
-            where_document: Optional[Dict[str, str]] = None,
-            **kwargs: Any,
-    ) -> List[Tuple[Document, float, List[float]]]:
-
-        if self._embedding_function is None:
-            results = self.__query_collection(
-                query_texts=[query],
-                n_results=k,
-                where=filter,
-                where_document=where_document,
-                include=['metadatas', 'documents', 'embeddings', 'distances']
-            )
-        else:
-            query_embedding = self._embedding_function.embed_query(query)
-            results = self.__query_collection(
-                query_embeddings=[query_embedding],
-                n_results=k,
-                where=filter,
-                where_document=where_document,
-                include=['metadatas', 'documents', 'embeddings', 'distances']
-            )
-
-        return _results_to_docs_scores_and_embeddings(results)
-
-
-class FAISSAdvancedRetrieval(FAISS):
-    pass
-
-
 class NER_Retrival(VectorStore):
     """
     This class implement a retrieval based on NER models.
@@ -256,7 +115,6 @@ class NER_Retrival(VectorStore):
 
 engines = {
     'chroma': ChromaAdvancedRetrieval,
-    'faiss': FAISSAdvancedRetrieval,
    'ner': NER_Retrival
 }
 
@@ -409,7 +267,7 @@ class DocumentQAEngine:
         context_as_text = [doc.page_content for doc in documents]
         return context_as_text, coordinates
 
-    def query_storage_and_embeddings(self, query: str, doc_id, context_size=4):
+    def query_storage_and_embeddings(self, query: str, doc_id, context_size=4) -> List[Document]:
         """
         Returns both the context and the embedding information from a given query
         """
@@ -417,10 +275,35 @@ class DocumentQAEngine:
         retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
         relevant_documents = retriever.get_relevant_documents(query)
 
-
-
+        return relevant_documents
+
+    def analyse_query(self, query, doc_id, context_size=4):
+        db = self.data_storage.embeddings_dict[doc_id]
+        # retriever = db.as_retriever(
+        #     search_kwargs={"k": context_size, 'score_threshold': 0.0},
+        #     search_type="similarity_score_threshold"
+        # )
+        retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
+        relevant_documents = retriever.get_relevant_documents(query)
+        relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
+                                         for doc in
+                                         relevant_documents]
+        all_documents = db.get(include=['documents', 'metadatas', 'embeddings'])
+        # all_documents_embeddings = all_documents["embeddings"]
+        # query_embedding = db._embedding_function.embed_query(query)
+
+        # distance_evaluator = load_evaluator("pairwise_embedding_distance",
+        #                                     embeddings=db._embedding_function,
+        #                                     distance_metric=EmbeddingDistance.EUCLIDEAN)
 
-#
+        # distance_evaluator.evaluate_string_pairs(query=query_embedding, documents="")
+
+        similarities = [doc.metadata['__similarity'] for doc in relevant_documents]
+        min_similarity = min(similarities)
+        mean_similarity = sum(similarities) / len(similarities)
+        coefficient = min_similarity - mean_similarity
+
+        return f"Coefficient: {coefficient}, (Min similarity {min_similarity}, Mean similarity: {mean_similarity})", relevant_document_coordinates
 
     def _parse_json(self, response, output_parser):
         system_message = "You are an useful assistant expert in materials science, physics, and chemistry " \
@@ -444,10 +327,7 @@ class DocumentQAEngine:
         return parsed_output
 
     def _run_query(self, doc_id, query, context_size=4) -> (List[Document], list):
-        relevant_documents = self._get_context(doc_id, query, context_size)
-        relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
-                                         for doc in
-                                         relevant_documents]
+        relevant_documents, relevant_document_coordinates = self._get_context(doc_id, query, context_size)
         response = self.chain.run(input_documents=relevant_documents,
                                   question=query)
 
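Note on the new analyse_query method: the "coefficient" it reports is simply the gap between the weakest retrieved chunk and the average of the set, computed from the '__similarity' scores the retriever attaches to each document. A minimal, self-contained sketch of that calculation (the example scores below are made up; in the engine they come from doc.metadata['__similarity']):

from typing import List, Tuple


def similarity_coefficient(scores: List[float]) -> Tuple[float, float, float]:
    # scores: the '__similarity' values attached to the retrieved documents
    min_similarity = min(scores)
    mean_similarity = sum(scores) / len(scores)
    # coefficient = how far the minimum falls from the mean of the retrieved set
    return min_similarity - mean_similarity, min_similarity, mean_similarity


# hypothetical scores for four retrieved chunks
print(similarity_coefficient([0.82, 0.79, 0.76, 0.41]))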
document_qa/langchain.py
ADDED
@@ -0,0 +1,141 @@
+from pathlib import Path
+from typing import Any, Optional, List, Dict, Tuple, ClassVar, Collection
+
+from langchain.schema import Document
+from langchain_community.vectorstores.chroma import Chroma, DEFAULT_K
+from langchain_core.callbacks import CallbackManagerForRetrieverRun
+from langchain_core.utils import xor_args
+from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
+
+
+class AdvancedVectorStoreRetriever(VectorStoreRetriever):
+    allowed_search_types: ClassVar[Collection[str]] = (
+        "similarity",
+        "similarity_score_threshold",
+        "mmr",
+        "similarity_with_embeddings"
+    )
+
+    def _get_relevant_documents(
+            self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+
+        if self.search_type == "similarity_with_embeddings":
+            docs_scores_and_embeddings = (
+                self.vectorstore.advanced_similarity_search(
+                    query, **self.search_kwargs
+                )
+            )
+
+            for doc, score, embeddings in docs_scores_and_embeddings:
+                if '__embeddings' not in doc.metadata.keys():
+                    doc.metadata['__embeddings'] = embeddings
+                if '__similarity' not in doc.metadata.keys():
+                    doc.metadata['__similarity'] = score
+
+            docs = [doc for doc, _, _ in docs_scores_and_embeddings]
+        elif self.search_type == "similarity_score_threshold":
+            docs_and_similarities = (
+                self.vectorstore.similarity_search_with_relevance_scores(
+                    query, **self.search_kwargs
+                )
+            )
+            for doc, similarity in docs_and_similarities:
+                if '__similarity' not in doc.metadata.keys():
+                    doc.metadata['__similarity'] = similarity
+
+            docs = [doc for doc, _ in docs_and_similarities]
+        else:
+            docs = super()._get_relevant_documents(query, run_manager=run_manager)
+
+        return docs
+
+
+class AdvancedVectorStore(VectorStore):
+    def as_retriever(self, **kwargs: Any) -> AdvancedVectorStoreRetriever:
+        tags = kwargs.pop("tags", None) or []
+        tags.extend(self._get_retriever_tags())
+        return AdvancedVectorStoreRetriever(vectorstore=self, **kwargs, tags=tags)
+
+
+class ChromaAdvancedRetrieval(Chroma, AdvancedVectorStore):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    @xor_args(("query_texts", "query_embeddings"))
+    def __query_collection(
+            self,
+            query_texts: Optional[List[str]] = None,
+            query_embeddings: Optional[List[List[float]]] = None,
+            n_results: int = 4,
+            where: Optional[Dict[str, str]] = None,
+            where_document: Optional[Dict[str, str]] = None,
+            **kwargs: Any,
+    ) -> List[Document]:
+        """Query the chroma collection."""
+        try:
+            import chromadb  # noqa: F401
+        except ImportError:
+            raise ValueError(
+                "Could not import chromadb python package. "
+                "Please install it with `pip install chromadb`."
+            )
+        return self._collection.query(
+            query_texts=query_texts,
+            query_embeddings=query_embeddings,
+            n_results=n_results,
+            where=where,
+            where_document=where_document,
+            **kwargs,
+        )
+
+    def advanced_similarity_search(
+            self,
+            query: str,
+            k: int = DEFAULT_K,
+            filter: Optional[Dict[str, str]] = None,
+            **kwargs: Any,
+    ) -> [List[Document], float, List[float]]:
+        docs_scores_and_embeddings = self.similarity_search_with_scores_and_embeddings(query, k, filter=filter)
+        return docs_scores_and_embeddings
+
+    def similarity_search_with_scores_and_embeddings(
+            self,
+            query: str,
+            k: int = DEFAULT_K,
+            filter: Optional[Dict[str, str]] = None,
+            where_document: Optional[Dict[str, str]] = None,
+            **kwargs: Any,
+    ) -> List[Tuple[Document, float, List[float]]]:
+
+        if self._embedding_function is None:
+            results = self.__query_collection(
+                query_texts=[query],
+                n_results=k,
+                where=filter,
+                where_document=where_document,
+                include=['metadatas', 'documents', 'embeddings', 'distances']
+            )
+        else:
+            query_embedding = self._embedding_function.embed_query(query)
+            results = self.__query_collection(
+                query_embeddings=[query_embedding],
+                n_results=k,
+                where=filter,
+                where_document=where_document,
+                include=['metadatas', 'documents', 'embeddings', 'distances']
+            )
+
+        return _results_to_docs_scores_and_embeddings(results)
+
+
+def _results_to_docs_scores_and_embeddings(results: Any) -> List[Tuple[Document, float, List[float]]]:
+    return [
+        (Document(page_content=result[0], metadata=result[1] or {}), result[2], result[3])
+        for result in zip(
+            results["documents"][0],
+            results["metadatas"][0],
+            results["distances"][0],
+            results["embeddings"][0],
+        )
+    ]
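Note on the new module: ChromaAdvancedRetrieval together with the "similarity_with_embeddings" search type is what lets the engine read scores and embeddings straight from document metadata. A rough usage sketch, assuming a Chroma collection and a sentence-transformers embedding backend (both are placeholders, not part of this commit):

from langchain_community.embeddings import HuggingFaceEmbeddings

from document_qa.langchain import ChromaAdvancedRetrieval

# placeholder embedding function and collection name
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
db = ChromaAdvancedRetrieval(collection_name="doc_123", embedding_function=embeddings)

retriever = db.as_retriever(search_kwargs={"k": 4}, search_type="similarity_with_embeddings")
docs = retriever.get_relevant_documents("What is the critical temperature?")

for doc in docs:
    # '__similarity' and '__embeddings' are filled in by AdvancedVectorStoreRetriever
    print(doc.metadata['__similarity'], len(doc.metadata['__embeddings']))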
requirements.txt
CHANGED
@@ -24,4 +24,6 @@ typing-inspect==0.9.0
 typing_extensions==4.11.0
 pydantic==2.6.4
 sentence_transformers==2.6.1
-streamlit-pdf-viewer
+streamlit-pdf-viewer
+umap-learn
+plotly
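Note on the new dependencies: umap-learn and plotly are not exercised by any code in this commit (the QueryVisualiser import above is still commented out), so presumably they are intended for projecting the stored chunk embeddings for inspection. A minimal sketch of that idea, with random vectors standing in for the dict that db.get(include=['documents', 'metadatas', 'embeddings']) returns:

import numpy as np
import plotly.express as px
import umap

# stand-in for the Chroma get() result used in analyse_query
rng = np.random.default_rng(0)
all_documents = {
    "documents": [f"chunk {i}" for i in range(50)],
    "embeddings": rng.normal(size=(50, 384)),
}

# project the chunk embeddings to 2D and plot them
embeddings_2d = umap.UMAP(n_components=2, random_state=0).fit_transform(all_documents["embeddings"])
fig = px.scatter(x=embeddings_2d[:, 0], y=embeddings_2d[:, 1], hover_name=all_documents["documents"])
fig.show()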