update dependencies, remove biblio from search space
- document_qa/document_qa_engine.py +4 -5
- document_qa/grobid_processors.py +9 -9
- requirements.txt +8 -5
- streamlit_app.py +20 -11
document_qa/document_qa_engine.py
CHANGED
@@ -7,7 +7,6 @@ import tiktoken
 from langchain.chains import create_extraction_chain
 from langchain.chains.question_answering import load_qa_chain, stuff_prompt, refine_prompts, map_reduce_prompt, \
     map_rerank_prompt
-from langchain.evaluation import PairwiseEmbeddingDistanceEvalChain, load_evaluator, EmbeddingDistance
 from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate
 from langchain.retrievers import MultiQueryRetriever
 from langchain.schema import Document
@@ -273,7 +272,7 @@ class DocumentQAEngine:
         """
         db = self.data_storage.embeddings_dict[doc_id]
         retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
-        relevant_documents = retriever.get_relevant_documents(query)
+        relevant_documents = retriever.invoke(query)

         return relevant_documents

@@ -284,7 +283,7 @@ class DocumentQAEngine:
         #     search_type="similarity_score_threshold"
         # )
         retriever = db.as_retriever(search_kwargs={"k": context_size}, search_type="similarity_with_embeddings")
-        relevant_documents = retriever.get_relevant_documents(query)
+        relevant_documents = retriever.invoke(query)
         relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
                                          for doc in
                                          relevant_documents]
@@ -338,7 +337,7 @@ class DocumentQAEngine:
     def _get_context(self, doc_id, query, context_size=4) -> (List[Document], list):
         db = self.data_storage.embeddings_dict[doc_id]
         retriever = db.as_retriever(search_kwargs={"k": context_size})
-        relevant_documents = retriever.get_relevant_documents(query)
+        relevant_documents = retriever.invoke(query)
         relevant_document_coordinates = [doc.metadata['coordinates'].split(";") if 'coordinates' in doc.metadata else []
                                          for doc in
                                          relevant_documents]
@@ -361,7 +360,7 @@ class DocumentQAEngine:
     def _get_context_multiquery(self, doc_id, query, context_size=4):
         db = self.data_storage.embeddings_dict[doc_id].as_retriever(search_kwargs={"k": context_size})
         multi_query_retriever = MultiQueryRetriever.from_llm(retriever=db, llm=self.llm)
-        relevant_documents = multi_query_retriever.get_relevant_documents(query)
+        relevant_documents = multi_query_retriever.invoke(query)
         return relevant_documents

     def get_text_from_document(self, pdf_file_path, chunk_size=-1, perc_overlap=0.1, verbose=False):
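For context, these hunks apply LangChain's 0.2-era retriever migration: the deprecated get_relevant_documents() call becomes the Runnable invoke() method. A minimal standalone sketch, assuming an in-memory Chroma store; the sample texts and embedding model name are placeholders, not part of this commit:

    # Sketch of the retriever API migration applied above.
    from langchain_community.vectorstores import Chroma
    from langchain_huggingface import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")  # placeholder model
    db = Chroma.from_texts(["passage one", "passage two"], embeddings)
    retriever = db.as_retriever(search_kwargs={"k": 2})

    # Deprecated API removed by this commit:
    # relevant_documents = retriever.get_relevant_documents("what is discussed?")

    # Retrievers are Runnables in langchain 0.2, so the query goes through invoke():
    relevant_documents = retriever.invoke("what is discussed?")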
document_qa/grobid_processors.py
CHANGED
@@ -148,15 +148,15 @@ class GrobidProcessor(BaseProcessor):
         soup = BeautifulSoup(text, 'xml')
         blocks_header = get_xml_nodes_header(soup, use_paragraphs=True)

-        passages.append({
-            "text": f"authors: {biblio['authors']}",
-            "type": passage_type,
-            "section": "<header>",
-            "subSection": "<authors>",
-            "passage_id": "hauthors",
-            "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
-                                     blocks_header['authors']])
-        })
+        # passages.append({
+        #     "text": f"authors: {biblio['authors']}",
+        #     "type": passage_type,
+        #     "section": "<header>",
+        #     "subSection": "<authors>",
+        #     "passage_id": "hauthors",
+        #     "coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
+        #                              blocks_header['authors']])
+        # })

         passages.append({
             "text": self.post_process(" ".join([node.text for node in blocks_header['title']])),
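This hunk is the "remove biblio from search space" half of the commit: the header's authors passage is commented out, so author names are no longer embedded into the vector store or returned as retrieval context. For orientation, a sketch of the record shape that is no longer appended; the field values are illustrative, only the keys mirror the code above:

    # Illustrative <authors> passage that is no longer indexed.
    skipped_passage = {
        "text": "authors: J. Doe, A. Smith",
        "type": "paragraph",
        "section": "<header>",
        "subSection": "<authors>",
        "passage_id": "hauthors",
        "coordinates": "1,54.0,63.2,210.5,10.1",
    }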
requirements.txt
CHANGED
@@ -16,14 +16,17 @@ dateparser

 # LLM
 chromadb==0.4.24
-tiktoken==0.
-openai==1.
-langchain==0.
-langchain-core==0.
+tiktoken==0.7.0
+openai==1.42.0
+langchain==0.2.14
+langchain-core==0.2.34
+langchain-openai==0.1.22
+langchain-huggingface==0.0.3
+langchain-community==0.2.12
 typing-inspect==0.9.0
 typing_extensions==4.11.0
 pydantic==2.6.4
 sentence_transformers==2.6.1
-streamlit-pdf-viewer==0.0.
+streamlit-pdf-viewer==0.0.18-dev1
 umap-learn
 plotly
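The three new pins track the langchain 0.2 package split, where provider integrations moved out of langchain-community into dedicated packages. A sketch of the import mapping these pins support; the first mapping is taken from the streamlit_app.py hunk below, the HuggingFaceEmbeddings origin is an assumption:

    # Where the integration imports live under these pins:
    from langchain_openai import OpenAIEmbeddings            # was langchain_community.embeddings.openai
    from langchain_huggingface import HuggingFaceEmbeddings  # previously under langchain_community
    from langchain_community.chat_models import ChatOpenAI
    from langchain_community.callbacks import PromptLayerCallbackHandler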
streamlit_app.py
CHANGED
@@ -6,10 +6,11 @@ from tempfile import NamedTemporaryFile
 import dotenv
 from grobid_quantities.quantities import QuantitiesAPI
 from langchain.memory import ConversationBufferWindowMemory
-from langchain_community.
-from langchain_community.
-from langchain_community.embeddings.openai import OpenAIEmbeddings
+from langchain_community.callbacks import PromptLayerCallbackHandler
+from langchain_community.chat_models import ChatOpenAI
 from langchain_community.llms.huggingface_endpoint import HuggingFaceEndpoint
+from langchain_huggingface import HuggingFaceEmbeddings
+from langchain_openai import OpenAIEmbeddings
 from streamlit_pdf_viewer import pdf_viewer

 from document_qa.ner_client_generic import NERClientGeneric
@@ -97,6 +98,9 @@ if 'pdf' not in st.session_state:
 if 'embeddings' not in st.session_state:
     st.session_state['embeddings'] = None

+if 'scroll_to_first_annotation' not in st.session_state:
+    st.session_state['scroll_to_first_annotation'] = False
+
 st.set_page_config(
     page_title="Scientific Document Insights Q/A",
     page_icon="π",
@@ -169,7 +173,8 @@ def init_qa(model, embeddings_name=None, api_key=None):
         repo_id=OPEN_MODELS[model],
         temperature=0.01,
         max_new_tokens=4092,
-        model_kwargs={"max_length": 8192}
+        model_kwargs={"max_length": 8192},
+        callbacks=[PromptLayerCallbackHandler(pl_tags=[model, "document-qa"])]
     )
     embeddings = HuggingFaceEmbeddings(
         model_name=OPEN_EMBEDDINGS[embeddings_name])
@@ -233,8 +238,8 @@ def play_old_messages(container):
 # is_api_key_provided = st.session_state['api_key']

 with st.sidebar:
-    st.title("π
-    st.
+    st.title("π Document Q/A")
+    st.markdown("Upload a scientific article in PDF, ask questions, get insights.")
     st.markdown(
         ":warning: [Usage disclaimer](https://github.com/lfoppiano/document-qa?tab=readme-ov-file#disclaimer-on-data-security-and-privacy-%EF%B8%8F) :warning: ")

@@ -301,14 +306,14 @@ with st.sidebar:
     # help="Clear the conversational memory. Currently implemented to retrain the 4 most recent messages.",
     # disabled=model in st.session_state['rqa'] and st.session_state['rqa'][model].memory is None)

-    left_column, right_column = st.columns([
+    left_column, right_column = st.columns([5, 4])
     right_column = right_column.container(border=True)
     left_column = left_column.container(border=True)

     with right_column:
         uploaded_file = st.file_uploader(
-            "Upload
-            type=("pdf"
+            "Upload a scientific article",
+            type=("pdf"),
             on_change=new_file,
             disabled=st.session_state['model'] is not None and st.session_state['model'] not in
                      st.session_state['api_keys'],
@@ -343,6 +348,10 @@ with st.sidebar:
         "relevant paragraphs to the question in the paper. "
         "Question coefficient attempt to estimate how effective the question will be answered."
     )
+    st.session_state['scroll_to_first_annotation'] = st.checkbox(
+        "Scroll to context",
+        help='The PDF viewer will automatically scroll to the first relevant passage in the document.'
+    )
     st.session_state['ner_processing'] = st.checkbox(
         "Identify materials and properties.",
         help='The LLM responses undergo post-processing to extract physical quantities, measurements, and materials mentions.'
@@ -415,7 +424,6 @@ def generate_color_gradient(num_elements):

 with right_column:
     if st.session_state.loaded_embeddings and question and len(question) > 0 and st.session_state.doc_id:
-        # messages.chat_message("user").markdown(question)
         st.session_state.messages.append({"role": "user", "mode": mode, "content": question})

         for message in st.session_state.messages:
@@ -491,5 +499,6 @@ with left_column:
         input=st.session_state['binary'],
         annotation_outline_size=2,
         annotations=st.session_state['annotations'],
-        render_text=True
+        render_text=True,
+        scroll_to_annotation=1 if (st.session_state['annotations'] and st.session_state['scroll_to_first_annotation']) else None
     )
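The new sidebar checkbox and the scroll_to_annotation argument work together: when the user opts in and annotations exist, the viewer jumps to the first annotation, otherwise None disables the behaviour. A minimal sketch, assuming a local sample.pdf and the streamlit-pdf-viewer 0.0.18-dev1 pinned above; the annotation values are placeholders (the app builds these from GROBID coordinates):

    import streamlit as st
    from streamlit_pdf_viewer import pdf_viewer

    # Placeholder annotation in streamlit-pdf-viewer's dict format.
    annotations = [{"page": 1, "x": 60, "y": 540, "width": 480, "height": 14, "color": "red"}]
    scroll_enabled = st.checkbox("Scroll to context")

    with open("sample.pdf", "rb") as f:
        pdf_viewer(
            input=f.read(),
            annotations=annotations,
            render_text=True,
            # index of the annotation to scroll to; 1 targets the first one,
            # mirroring the app's usage above
            scroll_to_annotation=1 if (annotations and scroll_enabled) else None,
        )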