lfoppiano committed
Commit
ae04b9d
1 Parent(s): 0b28b48

fix import, and reformat

document_qa/document_qa_engine.py CHANGED
@@ -12,7 +12,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import Chroma
 from tqdm import tqdm
 
-from grobid_processors import GrobidProcessor
+from document_qa.grobid_processors import GrobidProcessor
 
 
 class DocumentQAEngine:
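
This hunk is the "fix import" half of the commit: `grobid_processors` must be resolved as a submodule of the `document_qa` package once the engine is imported from outside that directory. A minimal sketch of the failure mode, assuming the app is launched from the repository root (e.g. `streamlit run streamlit_app.py`), so `sys.path` contains the root but not `document_qa/` itself:

# Sketch of the failure the absolute import fixes; assumes the usual launch
# from the repository root, so sys.path contains the root directory but not
# document_qa/ itself.

# Old import: searches sys.path for a top-level `grobid_processors` module
# and raises ModuleNotFoundError when run from the root.
# from grobid_processors import GrobidProcessor

# New import: resolves grobid_processors as a submodule of the document_qa
# package, which is visible from the repository root.
from document_qa.grobid_processors import GrobidProcessor
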
document_qa/grobid_processors.py CHANGED
@@ -413,7 +413,8 @@ class GrobidMaterialsProcessor(BaseProcessor):
 
     def extract_materials(self, text):
         preprocessed_text = text.strip()
-        status, result = self.grobid_superconductors_client.process_text(preprocessed_text, "processText_disable_linking")
+        status, result = self.grobid_superconductors_client.process_text(preprocessed_text,
+                                                                         "processText_disable_linking")
 
         if status != 200:
             result = {}
@@ -679,6 +680,7 @@ class XmlProcessor(BaseProcessor):
 
         return output_data
 
+
 def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
     children = []
 
@@ -697,6 +699,7 @@ def get_children_list_supermat(soup, use_paragraphs=False, verbose=False):
 
     return children
 
+
 def get_children_list_grobid(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
     children = []
 
@@ -739,4 +742,4 @@ def get_children_figures(soup: object, use_paragraphs: object = True, verbose: object = False) -> object:
     if verbose:
         print(str(children))
 
-    return children
+    return children
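
The remaining hunks in this file are formatting-only: the long `process_text` call is wrapped at its argument list, and the top-level helper functions gain the second blank line that PEP 8 expects between top-level definitions. A generic sketch of both conventions, with hypothetical names:

# Hypothetical example of the two PEP 8 conventions applied by the reformat.

def parse(text,
          strict=True):  # wrapped signature: continuation aligned with the open paren
    return text.strip() if strict else text


# Two blank lines separate top-level definitions.
def parse_all(texts):
    return [parse(t) for t in texts]
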
streamlit_app.py CHANGED
@@ -42,6 +42,7 @@ if 'git_rev' not in st.session_state:
 if "messages" not in st.session_state:
     st.session_state.messages = []
 
+
 def new_file():
     st.session_state['loaded_embeddings'] = None
     st.session_state['doc_id'] = None
@@ -69,6 +70,7 @@ def init_qa(model):
 
     return DocumentQAEngine(chat, embeddings, grobid_url=os.environ['GROBID_URL'])
 
+
 @st.cache_resource
 def init_ner():
     quantities_client = QuantitiesAPI(os.environ['GROBID_QUANTITIES_URL'], check_server=True)
@@ -89,14 +91,16 @@ def init_ner():
     materials_client.set_config(config_materials)
 
     gqa = GrobidAggregationProcessor(None,
-                                     grobid_quantities_client=quantities_client,
-                                     grobid_superconductors_client=materials_client
-                                     )
+                                     grobid_quantities_client=quantities_client,
+                                     grobid_superconductors_client=materials_client
+                                     )
 
     return gqa
 
+
 gqa = init_ner()
 
+
 def get_file_hash(fname):
     hash_md5 = blake2b()
     with open(fname, "rb") as f:
@@ -122,7 +126,7 @@ def play_old_messages():
 is_api_key_provided = st.session_state['api_key']
 
 model = st.sidebar.radio("Model (cannot be changed after selection or upload)",
-                         ("chatgpt-3.5-turbo", "mistral-7b-instruct-v0.1"),#, "llama-2-70b-chat"),
+                         ("chatgpt-3.5-turbo", "mistral-7b-instruct-v0.1"),  # , "llama-2-70b-chat"),
                          index=1,
                          captions=[
                              "ChatGPT 3.5 Turbo + Ada-002-text (embeddings)",
@@ -134,13 +138,15 @@ model = st.sidebar.radio("Model (cannot be changed after selection or upload)",
 
 if not st.session_state['api_key']:
     if model == 'mistral-7b-instruct-v0.1' or model == 'llama-2-70b-chat':
-        api_key = st.sidebar.text_input('Huggingface API Key', type="password")# if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ else os.environ['HUGGINGFACEHUB_API_TOKEN']
+        api_key = st.sidebar.text_input('Huggingface API Key',
+                                        type="password")  # if 'HUGGINGFACEHUB_API_TOKEN' not in os.environ else os.environ['HUGGINGFACEHUB_API_TOKEN']
         if api_key:
             st.session_state['api_key'] = is_api_key_provided = True
             os.environ["HUGGINGFACEHUB_API_TOKEN"] = api_key
             st.session_state['rqa'] = init_qa(model)
     elif model == 'chatgpt-3.5-turbo':
-        api_key = st.sidebar.text_input('OpenAI API Key', type="password") #if 'OPENAI_API_KEY' not in os.environ else os.environ['OPENAI_API_KEY']
+        api_key = st.sidebar.text_input('OpenAI API Key',
+                                        type="password")  # if 'OPENAI_API_KEY' not in os.environ else os.environ['OPENAI_API_KEY']
         if api_key:
             st.session_state['api_key'] = is_api_key_provided = True
             os.environ['OPENAI_API_KEY'] = api_key
@@ -177,10 +183,12 @@ with st.sidebar:
     st.markdown(
         """After entering your API Key (Open AI or Huggingface). Upload a scientific article as PDF document. You will see a spinner or loading indicator while the processing is in progress. Once the spinner stops, you can proceed to ask your questions.""")
 
-    st.markdown('**NER on LLM responses**: The responses from the LLMs are post-processed to extract <span style="color:orange">physical quantities, measurements</span> and <span style="color:green">materials</span> mentions.', unsafe_allow_html=True)
+    st.markdown(
+        '**NER on LLM responses**: The responses from the LLMs are post-processed to extract <span style="color:orange">physical quantities, measurements</span> and <span style="color:green">materials</span> mentions.',
+        unsafe_allow_html=True)
     if st.session_state['git_rev'] != "unknown":
         st.markdown("**Revision number**: [" + st.session_state[
-            'git_rev'] + "](https://github.com/lfoppiano/document-qa/commit/" + st.session_state['git_rev'] + ")")
+            'git_rev'] + "](https://github.com/lfoppiano/document-qa/commit/" + st.session_state['git_rev'] + ")")
 
     st.header("Query mode (Advanced use)")
     st.markdown(
@@ -219,11 +227,11 @@ if st.session_state.loaded_embeddings and question and len(question) > 0 and st.
     if mode == "Embeddings":
         with st.spinner("Generating LLM response..."):
             text_response = st.session_state['rqa'].query_storage(question, st.session_state.doc_id,
-                                                                  context_size=context_size)
+                                                                  context_size=context_size)
     elif mode == "LLM":
         with st.spinner("Generating response..."):
             _, text_response = st.session_state['rqa'].query_document(question, st.session_state.doc_id,
-                                                                      context_size=context_size)
+                                                                      context_size=context_size)
 
     if not text_response:
         st.error("Something went wrong. Contact Luca Foppiano ([email protected]) to report the issue.")
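
One detail worth noting in `streamlit_app.py`: `init_ner` sits behind Streamlit's `@st.cache_resource`, so the GROBID clients are constructed once per process and reused across script reruns; the blank-line and indentation hunks above do not change that behavior. A minimal sketch of the caching pattern, using a hypothetical client and URL in place of `QuantitiesAPI`:

import streamlit as st


class RemoteClient:
    """Hypothetical stand-in for the GROBID quantities/materials clients."""

    def __init__(self, url):
        self.url = url


@st.cache_resource  # the decorated body runs once per process; reruns reuse the object
def init_client():
    return RemoteClient("http://localhost:8060")  # assumed URL, for illustration only


client = init_client()
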