Spaces:
Running
Running
decouple quantities and superconductors
Browse files
- document_qa/grobid_processors.py +17 -53
document_qa/grobid_processors.py
CHANGED
@@ -7,7 +7,6 @@ import dateparser
|
|
7 |
import grobid_tei_xml
|
8 |
from bs4 import BeautifulSoup
|
9 |
from grobid_client.grobid_client import GrobidClient
|
10 |
-
from tqdm import tqdm
|
11 |
|
12 |
|
13 |
def get_span_start(type, title=None):
|
@@ -55,49 +54,6 @@ def decorate_text_with_annotations(text, spans, tag="span"):
|
|
55 |
return annotated_text
|
56 |
|
57 |
|
58 |
-
def extract_quantities(client, x_all, column_text_index):
|
59 |
-
# relevant_items = ['magnetic field strength', 'magnetic induction', 'maximum energy product',
|
60 |
-
# "magnetic flux density", "magnetic flux"]
|
61 |
-
# property_keywords = ['coercivity', 'remanence']
|
62 |
-
|
63 |
-
output_data = []
|
64 |
-
|
65 |
-
for idx, example in tqdm(enumerate(x_all), desc="extract quantities"):
|
66 |
-
text = example[column_text_index]
|
67 |
-
spans = GrobidQuantitiesProcessor(client).extract_quantities(text)
|
68 |
-
|
69 |
-
data_record = {
|
70 |
-
"id": example[0],
|
71 |
-
"filename": example[1],
|
72 |
-
"passage_id": example[2],
|
73 |
-
"text": text,
|
74 |
-
"spans": spans
|
75 |
-
}
|
76 |
-
|
77 |
-
output_data.append(data_record)
|
78 |
-
|
79 |
-
return output_data
|
80 |
-
|
81 |
-
|
82 |
-
def extract_materials(client, x_all, column_text_index):
|
83 |
-
output_data = []
|
84 |
-
|
85 |
-
for idx, example in tqdm(enumerate(x_all), desc="extract materials"):
|
86 |
-
text = example[column_text_index]
|
87 |
-
spans = GrobidMaterialsProcessor(client).extract_materials(text)
|
88 |
-
data_record = {
|
89 |
-
"id": example[0],
|
90 |
-
"filename": example[1],
|
91 |
-
"passage_id": example[2],
|
92 |
-
"text": text,
|
93 |
-
"spans": spans
|
94 |
-
}
|
95 |
-
|
96 |
-
output_data.append(data_record)
|
97 |
-
|
98 |
-
return output_data
|
99 |
-
|
100 |
-
|
101 |
def get_parsed_value_type(quantity):
|
102 |
if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
|
103 |
return quantity['parsedValue']['structure']['type']
|
@@ -199,7 +155,7 @@ class GrobidProcessor(BaseProcessor):
|
|
199 |
"subSection": "<title>",
|
200 |
"passage_id": "htitle",
|
201 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
202 |
-
|
203 |
})
|
204 |
|
205 |
passages.append({
|
@@ -302,7 +258,7 @@ class GrobidQuantitiesProcessor(BaseProcessor):
|
|
302 |
def __init__(self, grobid_quantities_client):
|
303 |
self.grobid_quantities_client = grobid_quantities_client
|
304 |
|
305 |
-
def extract_quantities(self, text):
|
306 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
307 |
|
308 |
if status != 200:
|
@@ -570,11 +526,12 @@ class GrobidMaterialsProcessor(BaseProcessor):
|
|
570 |
return materials
|
571 |
|
572 |
|
573 |
-
class GrobidAggregationProcessor(
|
574 |
-
def __init__(self,
|
575 |
-
|
576 |
-
|
577 |
-
|
|
|
578 |
|
579 |
def process_single_text(self, text):
|
580 |
extracted_quantities_spans = self.gqp.extract_quantities(text)
|
@@ -584,10 +541,17 @@ class GrobidAggregationProcessor(GrobidProcessor, GrobidQuantitiesProcessor, Gro
|
|
584 |
return entities
|
585 |
|
586 |
def extract_quantities(self, text):
|
587 |
-
|
|
|
|
|
|
|
|
|
588 |
|
589 |
def extract_materials(self, text):
|
590 |
-
|
|
|
|
|
|
|
591 |
|
592 |
@staticmethod
|
593 |
def box_to_dict(box, color=None, type=None):
|
|
|
7 |
import grobid_tei_xml
|
8 |
from bs4 import BeautifulSoup
|
9 |
from grobid_client.grobid_client import GrobidClient
|
|
|
10 |
|
11 |
|
12 |
def get_span_start(type, title=None):
|
|
|
54 |
return annotated_text
|
55 |
|
56 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
def get_parsed_value_type(quantity):
|
58 |
if 'parsedValue' in quantity and 'structure' in quantity['parsedValue']:
|
59 |
return quantity['parsedValue']['structure']['type']
|
|
|
155 |
"subSection": "<title>",
|
156 |
"passage_id": "htitle",
|
157 |
"coordinates": ";".join([node['coords'] if coordinates and node.has_attr('coords') else "" for node in
|
158 |
+
blocks_header['authors']])
|
159 |
})
|
160 |
|
161 |
passages.append({
|
|
|
258 |
def __init__(self, grobid_quantities_client):
|
259 |
self.grobid_quantities_client = grobid_quantities_client
|
260 |
|
261 |
+
def extract_quantities(self, text) -> list:
|
262 |
status, result = self.grobid_quantities_client.process_text(text.strip())
|
263 |
|
264 |
if status != 200:
|
|
|
526 |
return materials
|
527 |
|
528 |
|
529 |
+
class GrobidAggregationProcessor(GrobidQuantitiesProcessor, GrobidMaterialsProcessor):
|
530 |
+
def __init__(self, grobid_quantities_client=None, grobid_superconductors_client=None):
|
531 |
+
if grobid_quantities_client:
|
532 |
+
self.gqp = GrobidQuantitiesProcessor(grobid_quantities_client)
|
533 |
+
if grobid_superconductors_client:
|
534 |
+
self.gmp = GrobidMaterialsProcessor(grobid_superconductors_client)
|
535 |
|
536 |
def process_single_text(self, text):
|
537 |
extracted_quantities_spans = self.gqp.extract_quantities(text)
|
|
|
541 |
return entities
|
542 |
|
543 |
def extract_quantities(self, text):
|
544 |
+
if self.gqp:
|
545 |
+
return self.gqp.extract_quantities(text)
|
546 |
+
else:
|
547 |
+
return []
|
548 |
+
|
549 |
|
550 |
def extract_materials(self, text):
|
551 |
+
if self.gmp:
|
552 |
+
return self.gmp.extract_materials(text)
|
553 |
+
else:
|
554 |
+
return []
|
555 |
|
556 |
@staticmethod
|
557 |
def box_to_dict(box, color=None, type=None):
|