Spaces:

wing-nus
/

SciAssist

Sleeping

kirinzhu committed on May 22, 2023

Commit

6b1a980

•

1 Parent(s): 827202a

Upload dataset_extraction.py (#11)

- Upload dataset_extraction.py (3037a7c96c513edd7a68b8e32bdd57bdad5a64d4)

Co-authored-by: Linxiao Zhu <[email protected]>

Files changed (1) hide show

dataset_extraction.py ADDED Viewed

+from typing import List, Tuple
+import torch
+import nltk
+from SciAssist import DatasetExtraction
# Device label: "gpu" when torch reports CUDA as available, otherwise "cpu".
# NOTE(review): `device` is not referenced anywhere else in the visible code.
if torch.cuda.is_available():
    device = "gpu"
else:
    device = "cpu"

# Module-level dataset-extraction pipeline shared by the handlers below.
de_pipeline = DatasetExtraction(os_name="nt")
def de_for_str(input):
    """Extract dataset mentions from a raw text string.

    The text is split into sentences with ``nltk.sent_tokenize`` and the
    resulting list is handed to ``de_pipeline.extract`` with ``type="str"``.

    Args:
        input: The text to analyse. The parameter name shadows the builtin
            but is kept so existing callers are unaffected.

    Returns:
        A flat list of 2-tuples. For every entry of
        ``results["dataset_mentions"]`` the list holds a tuple of that
        entry's first two items, followed by a separator tuple whose first
        item is two newline characters and whose second item is ``None``.
        An empty list is returned when ``input`` is ``None``, empty, or
        contains only whitespace.
    """
    # Nothing to tokenize: skip the pipeline call instead of feeding it
    # an empty sentence list.
    if not input or not input.strip():
        return []

    sentences = nltk.sent_tokenize(input)
    results = de_pipeline.extract(sentences, type="str", save_results=False)

    output = []
    for mention_pair in results["dataset_mentions"]:
        # Index explicitly rather than unpack: only the first two items
        # of each entry are used.
        output.append((mention_pair[0], mention_pair[1]))
        output.append(("\n\n", None))
    return output
def de_for_file(input):
    """Extract dataset mentions from a ``.txt`` or ``.pdf`` file.

    Args:
        input: An object whose ``name`` attribute holds the file path, or
            ``None``. Presumably an uploaded-file handle; confirm against
            the caller. The parameter name shadows the builtin but is kept
            so existing callers are unaffected.

    Returns:
        ``None`` when ``input`` is ``None``.
        ``[("File Format Error !", None)]`` when the file name does not end
        in ``.txt`` or ``.pdf`` (compared case-insensitively).
        Otherwise a flat list of 2-tuples: for every entry of
        ``results["dataset_mentions"]`` a tuple of that entry's first two
        items, followed by a separator tuple whose first item is two
        newline characters and whose second item is ``None``.
    """
    if input is None:
        return None

    filename = input.name

    # Identify the input format from the file extension. The comparison is
    # case-insensitive so names such as "paper.PDF" are accepted too.
    suffix = filename[-4:].lower()
    if suffix == ".txt":
        file_type = "txt"
    elif suffix == ".pdf":
        file_type = "pdf"
    else:
        return [("File Format Error !", None)]

    results = de_pipeline.extract(filename, type=file_type, save_results=False)

    output = []
    for mention_pair in results["dataset_mentions"]:
        # Index explicitly rather than unpack: only the first two items
        # of each entry are used.
        output.append((mention_pair[0], mention_pair[1]))
        output.append(("\n\n", None))
    return output
# Sample sentence for the string-based dataset-extraction handler.
de_str_example = (
    "BAKIS incorporates information derived from the bank balance sheets "
    "and supervisory reports of all German banks ."
)