# Spaces: Sleeping
from typing import List, Tuple | |
import torch | |
from SciAssist import Summarization | |
import os | |
import requests | |
from datasets import load_dataset | |
# Report the accelerator at start-up. The device name is queried only when
# CUDA is present, because torch.cuda.current_device() raises on CPU-only
# hosts and the rest of this module already supports a CPU fallback.
print(f"Is CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

# Pre-computed summaries, loaded from the dataset's "refs/convert/parquet" revision.
acl_data = load_dataset("dyxohjl666/CocoScisum_ACL", revision="refs/convert/parquet")

# The summarization pipeline is told "gpu" or "cpu" depending on CUDA availability.
device = "gpu" if torch.cuda.is_available() else "cpu"
ctrlsum_pipeline = Summarization(os_name="nt", checkpoint="dyxohjl666/flant5-xl-cocoscisum", device=device)

# url -> {length: {keywords: summary}, "raw_text": str}; filled by convert_to_dict().
acl_dict = {}
# url -> list of distinct non-empty keywords seen for that url; filled by convert_to_dict().
recommended_kw = {}
def convert_to_dict(data):
    """Index one dataset split into the module-level lookup tables.

    Fills ``acl_dict`` with the layout::

        { url:
            { length:
                {keywords: summary},
              "raw_text": str,
            }
        }

    and ``recommended_kw`` with ``{url: [keyword, ...]}``.

    Parameters:
        data: Column-oriented mapping with the keys "url", "text",
            "keywords", "length" and "summary"; the columns are iterated
            in lockstep.

    Returns:
        int: Always 1.
    """
    rows = zip(data["url"], data["text"], data["keywords"], data["length"], data["summary"])
    for u, t, k, l, s in rows:
        # Rows without a usable URL cannot be looked up later, so skip them.
        if u is None or len(u) < 5:
            continue
        u = u + ".pdf"

        # Missing keywords / length mean "uncontrolled" and are keyed by "".
        k = "" if k is None else str(k).strip()
        l = "" if l is None else str(l).strip()

        entry = acl_dict.get(u)
        if entry is None:
            entry = {"": {}, "50": {}, "100": {}, "200": {}, "raw_text": t}
            acl_dict[u] = entry

        # setdefault tolerates lengths outside the pre-seeded buckets instead
        # of raising KeyError.
        bucket = entry.setdefault(l, {})
        if k in bucket:
            # Duplicate (length, keywords) pair: keep the first summary.
            continue
        # Also reached for the first row of a URL, whose summary would
        # otherwise never be stored.
        bucket[k] = s

        # Collect distinct non-empty keywords per URL.
        kws = recommended_kw.setdefault(u, [])
        if k != "" and k not in kws:
            kws.append(k)
    return 1
# Feed every split of the loaded dataset through convert_to_dict so the
# lookup tables are populated at import time.
for i in acl_data:
    signal = convert_to_dict(acl_data[i])
def download_pdf(url, dest_folder, timeout=30):
    """
    Download a PDF from a given URL and save it to a specified destination folder.

    Parameters:
        url (str): URL of the PDF
        dest_folder (str): Destination folder to save the downloaded PDF
        timeout (float): Seconds to wait for the server before giving up

    Returns:
        str: Path of the saved file (dest_folder joined with the last URL segment)

    Raises:
        requests.HTTPError: if the server answers with an error status
        requests.RequestException: on connection problems or timeout
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(dest_folder, exist_ok=True)
    filename = os.path.join(dest_folder, url.split("/")[-1])

    # The context manager releases the streamed connection even on errors.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail before opening the file so an error page is never saved as a PDF.
        response.raise_for_status()
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)

    print(f"Downloaded {url} to {filename}")
    return filename
def ctrlsum_for_str(input, length=None, keywords=None) -> str:
    """Summarize a text string with optional length and keyword control.

    Parameters:
        input (str): Text to summarize.
        length: Desired summary length; 0, None, or a non-numeric string
            (such as "" or "None") disables length control. Numeric strings
            are converted to int, because one caller in this module passes
            the length as text.
        keywords (str | None): Comma-separated keywords; None, or a string
            with no non-blank items, disables keyword control.

    Returns:
        str: Every summary returned by the pipeline, each followed by a
        blank line, concatenated into one string.
    """
    keyword_list = None
    if keywords is not None:
        # Strip each item and drop blanks so "a, b" and ",a" behave sensibly.
        keyword_list = [kw.strip() for kw in keywords.split(",") if kw.strip()] or None

    if isinstance(length, str):
        length = length.strip()
        length = int(length) if length.isdigit() else None
    if not length:
        length = None

    results = ctrlsum_pipeline.predict(input, type="str",
                                       length=length, keywords=keyword_list, num_beams=1)

    return "".join(f"{res}\n\n" for res in results["summary"])
def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> tuple:
    """Summarize an uploaded file, a PDF URL, or pasted text.

    Source priority: a non-blank ``url`` wins, then the uploaded ``input``
    file, then ``text``.

    Parameters:
        input: Uploaded file object; only its ``.name`` (a path) is read.
        length: Desired summary length; 0 or None disables length control.
        keywords (str): Comma-separated keywords; blank disables keyword control.
        text (str): Raw text, used only when neither file nor URL is given.
        url (str): URL ending in "pdf". URLs present in ``acl_dict`` are
            answered from the pre-computed data without downloading.

    Returns:
        tuple: ``(summary, raw_text, filename)``. Error cases return a message
        in place of a result:
        ``(None, "Input cannot be left blank.", None)``,
        ``("Invalid url(Not PDF)!", None, None)`` or
        ``("File Format Error !", None, filename)``.
    """
    url = (url or "").strip()
    keywords = (keywords or "").strip()
    # 0 and None both mean "no length control".
    if not length:
        length = None

    # Plain-text mode: no file and no URL.
    if input is None and url == "":
        if not text:
            return None, "Input cannot be left blank.", None
        return ctrlsum_for_str(text, length, keywords), text, None

    if url != "":
        if not (len(url) > 4 and url[-3:] == "pdf"):
            return "Invalid url(Not PDF)!", None, None

        cached = acl_dict.get(url)
        if cached is not None:
            raw_text = cached["raw_text"]
            # Uncontrolled length is stored under the "" bucket.
            length_key = "" if length is None else str(length)
            bucket = cached.get(length_key)
            if isinstance(bucket, dict) and keywords in bucket:
                return bucket[keywords], raw_text, None
            # No stored summary for this combination: generate one from the
            # stored text, passing the numeric length rather than its string form.
            return ctrlsum_for_str(raw_text, length, keywords or None), raw_text, None

        filename = download_pdf(url, './cache/')
    else:
        filename = input.name

    # Strip each keyword and drop blanks; an empty list means "no keywords".
    keyword_list = [kw.strip() for kw in keywords.split(",") if kw.strip()] or None

    # Identify the format of input and run the matching pipeline mode
    if filename[-4:] == ".txt":
        results = ctrlsum_pipeline.predict(filename, type="txt",
                                           save_results=False,
                                           length=length, keywords=keyword_list, num_beams=1)
    elif filename[-4:] == ".pdf":
        results = ctrlsum_pipeline.predict(filename,
                                           save_results=False, length=length, keywords=keyword_list, num_beams=1)
    else:
        return "File Format Error !", None, filename

    summary = "".join(f"{res}\n\n" for res in results["summary"])
    return summary, results["raw_text"], filename
# Sample input text (an excerpt that cites arXiv:1810.04805v2); presumably the
# demo input for the string-summarization mode -- confirm against the UI code.
# The literal is split at its original line breaks, which are written as
# explicit "\n" so the statement is valid Python; the text itself is unchanged.
ctrlsum_str_example = (
    "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . \n"
    "Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . \n"
    "• BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
)