Spaces:

wing-nus
/

SciAssist

Sleeping

App Files Files Community

SciAssist / controlled_summarization.py

wing-nus

fix bug

fcc0878 11 months ago

raw

history blame contribute delete

9.4 kB

	from typing import List, Tuple
	import torch
	from SciAssist import Summarization
	import os
	import requests
	from datasets import load_dataset

	# Probe CUDA once and reuse the answer for both the log line and the branch.
	_cuda_ok = torch.cuda.is_available()
	print(f"Is CUDA available: {_cuda_ok}")
	if not _cuda_ok:
	    # No GPU: build the pipeline on CPU with the library's default model.
	    device = 'cpu'
	    ctrlsum_pipeline = Summarization(os_name="nt", device=device)
	else:
	    # GPU present: report it and load flan-t5-xl with the
	    # dyxohjl666/flant5-xl-cocoscisum checkpoint.
	    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
	    device = 'gpu'
	    ctrlsum_pipeline = Summarization(
	        os_name="nt",
	        model_name="flan-t5-xl",
	        checkpoint="dyxohjl666/flant5-xl-cocoscisum",
	        device=device,
	    )


	# Cache of precomputed summaries, filled by convert_to_dict():
	#   {pdf_url: {length: {keywords: summary}, "raw_text": str}}
	acl_dict = {}
	# Per-URL list of distinct, non-empty keywords seen in the dataset,
	# filled by convert_to_dict().
	recommended_kw = {}
	# Load the "dyxohjl666/CocoScisum_ACL" dataset at its "refs/convert/parquet" revision.
	acl_data = load_dataset("dyxohjl666/CocoScisum_ACL", revision="refs/convert/parquet")


	def convert_to_dict(data):
	    """Index one dataset split into the module-level caches.

	    Fills ``acl_dict`` with the layout below and ``recommended_kw`` with,
	    per URL, the list of distinct non-empty keywords encountered.

	    Dict:
	    { url:
	        {length:
	            {keywords: summary};
	         raw_text:
	            str;
	        }
	    }

	    Parameters:
	    data: mapping with parallel columns "url", "text", "keywords",
	        "length" and "summary".

	    Returns:
	    int: always 1.
	    """
	    rows = zip(data["url"], data["text"], data["keywords"],
	               data["length"], data["summary"])
	    for u, t, k, l, s in rows:
	        # Rows without a usable URL cannot be looked up later; skip them.
	        if u is None or len(u) < 5:
	            continue
	        u = u + ".pdf"
	        k = "" if k is None else str(k).strip()
	        l = "" if l is None else str(l).strip()

	        entry = acl_dict.get(u)
	        if entry is None:
	            entry = {"": {}, "50": {}, "100": {}, "200": {}, "raw_text": t}
	            acl_dict[u] = entry
	        # Store the summary even for the row that creates the entry
	        # (previously that first row was dropped), and tolerate length
	        # values outside the four predefined buckets instead of raising
	        # KeyError. The first summary seen for a (length, keywords) pair wins.
	        entry.setdefault(l, {}).setdefault(k, s)

	        # kws
	        seen_kws = recommended_kw.setdefault(u, [])
	        if k != "" and k not in seen_kws:
	            seen_kws.append(k)
	    return 1


	# Index every split of the loaded dataset into acl_dict / recommended_kw.
	# `signal` only receives convert_to_dict's constant return value (1).
	for i in acl_data.keys():
	signal = convert_to_dict(acl_data[i])


	def download_pdf(url, dest_folder, timeout=30):
	    """
	    Download a PDF from a given URL and save it to a specified destination folder.

	    Parameters:
	    url (str): URL of the PDF
	    dest_folder (str): Destination folder to save the downloaded PDF (created if missing)
	    timeout (float): Seconds to wait on the server before giving up

	    Returns:
	    str: Path of the saved file (dest_folder joined with the last "/"-separated part of url)

	    Raises:
	    requests.HTTPError: if the server answers with an error status
	    requests.RequestException: on connection failures or timeouts
	    """
	    # exist_ok avoids the check-then-create race of exists() + makedirs().
	    os.makedirs(dest_folder, exist_ok=True)
	    filename = os.path.join(dest_folder, url.split("/")[-1])

	    # The context manager releases the connection even if writing fails;
	    # the timeout keeps a stalled server from hanging the app forever.
	    with requests.get(url, stream=True, timeout=timeout) as response:
	        # Fail loudly instead of saving an HTML error page as a ".pdf".
	        response.raise_for_status()
	        with open(filename, 'wb') as file:
	            for chunk in response.iter_content(chunk_size=1024):
	                if chunk:
	                    file.write(chunk)
	    print(f"Downloaded {url} to {filename}")
	    return filename


	def ctrlsum_for_str(input, length=None, keywords=None) -> str:
	    """Summarize a raw text string with optional length / keyword control.

	    Parameters:
	    input (str): Text to summarize
	    length: Length control value forwarded to the pipeline; 0 or None
	        disables length control
	    keywords (str): Comma-separated keywords; None or blank means no
	        keyword control

	    Returns:
	    str: The generated summaries, each followed by a blank line
	    """
	    if keywords is not None:
	        # Trim every keyword and drop blanks so "a, b" and ",a" behave like
	        # "a,b" and "a"; an all-blank string means "no keywords".
	        keywords = [kw.strip() for kw in keywords.split(",") if kw.strip()] or None
	    if length == 0:
	        length = None
	    results = ctrlsum_pipeline.predict(input, type="str",
	                                       length=length, keywords=keywords, num_beams=1)

	    return "".join(f"{res}\n\n" for res in results["summary"])


	def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> Tuple[str, str, str]:
	    """Summarize a PDF URL, an uploaded file, or pasted text.

	    The source is chosen in this order: ``url`` (if non-blank), then
	    ``input``, then ``text``.

	    Parameters:
	    input: Uploaded file object; only its ``.name`` attribute (a path
	        ending in ".txt" or ".pdf") is read
	    length: Length control value; 0 or None disables length control
	    keywords (str): Comma-separated keywords; blank or None means none
	    text (str): Raw text, used only when neither ``input`` nor ``url`` is given
	    url (str): URL of a PDF; URLs present in ``acl_dict`` are served from
	        the cached dataset instead of being downloaded

	    Returns:
	    tuple: ``(summary_or_message, raw_text, filename)``; elements that do
	        not apply to the chosen path are None
	    """
	    # Normalise once so every branch below sees the same values.
	    url = (url or "").strip()
	    keywords = (keywords or "").strip()
	    if length == 0:
	        length = None

	    if input is None and url == "":
	        if not text:
	            return None, "Input cannot be left blank.", None
	        return ctrlsum_for_str(text, length, keywords), text, None

	    if url != "":
	        if not (len(url) > 4 and url.endswith(".pdf")):
	            # The original built this tuple but never returned it.
	            return "Invalid url(Not PDF)!", None, None
	        if url in acl_dict:
	            entry = acl_dict[url]
	            raw_text = entry["raw_text"]
	            # "" is the cache bucket for "no length control".
	            cached = entry.get("" if length is None else str(length))
	            # isinstance guards against the "raw_text" slot, which is a str.
	            if isinstance(cached, dict) and keywords in cached:
	                return cached[keywords], raw_text, None
	            # Cache miss: summarize the cached paper text on the fly.
	            wanted_length = None if length is None else int(length)
	            return ctrlsum_for_str(raw_text, wanted_length, keywords or None), raw_text, None
	        filename = download_pdf(url, './cache/')
	    else:
	        filename = input.name

	    # Trim each keyword and drop blanks; no keywords at all becomes None,
	    # matching what ctrlsum_for_str passes to the pipeline.
	    keyword_list = [kw.strip() for kw in keywords.split(",") if kw.strip()] or None

	    # Identify the format of input and parse reference strings
	    if filename[-4:] == ".txt":
	        results = ctrlsum_pipeline.predict(filename, type="txt",
	                                           save_results=False,
	                                           length=length, keywords=keyword_list, num_beams=1)
	    elif filename[-4:] == ".pdf":
	        results = ctrlsum_pipeline.predict(filename,
	                                           save_results=False, length=length, keywords=keyword_list, num_beams=1)
	    else:
	        return "File Format Error !", None, filename

	    summary = "".join(f"{res}\n\n" for res in results["summary"])
	    return summary, results["raw_text"], filename


	# Example input text for string summarization: an excerpt from the
	# introduction of the BERT paper (arXiv:1810.04805), pre-tokenized with
	# spaces around punctuation.
	ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . 
Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . 
• BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "