from typing import Tuple
import torch
from SciAssist import Summarization
import os
import requests
from datasets import load_dataset
print(f"Is CUDA available: {torch.cuda.is_available()}")
# True
print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

acl_data = load_dataset("dyxohjl666/CocoScisum_ACL", revision="refs/convert/parquet")
device = "gpu" if torch.cuda.is_available() else "cpu"

ctrlsum_pipeline = Summarization(os_name="nt", model_name="flan-t5-xl",
                                 checkpoint="dyxohjl666/flant5-xl-cocoscisum", device=device)
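# Sketch of how the pipeline is invoked further down in this file (the sample
# text and keyword below are placeholders, not part of the app):
#   ctrlsum_pipeline.predict("One paragraph from a paper ...", type="str",
#                            length=50, keywords=["BERT"], num_beams=1)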

acl_dict = {}
recommended_kw = {}


def convert_to_dict(data):
    """Index one dataset split into the two module-level caches:

        acl_dict = { url: { length: { keywords: summary }, "raw_text": text } }
        recommended_kw = { url: [keyword, ...] }
    """
    url = data["url"]
    text = data["text"]
    keywords = data["keywords"]
    length = data["length"]
    summary = data["summary"]
    for u, t, k, l, s in zip(url, text, keywords, length, summary):
        if len(u) < 5:
            continue
        u = u + ".pdf"
        if k is None:
            k = ""
        if l is None:
            l = ""
        k = str(k).strip()
        l = str(l).strip()
        if u not in acl_dict:
            acl_dict[u] = {"": {}, "50": {}, "100": {}, "200": {}, "raw_text": t}
        if k not in acl_dict[u][l]:
            acl_dict[u][l][k] = s

        # Collect the distinct non-empty keywords seen for each paper.
        if u not in recommended_kw:
            recommended_kw[u] = []
        if k != "" and k not in recommended_kw[u]:
            recommended_kw[u].append(k)
    return 1


for i in acl_data.keys():
    signal = convert_to_dict(acl_data[i])
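
# Illustrative shape of the caches built above (URL, keyword, and summary
# values here are made up for illustration):
#   acl_dict["https://aclanthology.org/P19-1234.pdf"] = {
#       "": {"": "<uncontrolled summary>"},
#       "50": {"BERT": "<length-50 summary focused on BERT>"},
#       "100": {}, "200": {},
#       "raw_text": "<full paper text>",
#   }
#   recommended_kw["https://aclanthology.org/P19-1234.pdf"] = ["BERT"]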


def download_pdf(url, dest_folder):
    """
    Download a PDF from a given URL and save it to a specified destination folder.
    Parameters:
        url (str): URL of the PDF
        dest_folder (str): Destination folder to save the downloaded PDF
    """

    if not os.path.exists(dest_folder):
        os.makedirs(dest_folder)

    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()
    filename = os.path.join(dest_folder, url.split("/")[-1])

    with open(filename, 'wb') as file:
        for chunk in response.iter_content(chunk_size=1024):
            if chunk:
                file.write(chunk)
    print(f"Downloaded {url} to {filename}")
    return filename
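
# Usage sketch (the URL is only an example of the expected form):
#   local_path = download_pdf("https://aclanthology.org/P19-1234.pdf", "./cache/")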


def ctrlsum_for_str(input, length=None, keywords=None) -> str:
    """Summarize a plain-text string; `keywords` is comma-separated, `length` a target summary length (e.g. 50/100/200)."""
    if keywords is not None:
        keywords = keywords.strip().split(",")
        if keywords[0] == "":
            keywords = None
    if length == 0 or length is None:
        length = None
    results = ctrlsum_pipeline.predict(input, type="str",
                                       length=length, keywords=keywords, num_beams=1)

    output = []
    for res in results["summary"]:
        output.append(f"{res}\n\n")
    return "".join(output)


def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> Tuple[str, str, str]:
    """Summarize an uploaded file, a pasted string, or a PDF URL; returns (summary, raw_text, filename)."""
    if input is None and url == "":
        if text == "":
            return None, "Input cannot be left blank.", None
        else:
            return ctrlsum_for_str(text, length, keywords), text, None
    else:
        filename = ""
        url = url.strip()
        if url != "":
            if len(url) > 4 and url[-3:] == "pdf":
                if url.strip() in acl_dict.keys():
                    raw_text = acl_dict[url]["raw_text"]
                    l = str(length)
                    if length == 0:
                        l = ""
                    if l in acl_dict[url].keys():
                        if keywords.strip() in acl_dict[url][l].keys():
                            summary = acl_dict[url][l][keywords.strip()]
                            return summary, raw_text, None
                    if keywords.strip() == "":
                        keywords = None
                    if l == "":
                        l = None
                    return ctrlsum_for_str(raw_text, l, keywords), raw_text, None

                filename = download_pdf(url, './cache/')
            else:
                "Invalid url(Not PDF)!", None, None
        else:
            filename = input.name
        if keywords != "":
            keywords = keywords.strip().split(",")
            if keywords[0] == "":
                keywords = None
        if length == 0:
            length = None
        # Identify the format of input and parse reference strings
        if filename[-4:] == ".txt":
            results = ctrlsum_pipeline.predict(filename, type="txt",
                                               save_results=False,
                                               length=length, keywords=keywords, num_beams=1)
        elif filename[-4:] == ".pdf":
            results = ctrlsum_pipeline.predict(filename,
                                               save_results=False, length=length, keywords=keywords, num_beams=1)
        else:
            return "File Format Error !", None, filename

        output = []
        for res in results["summary"]:
            output.append(f"{res}\n\n")
        return "".join(output), results["raw_text"], filename


ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . 
( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . • BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "