wing-nus dyxohjl666 committed on
Commit
1a0701e
1 Parent(s): ee6c452

precomputation (#19)

Browse files

- Add precomputing acl data (44257a91d79656a35a3eac347f3d60c5c66a1ef9)
- Add precomputing acl data (02d7398f2f7c165b2d5110ff7580990b0a4f19f5)


Co-authored-by: Yixi Ding <[email protected]>

Files changed (3) hide show
  1. app.py +37 -12
  2. controlled_summarization.py +87 -16
  3. requirements.txt +2 -1
app.py CHANGED
@@ -5,12 +5,13 @@ from reference_string_parsing import *
5
  from controlled_summarization import *
6
  from dataset_extraction import *
7
 
 
8
  import requests
9
 
10
  # Example Usage
11
- #url = "https://arxiv.org/pdf/2305.14996.pdf"
12
- #dest_folder = "./examples/"
13
- #download_pdf(url, dest_folder)
14
 
15
 
16
  with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
@@ -31,17 +32,20 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
31
  gr.Markdown("* Set the length of text used for summarization. Length 0 will exert no control over length.")
32
  # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
33
  # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
34
- ctrlsum_file_length = gr.Slider(0,300,step=50, label="Length")
35
- ctrlsum_file_keywords = gr.Textbox(label="Keywords",max_lines=1)
 
36
  with gr.Row():
37
  ctrlsum_file_btn = gr.Button("Generate")
38
  ctrlsum_file_output = gr.Textbox(
39
  elem_id="htext",
40
  label="Summary",
41
  )
42
- ctrlsum_file_examples = gr.Examples(examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique", "",""],["examples/H01-1042.pdf", 0, "automatic evaluation technique","",""]],
43
- inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str, ctrlsum_url])
44
-
 
 
45
 
46
 
47
 
@@ -51,13 +55,34 @@ with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
51
  outputs=[ctrlsum_file_output, ctrlsum_str, ctrlsum_file]
52
  )
53
  def clear():
54
- return None,0,None, None
 
 
 
 
 
 
 
 
55
 
 
 
 
 
 
 
56
 
57
- ctrlsum_file.upload(clear, inputs=None,outputs=[ctrlsum_str,ctrlsum_file_length,ctrlsum_file_keywords, ctrlsum_url])
58
- ctrlsum_url.input(clear, inputs=None, outputs=[ctrlsum_str, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file])
59
  ctrlsum_str.input(clear, inputs=None,
60
- outputs=[ctrlsum_url, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file])
 
 
 
 
 
 
 
 
 
61
  # Reference String Parsing
62
  with gr.TabItem("Reference String Parsing"):
63
  gr.Markdown(rsp_title_md)
 
5
  from controlled_summarization import *
6
  from dataset_extraction import *
7
 
8
+ from controlled_summarization import recommended_kw
9
  import requests
10
 
11
  # Example Usage
12
+ # url = "https://arxiv.org/pdf/2305.14996.pdf"
13
+ # dest_folder = "./examples/"
14
+ # download_pdf(url, dest_folder)
15
 
16
 
17
  with gr.Blocks(css="#htext span {white-space: pre-line}") as demo:
 
32
  gr.Markdown("* Set the length of text used for summarization. Length 0 will exert no control over length.")
33
  # ctrlsum_file_beams = gr.Number(label="Number of beams for beam search", value=1, precision=0)
34
  # ctrlsum_file_sequences = gr.Number(label="Number of generated summaries", value=1, precision=0)
35
+ ctrlsum_file_length = gr.Radio(label="Length", value=0, choices=[0, 50, 100, 200])
36
+ kw = gr.Radio(visible=False)
37
+ ctrlsum_file_keywords = gr.Textbox(label="Keywords", max_lines=1)
38
  with gr.Row():
39
  ctrlsum_file_btn = gr.Button("Generate")
40
  ctrlsum_file_output = gr.Textbox(
41
  elem_id="htext",
42
  label="Summary",
43
  )
44
+ ctrlsum_file_examples = gr.Examples(
45
+ examples=[["examples/H01-1042_body.txt", 50, "automatic evaluation technique", "", ""],
46
+ ["examples/H01-1042.pdf", 0, "automatic evaluation technique", "", ""]],
47
+ inputs=[ctrlsum_file, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_str, ctrlsum_url
48
+ ])
49
 
50
 
51
 
 
55
  outputs=[ctrlsum_file_output, ctrlsum_str, ctrlsum_file]
56
  )
57
  def clear():
58
+ return None, 0, None, None, gr.Radio(visible=False)
59
+
60
+
61
+ def update_url(url):
62
+ if url in recommended_kw.keys():
63
+ keywords = recommended_kw[url]
64
+ if keywords != None:
65
+ return None, None, gr.Radio(choices=keywords[:3], label="Recommended Keywords", visible=True,
66
+ interactive=True)
67
 
68
+ return None, None, gr.Radio(visible=False)
69
+
70
+
71
+ ctrlsum_file.upload(clear, inputs=None,
72
+ outputs=[ctrlsum_str, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_url, kw])
73
+ ctrlsum_url.input(update_url, inputs=ctrlsum_url, outputs=[ctrlsum_str, ctrlsum_file, kw])
74
 
 
 
75
  ctrlsum_str.input(clear, inputs=None,
76
+ outputs=[ctrlsum_url, ctrlsum_file_length, ctrlsum_file_keywords, ctrlsum_file, kw])
77
+
78
+
79
+
80
+ def select_kw(env: gr.SelectData):
81
+ return env.value
82
+
83
+
84
+ kw.select(select_kw, None, ctrlsum_file_keywords)
85
+
86
  # Reference String Parsing
87
  with gr.TabItem("Reference String Parsing"):
88
  gr.Markdown(rsp_title_md)
controlled_summarization.py CHANGED
@@ -3,9 +3,64 @@ import torch
3
  from SciAssist import Summarization
4
  import os
5
  import requests
 
 
 
6
  device = "gpu" if torch.cuda.is_available() else "cpu"
7
 
8
- ctrlsum_pipeline = Summarization(os_name="nt",checkpoint="google/flan-t5-base",device=device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  def download_pdf(url, dest_folder):
@@ -30,16 +85,15 @@ def download_pdf(url, dest_folder):
30
  return filename
31
 
32
 
33
- def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
34
-
35
  if keywords is not None:
36
  keywords = keywords.strip().split(",")
37
  if keywords[0] == "":
38
  keywords = None
39
- if length==0 or length is None:
40
  length = None
41
  results = ctrlsum_pipeline.predict(input, type="str",
42
- length=length, keywords=keywords)
43
 
44
  output = []
45
  for res in results["summary"]:
@@ -49,31 +103,49 @@ def ctrlsum_for_str(input,length=None, keywords=None) -> List[Tuple[str, str]]:
49
 
50
  def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> List[Tuple[str, str, str]]:
51
  if input == None and url == "":
52
- if text=="":
53
- return None,"Input cannot be left blank.",None
54
  else:
55
- return ctrlsum_for_str(text,length,keywords),text, None
56
  else:
57
- filename=""
 
58
  if url != "":
59
- if len(url) > 4:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  filename = download_pdf(url, './cache/')
 
 
61
  else:
62
  filename = input.name
63
  if keywords != "":
64
  keywords = keywords.strip().split(",")
65
  if keywords[0] == "":
66
  keywords = None
67
- if length==0:
68
  length = None
69
  # Identify the format of input and parse reference strings
70
  if filename[-4:] == ".txt":
71
  results = ctrlsum_pipeline.predict(filename, type="txt",
72
- save_results=False,
73
- length=length, keywords=keywords)
74
  elif filename[-4:] == ".pdf":
75
  results = ctrlsum_pipeline.predict(filename,
76
- save_results=False, length=length, keywords=keywords)
77
  else:
78
  return "File Format Error !", None, filename
79
 
@@ -83,5 +155,4 @@ def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> L
83
  return "".join(output), results["raw_text"], filename
84
 
85
 
86
-
87
- ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . 
Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . 
• BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
 
3
  from SciAssist import Summarization
4
  import os
5
  import requests
6
+ from datasets import load_dataset
7
+
8
+ acl_data = load_dataset("dyxohjl666/CocoScisum_ACL", revision="refs/convert/parquet")
9
  device = "gpu" if torch.cuda.is_available() else "cpu"
10
 
11
+ ctrlsum_pipeline = Summarization(os_name="nt",device=device)
12
+
13
+ acl_dict = {}
14
+ recommended_kw = {}
15
+
16
+
17
def convert_to_dict(data):
    """Index one split of precomputed summaries into the module-level caches.

    Populates two module-level dicts in place:

    * ``acl_dict``::

          {url: {length: {keywords: summary}, "raw_text": str}}

      ``length`` and ``keywords`` are the stripped string forms of the
      dataset values; a missing (``None``) value is stored as ``""``.
    * ``recommended_kw``: ``{url: [keyword, ...]}`` holding the distinct
      non-empty keyword strings seen for that url, in first-seen order.

    Args:
        data: A mapping with the parallel columns ``"url"``, ``"text"``,
            ``"keywords"``, ``"length"`` and ``"summary"``.
            # NOTE(review): presumably one split of the dataset loaded at
            # module import time -- confirm against the caller.

    Returns:
        Always ``1`` (kept for backward compatibility with existing callers).
    """
    rows = zip(
        data["url"],
        data["text"],
        data["keywords"],
        data["length"],
        data["summary"],
    )
    for url, text, keywords, length, summary in rows:
        # Skip rows with a missing or too-short url.
        if not url or len(url) < 5:
            continue
        # Cache keys carry a ".pdf" suffix appended to the dataset url.
        url = url + ".pdf"
        keywords = "" if keywords is None else str(keywords).strip()
        length = "" if length is None else str(length).strip()

        entry = acl_dict.get(url)
        if entry is None:
            entry = {"": {}, "50": {}, "100": {}, "200": {}, "raw_text": text}
            acl_dict[url] = entry
        # setdefault tolerates length buckets outside the preset ones
        # instead of raising KeyError.
        bucket = entry.setdefault(length, {})
        # First summary wins for a given (url, length, keywords) triple.
        # This also stores the very first row of every url, which used to be
        # dropped when the entry was created.
        if keywords not in bucket:
            bucket[keywords] = summary

        # Recommended keywords: distinct, non-empty, including the first one.
        url_keywords = recommended_kw.setdefault(url, [])
        if keywords and keywords not in url_keywords:
            url_keywords.append(keywords)
    return 1
60
+
61
+
62
+ for i in acl_data.keys():
63
+ signal = convert_to_dict(acl_data[i])
64
 
65
 
66
  def download_pdf(url, dest_folder):
 
85
  return filename
86
 
87
 
88
+ def ctrlsum_for_str(input, length=None, keywords=None) -> List[Tuple[str, str]]:
 
89
  if keywords is not None:
90
  keywords = keywords.strip().split(",")
91
  if keywords[0] == "":
92
  keywords = None
93
+ if length == 0 or length is None:
94
  length = None
95
  results = ctrlsum_pipeline.predict(input, type="str",
96
+ length=length, keywords=keywords, num_beams=1)
97
 
98
  output = []
99
  for res in results["summary"]:
 
103
 
104
  def ctrlsum_for_file(input=None, length=None, keywords="", text="", url="") -> List[Tuple[str, str, str]]:
105
  if input == None and url == "":
106
+ if text == "":
107
+ return None, "Input cannot be left blank.", None
108
  else:
109
+ return ctrlsum_for_str(text, length, keywords), text, None
110
  else:
111
+ filename = ""
112
+ url = url.strip()
113
  if url != "":
114
+ if len(url) > 4 and url[-3:] == "pdf":
115
+ if url.strip() in acl_dict.keys():
116
+ raw_text = acl_dict[url]["raw_text"]
117
+ l = str(length)
118
+ if length == 0:
119
+ l = ""
120
+ if l in acl_dict[url].keys():
121
+ if keywords.strip() in acl_dict[url][l].keys():
122
+ summary = acl_dict[url][l][keywords]
123
+ return summary, raw_text, None
124
+ if keywords.strip() == "":
125
+ keywords = None
126
+ if l == "":
127
+ l = None
128
+ return ctrlsum_for_str(raw_text, l, keywords), raw_text, None
129
+
130
  filename = download_pdf(url, './cache/')
131
+ else:
132
+ "Invalid url(Not PDF)!", None, None
133
  else:
134
  filename = input.name
135
  if keywords != "":
136
  keywords = keywords.strip().split(",")
137
  if keywords[0] == "":
138
  keywords = None
139
+ if length == 0:
140
  length = None
141
  # Identify the format of input and parse reference strings
142
  if filename[-4:] == ".txt":
143
  results = ctrlsum_pipeline.predict(filename, type="txt",
144
+ save_results=False,
145
+ length=length, keywords=keywords, num_beams=1)
146
  elif filename[-4:] == ".pdf":
147
  results = ctrlsum_pipeline.predict(filename,
148
+ save_results=False, length=length, keywords=keywords, num_beams=1)
149
  else:
150
  return "File Format Error !", None, filename
151
 
 
155
  return "".join(output), results["raw_text"], filename
156
 
157
 
158
+ ctrlsum_str_example = "Language model pre-training has been shown to be effective for improving many natural language processing tasks ( Dai and Le , 2015 ; Peters et al. , 2018a ; Radford et al. , 2018 ; Howard and Ruder , 2018 ) . These include sentence-level tasks such as natural language inference ( Bowman et al. , 2015 ; Williams et al. , 2018 ) and paraphrasing ( Dolan and Brockett , 2005 ) , which aim to predict the relationships between sentences by analyzing them holistically , as well as token-level tasks such as named entity recognition and question answering , where models are required to produce fine-grained output at the token level ( Tjong Kim Sang and De Meulder , 2003 ; Rajpurkar et al. , 2016 ) . There are two existing strategies for applying pre-trained language representations to downstream tasks : feature-based and fine-tuning . The feature-based approach , such as ELMo ( Peters et al. , 2018a ) , uses task-specific architectures that include the pre-trained representations as additional features . The fine-tuning approach , such as the Generative Pre-trained Transformer ( OpenAI GPT ) ( Radford et al. , 2018 ) , introduces minimal task-specific parameters , and is trained on the downstream tasks by simply fine-tuning all pretrained parameters . The two approaches share the same objective function during pre-training , where they use unidirectional language models to learn general language representations . We argue that current techniques restrict the power of the pre-trained representations , especially for the fine-tuning approaches . The major limitation is that standard language models are unidirectional , and this limits the choice of architectures that can be used during pre-training . For example , in OpenAI GPT , the authors use a left-toright architecture , where every token can only attend to previous tokens in the self-attention layers of the Transformer ( Vaswani et al. , 2017 ) . 
Such restrictions are sub-optimal for sentence-level tasks , and could be very harmful when applying finetuning based approaches to token-level tasks such as question answering , where it is crucial to incorporate context from both directions . In this paper , we improve the fine-tuning based approaches by proposing BERT : Bidirectional Encoder Representations from Transformers . BERT alleviates the previously mentioned unidirectionality constraint by using a `` masked language model '' ( MLM ) pre-training objective , inspired by the Cloze task ( Taylor , 1953 ) . The masked language model randomly masks some of the tokens from the input , and the objective is to predict the original vocabulary id of the masked arXiv:1810.04805v2 [ cs.CL ] 24 May 2019 word based only on its context . Unlike left-toright language model pre-training , the MLM objective enables the representation to fuse the left and the right context , which allows us to pretrain a deep bidirectional Transformer . In addition to the masked language model , we also use a `` next sentence prediction '' task that jointly pretrains text-pair representations . The contributions of our paper are as follows : • We demonstrate the importance of bidirectional pre-training for language representations . Unlike Radford et al . ( 2018 ) , which uses unidirectional language models for pre-training , BERT uses masked language models to enable pretrained deep bidirectional representations . This is also in contrast to Peters et al . ( 2018a ) , which uses a shallow concatenation of independently trained left-to-right and right-to-left LMs . • We show that pre-trained representations reduce the need for many heavily-engineered taskspecific architectures . BERT is the first finetuning based representation model that achieves state-of-the-art performance on a large suite of sentence-level and token-level tasks , outperforming many task-specific architectures . 
• BERT advances the state of the art for eleven NLP tasks . The code and pre-trained models are available at https : //github.com/ google-research/bert . "
 
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  pip==23.2.1
2
  torch==1.12.0
3
- SciAssist==0.0.41
4
  nltk~=3.7
 
 
1
  pip==23.2.1
2
  torch==1.12.0
3
+ SciAssist==0.1.3
4
  nltk~=3.7
5
+ pytest