Spaces:
Runtime error
Runtime error
update
Browse files- app.py +31 -14
- input_format.py +1 -16
- score.py +23 -12
app.py
CHANGED
@@ -28,7 +28,7 @@ def get_similar_paper(
|
|
28 |
author_id_input,
|
29 |
num_papers_show=10
|
30 |
):
|
31 |
-
print('
|
32 |
input_sentences = sent_tokenize(abstract_text_input)
|
33 |
|
34 |
# TODO handle pdf file input
|
@@ -41,8 +41,8 @@ def get_similar_paper(
|
|
41 |
name, papers = get_text_from_author_id(author_id_input)
|
42 |
|
43 |
# Compute Doc-level affinity scores for the Papers
|
44 |
-
print('
|
45 |
-
titles, abstracts, doc_scores =
|
46 |
doc_model,
|
47 |
tokenizer,
|
48 |
abstract_text_input,
|
@@ -63,9 +63,15 @@ def get_similar_paper(
|
|
63 |
doc_scores = doc_scores[:num_papers_show]
|
64 |
|
65 |
display_title = ['[ %0.3f ] %s'%(s, t) for t, s in zip(titles, doc_scores)]
|
66 |
-
print('
|
67 |
-
|
68 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
def get_highlights(
|
71 |
abstract_text_input,
|
@@ -73,7 +79,7 @@ def get_highlights(
|
|
73 |
abstract,
|
74 |
K=2
|
75 |
):
|
76 |
-
print('
|
77 |
# Compute sent-level and phrase-level affinity scores for each papers
|
78 |
sent_ids, sent_scores, info = get_highlight_info(
|
79 |
sent_model,
|
@@ -86,18 +92,20 @@ def get_highlights(
|
|
86 |
num_sents = len(input_sentences)
|
87 |
|
88 |
word_scores = dict()
|
89 |
-
|
|
|
90 |
for i in range(num_sents):
|
91 |
word_scores[str(i)] = {
|
92 |
"original": abstract,
|
93 |
"interpretation": list(zip(info['all_words'], info[i]['scores']))
|
94 |
-
}
|
95 |
|
96 |
tmp = {
|
97 |
'source_sentences': input_sentences,
|
98 |
'highlight': word_scores
|
99 |
}
|
100 |
pickle.dump(tmp, open('highlight_info.pkl', 'wb'))
|
|
|
101 |
|
102 |
# update the visibility of radio choices
|
103 |
return gr.update(visible=True)
|
@@ -105,11 +113,12 @@ def get_highlights(
|
|
105 |
def update_name(author_id_input):
|
106 |
# update the name of the author based on the id input
|
107 |
name, _ = get_text_from_author_id(author_id_input)
|
|
|
108 |
return gr.update(value=name)
|
109 |
|
110 |
def change_output_highlight(source_sent_choice):
|
111 |
-
fname = 'highlight_info.pkl'
|
112 |
# change the output highlight based on the sentence selected from the submission
|
|
|
113 |
if os.path.exists(fname):
|
114 |
tmp = pickle.load(open(fname, 'rb'))
|
115 |
source_sents = tmp['source_sentences']
|
@@ -122,7 +131,7 @@ def change_output_highlight(source_sent_choice):
|
|
122 |
return
|
123 |
|
124 |
def change_paper(selected_papers_radio):
|
125 |
-
# change the paper to show
|
126 |
fname = 'paper_info.pkl'
|
127 |
if os.path.exists(fname):
|
128 |
tmp = pickle.load(open(fname, 'rb'))
|
@@ -130,7 +139,7 @@ def change_paper(selected_papers_radio):
|
|
130 |
display_title = '[ %0.3f ] %s'%(aff_score, title)
|
131 |
if display_title == selected_papers_radio:
|
132 |
#print('changing paper')
|
133 |
-
return title, abstract, aff_score
|
134 |
else:
|
135 |
return
|
136 |
|
@@ -150,7 +159,9 @@ with gr.Blocks() as demo:
|
|
150 |
author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
|
151 |
with gr.Row():
|
152 |
compute_btn = gr.Button('Search Similar Papers from the Reviewer')
|
153 |
-
|
|
|
|
|
154 |
# show multiple papers in radio check box to select from
|
155 |
with gr.Row():
|
156 |
selected_papers_radio = gr.Radio(
|
@@ -159,7 +170,7 @@ with gr.Blocks() as demo:
|
|
159 |
label='Selected Top Papers from the Reviewer'
|
160 |
)
|
161 |
|
162 |
-
|
163 |
with gr.Row(visible=False) as title_row:
|
164 |
with gr.Column(scale=3):
|
165 |
paper_title = gr.Textbox(label='Title', interactive=False)
|
@@ -183,6 +194,9 @@ with gr.Blocks() as demo:
|
|
183 |
with gr.Column(scale=3): # highlighted text from paper
|
184 |
highlight = gr.components.Interpretation(paper_abstract)
|
185 |
|
|
|
|
|
|
|
186 |
compute_btn.click(
|
187 |
fn=get_similar_paper,
|
188 |
inputs=[
|
@@ -199,6 +213,7 @@ with gr.Blocks() as demo:
|
|
199 |
]
|
200 |
)
|
201 |
|
|
|
202 |
explain_btn.click(
|
203 |
fn=get_highlights,
|
204 |
inputs=[
|
@@ -209,12 +224,14 @@ with gr.Blocks() as demo:
|
|
209 |
outputs=source_sentences
|
210 |
)
|
211 |
|
|
|
212 |
source_sentences.change(
|
213 |
fn=change_output_highlight,
|
214 |
inputs=source_sentences,
|
215 |
outputs=highlight
|
216 |
)
|
217 |
|
|
|
218 |
selected_papers_radio.change(
|
219 |
fn=change_paper,
|
220 |
inputs=selected_papers_radio,
|
|
|
28 |
author_id_input,
|
29 |
num_papers_show=10
|
30 |
):
|
31 |
+
print('retrieving similar papers')
|
32 |
input_sentences = sent_tokenize(abstract_text_input)
|
33 |
|
34 |
# TODO handle pdf file input
|
|
|
41 |
name, papers = get_text_from_author_id(author_id_input)
|
42 |
|
43 |
# Compute Doc-level affinity scores for the Papers
|
44 |
+
print('computing scores')
|
45 |
+
titles, abstracts, doc_scores = compute_document_score(
|
46 |
doc_model,
|
47 |
tokenizer,
|
48 |
abstract_text_input,
|
|
|
63 |
doc_scores = doc_scores[:num_papers_show]
|
64 |
|
65 |
display_title = ['[ %0.3f ] %s'%(s, t) for t, s in zip(titles, doc_scores)]
|
66 |
+
print('retrieval done')
|
67 |
+
|
68 |
+
return (
|
69 |
+
gr.update(choices=display_title, interactive=True, visible=True), # set of papers
|
70 |
+
gr.update(choices=input_sentences, interactive=True), # submission sentences
|
71 |
+
gr.update(visible=True), # title row
|
72 |
+
gr.update(visible=True), # abstract row
|
73 |
+
gr.update(visible=True) # button
|
74 |
+
)
|
75 |
|
76 |
def get_highlights(
|
77 |
abstract_text_input,
|
|
|
79 |
abstract,
|
80 |
K=2
|
81 |
):
|
82 |
+
print('obtaining highlights')
|
83 |
# Compute sent-level and phrase-level affinity scores for each papers
|
84 |
sent_ids, sent_scores, info = get_highlight_info(
|
85 |
sent_model,
|
|
|
92 |
num_sents = len(input_sentences)
|
93 |
|
94 |
word_scores = dict()
|
95 |
+
|
96 |
+
# different highlights for each input sentence
|
97 |
for i in range(num_sents):
|
98 |
word_scores[str(i)] = {
|
99 |
"original": abstract,
|
100 |
"interpretation": list(zip(info['all_words'], info[i]['scores']))
|
101 |
+
} # format to feed to for Gradio Interpretation component
|
102 |
|
103 |
tmp = {
|
104 |
'source_sentences': input_sentences,
|
105 |
'highlight': word_scores
|
106 |
}
|
107 |
pickle.dump(tmp, open('highlight_info.pkl', 'wb'))
|
108 |
+
print('done')
|
109 |
|
110 |
# update the visibility of radio choices
|
111 |
return gr.update(visible=True)
|
|
|
113 |
def update_name(author_id_input):
|
114 |
# update the name of the author based on the id input
|
115 |
name, _ = get_text_from_author_id(author_id_input)
|
116 |
+
|
117 |
return gr.update(value=name)
|
118 |
|
119 |
def change_output_highlight(source_sent_choice):
|
|
|
120 |
# change the output highlight based on the sentence selected from the submission
|
121 |
+
fname = 'highlight_info.pkl'
|
122 |
if os.path.exists(fname):
|
123 |
tmp = pickle.load(open(fname, 'rb'))
|
124 |
source_sents = tmp['source_sentences']
|
|
|
131 |
return
|
132 |
|
133 |
def change_paper(selected_papers_radio):
|
134 |
+
# change the paper to show based on the paper selected
|
135 |
fname = 'paper_info.pkl'
|
136 |
if os.path.exists(fname):
|
137 |
tmp = pickle.load(open(fname, 'rb'))
|
|
|
139 |
display_title = '[ %0.3f ] %s'%(aff_score, title)
|
140 |
if display_title == selected_papers_radio:
|
141 |
#print('changing paper')
|
142 |
+
return title, abstract, aff_score # update title, abstract, and affinity score fields
|
143 |
else:
|
144 |
return
|
145 |
|
|
|
159 |
author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
|
160 |
with gr.Row():
|
161 |
compute_btn = gr.Button('Search Similar Papers from the Reviewer')
|
162 |
+
|
163 |
+
### PAPER INFORMATION
|
164 |
+
|
165 |
# show multiple papers in radio check box to select from
|
166 |
with gr.Row():
|
167 |
selected_papers_radio = gr.Radio(
|
|
|
170 |
label='Selected Top Papers from the Reviewer'
|
171 |
)
|
172 |
|
173 |
+
# selected paper information
|
174 |
with gr.Row(visible=False) as title_row:
|
175 |
with gr.Column(scale=3):
|
176 |
paper_title = gr.Textbox(label='Title', interactive=False)
|
|
|
194 |
with gr.Column(scale=3): # highlighted text from paper
|
195 |
highlight = gr.components.Interpretation(paper_abstract)
|
196 |
|
197 |
+
### EVENT LISTENERS
|
198 |
+
|
199 |
+
# retrieve similar papers
|
200 |
compute_btn.click(
|
201 |
fn=get_similar_paper,
|
202 |
inputs=[
|
|
|
213 |
]
|
214 |
)
|
215 |
|
216 |
+
# get highlights
|
217 |
explain_btn.click(
|
218 |
fn=get_highlights,
|
219 |
inputs=[
|
|
|
224 |
outputs=source_sentences
|
225 |
)
|
226 |
|
227 |
+
# change highlight based on selected sentences from submission
|
228 |
source_sentences.change(
|
229 |
fn=change_output_highlight,
|
230 |
inputs=source_sentences,
|
231 |
outputs=highlight
|
232 |
)
|
233 |
|
234 |
+
# change paper to show based on selected papers
|
235 |
selected_papers_radio.change(
|
236 |
fn=change_paper,
|
237 |
inputs=selected_papers_radio,
|
input_format.py
CHANGED
@@ -94,19 +94,4 @@ def get_introduction(text):
|
|
94 |
pass
|
95 |
|
96 |
def get_conclusion(text):
|
97 |
-
pass
|
98 |
-
|
99 |
-
|
100 |
-
if __name__ == '__main__':
|
101 |
-
def run_sample():
|
102 |
-
url = 'https://arxiv.org/abs/2105.06506'
|
103 |
-
text = get_text_from_url(url)
|
104 |
-
assert(text[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods')
|
105 |
-
|
106 |
-
text2 = get_text_from_url('https://arxiv.org/pdf/2105.06506.pdf')
|
107 |
-
assert(text2[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods')
|
108 |
-
|
109 |
-
# text = get_text_from_url('https://arxiv.org/paetseths.pdf')
|
110 |
-
|
111 |
-
# test the code
|
112 |
-
run_sample()
|
|
|
94 |
pass
|
95 |
|
96 |
def get_conclusion(text):
|
97 |
+
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
score.py
CHANGED
@@ -5,16 +5,16 @@ import torch
|
|
5 |
import numpy as np
|
6 |
|
7 |
def compute_sentencewise_scores(model, query_sents, candidate_sents):
|
|
|
8 |
# list of sentences from query and candidate
|
9 |
-
|
10 |
q_v, c_v = get_embedding(model, query_sents, candidate_sents)
|
|
|
11 |
return util.cos_sim(q_v, c_v)
|
12 |
|
13 |
def get_embedding(model, query_sents, candidate_sents):
|
14 |
-
|
15 |
q_v = model.encode(query_sents)
|
16 |
c_v = model.encode(candidate_sents)
|
17 |
-
|
18 |
return q_v, c_v
|
19 |
|
20 |
def get_top_k(score_mat, K=3):
|
@@ -30,6 +30,10 @@ def get_top_k(score_mat, K=3):
|
|
30 |
return picked_sent, picked_scores
|
31 |
|
32 |
def get_words(sent):
|
|
|
|
|
|
|
|
|
33 |
words = []
|
34 |
sent_start_id = [] # keep track of the word index where the new sentence starts
|
35 |
counter = 0
|
@@ -48,8 +52,10 @@ def get_words(sent):
|
|
48 |
return words, all_words, sent_start_id
|
49 |
|
50 |
def get_match_phrase(w1, w2):
|
51 |
-
|
52 |
-
|
|
|
|
|
53 |
# POS tags that should be considered for matching phrase
|
54 |
include = [
|
55 |
'JJ',
|
@@ -80,6 +86,9 @@ def get_match_phrase(w1, w2):
|
|
80 |
return mask2
|
81 |
|
82 |
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
|
|
|
|
|
|
|
83 |
num_query_sent = sent_ids.shape[0]
|
84 |
num_words = len(all_words)
|
85 |
|
@@ -121,6 +130,9 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scor
|
|
121 |
return output
|
122 |
|
123 |
def get_highlight_info(model, text1, text2, K=None):
|
|
|
|
|
|
|
124 |
sent1 = sent_tokenize(text1) # query
|
125 |
sent2 = sent_tokenize(text2) # candidate
|
126 |
if K is None: # if K is not set, select based on the length of the candidate
|
@@ -128,15 +140,15 @@ def get_highlight_info(model, text1, text2, K=None):
|
|
128 |
score_mat = compute_sentencewise_scores(model, sent1, sent2)
|
129 |
|
130 |
sent_ids, sent_scores = get_top_k(score_mat, K=K)
|
131 |
-
#print(sent_ids, sent_scores)
|
132 |
words2, all_words2, sent_start_id2 = get_words(sent2)
|
133 |
-
#print(all_words1, sent_start_id1)
|
134 |
info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
|
135 |
|
136 |
return sent_ids, sent_scores, info
|
137 |
|
138 |
-
|
|
|
139 |
def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
|
|
|
140 |
|
141 |
# concatenate title and abstract
|
142 |
title_abs = []
|
@@ -146,12 +158,11 @@ def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
|
|
146 |
|
147 |
num_docs = len(title_abs)
|
148 |
no_iter = int(np.ceil(num_docs / batch))
|
149 |
-
|
150 |
-
# preprocess the input
|
151 |
scores = []
|
152 |
with torch.no_grad():
|
153 |
-
# batch
|
154 |
for i in range(no_iter):
|
|
|
155 |
inputs = tokenizer(
|
156 |
[query] + title_abs[i*batch:(i+1)*batch],
|
157 |
padding=True,
|
@@ -175,7 +186,7 @@ def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
|
|
175 |
|
176 |
return scores
|
177 |
|
178 |
-
def
|
179 |
scores = []
|
180 |
titles = []
|
181 |
abstracts = []
|
|
|
5 |
import numpy as np
|
6 |
|
7 |
def compute_sentencewise_scores(model, query_sents, candidate_sents):
|
8 |
+
# TODO make this more general for different types of models
|
9 |
# list of sentences from query and candidate
|
|
|
10 |
q_v, c_v = get_embedding(model, query_sents, candidate_sents)
|
11 |
+
|
12 |
return util.cos_sim(q_v, c_v)
|
13 |
|
14 |
def get_embedding(model, query_sents, candidate_sents):
|
|
|
15 |
q_v = model.encode(query_sents)
|
16 |
c_v = model.encode(candidate_sents)
|
17 |
+
|
18 |
return q_v, c_v
|
19 |
|
20 |
def get_top_k(score_mat, K=3):
|
|
|
30 |
return picked_sent, picked_scores
|
31 |
|
32 |
def get_words(sent):
|
33 |
+
"""
|
34 |
+
Input: list of sentences
|
35 |
+
Output: list of list of words per sentence, all words in, index of starting words for each sentence
|
36 |
+
"""
|
37 |
words = []
|
38 |
sent_start_id = [] # keep track of the word index where the new sentence starts
|
39 |
counter = 0
|
|
|
52 |
return words, all_words, sent_start_id
|
53 |
|
54 |
def get_match_phrase(w1, w2):
|
55 |
+
"""
|
56 |
+
Input: list of words for query and candidate text
|
57 |
+
Output: word list and binary mask of matching phrases between the inputs
|
58 |
+
"""
|
59 |
# POS tags that should be considered for matching phrase
|
60 |
include = [
|
61 |
'JJ',
|
|
|
86 |
return mask2
|
87 |
|
88 |
def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
|
89 |
+
"""
|
90 |
+
Mark the words that are highlighted, both by in terms of sentence and phrase
|
91 |
+
"""
|
92 |
num_query_sent = sent_ids.shape[0]
|
93 |
num_words = len(all_words)
|
94 |
|
|
|
130 |
return output
|
131 |
|
132 |
def get_highlight_info(model, text1, text2, K=None):
|
133 |
+
"""
|
134 |
+
Get highlight information from two texts
|
135 |
+
"""
|
136 |
sent1 = sent_tokenize(text1) # query
|
137 |
sent2 = sent_tokenize(text2) # candidate
|
138 |
if K is None: # if K is not set, select based on the length of the candidate
|
|
|
140 |
score_mat = compute_sentencewise_scores(model, sent1, sent2)
|
141 |
|
142 |
sent_ids, sent_scores = get_top_k(score_mat, K=K)
|
|
|
143 |
words2, all_words2, sent_start_id2 = get_words(sent2)
|
|
|
144 |
info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
|
145 |
|
146 |
return sent_ids, sent_scores, info
|
147 |
|
148 |
+
### Document-level operations
|
149 |
+
|
150 |
def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
|
151 |
+
# compute document scores for each papers
|
152 |
|
153 |
# concatenate title and abstract
|
154 |
title_abs = []
|
|
|
158 |
|
159 |
num_docs = len(title_abs)
|
160 |
no_iter = int(np.ceil(num_docs / batch))
|
|
|
|
|
161 |
scores = []
|
162 |
with torch.no_grad():
|
163 |
+
# batch
|
164 |
for i in range(no_iter):
|
165 |
+
# preprocess the input
|
166 |
inputs = tokenizer(
|
167 |
[query] + title_abs[i*batch:(i+1)*batch],
|
168 |
padding=True,
|
|
|
186 |
|
187 |
return scores
|
188 |
|
189 |
+
def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
|
190 |
scores = []
|
191 |
titles = []
|
192 |
abstracts = []
|