jskim committed
Commit 963bf46
1 Parent(s): b1499f3
Files changed (3)
  1. app.py +31 -14
  2. input_format.py +1 -16
  3. score.py +23 -12
app.py CHANGED
@@ -28,7 +28,7 @@ def get_similar_paper(
     author_id_input,
     num_papers_show=10
     ):
-    print('-- retrieving similar papers')
+    print('retrieving similar papers')
     input_sentences = sent_tokenize(abstract_text_input)
 
     # TODO handle pdf file input
@@ -41,8 +41,8 @@ def get_similar_paper(
     name, papers = get_text_from_author_id(author_id_input)
 
     # Compute Doc-level affinity scores for the Papers
-    print('---- computing scores')
-    titles, abstracts, doc_scores = compute_overall_score(
+    print('computing scores')
+    titles, abstracts, doc_scores = compute_document_score(
         doc_model,
         tokenizer,
         abstract_text_input,
@@ -63,9 +63,15 @@ def get_similar_paper(
     doc_scores = doc_scores[:num_papers_show]
 
     display_title = ['[ %0.3f ] %s'%(s, t) for t, s in zip(titles, doc_scores)]
-    print('----- done')
-
-    return gr.update(choices=display_title, interactive=True, visible=True), gr.update(choices=input_sentences, interactive=True), gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
+    print('retrieval done')
+
+    return (
+        gr.update(choices=display_title, interactive=True, visible=True), # set of papers
+        gr.update(choices=input_sentences, interactive=True), # submission sentences
+        gr.update(visible=True), # title row
+        gr.update(visible=True), # abstract row
+        gr.update(visible=True) # button
+    )
 
 def get_highlights(
     abstract_text_input,
@@ -73,7 +79,7 @@ def get_highlights(
     abstract,
     K=2
     ):
-    print('-- obtaining highlights')
+    print('obtaining highlights')
     # Compute sent-level and phrase-level affinity scores for each papers
     sent_ids, sent_scores, info = get_highlight_info(
         sent_model,
@@ -86,18 +92,20 @@ def get_highlights(
     num_sents = len(input_sentences)
 
     word_scores = dict()
-    # different highlights for each input sentences
+
+    # different highlights for each input sentence
     for i in range(num_sents):
         word_scores[str(i)] = {
             "original": abstract,
             "interpretation": list(zip(info['all_words'], info[i]['scores']))
-        }
+        } # format to feed to the Gradio Interpretation component
 
     tmp = {
         'source_sentences': input_sentences,
         'highlight': word_scores
     }
     pickle.dump(tmp, open('highlight_info.pkl', 'wb'))
+    print('done')
 
     # update the visibility of radio choices
     return gr.update(visible=True)
@@ -105,11 +113,12 @@ def get_highlights(
 def update_name(author_id_input):
     # update the name of the author based on the id input
     name, _ = get_text_from_author_id(author_id_input)
+
     return gr.update(value=name)
 
 def change_output_highlight(source_sent_choice):
-    fname = 'highlight_info.pkl'
     # change the output highlight based on the sentence selected from the submission
+    fname = 'highlight_info.pkl'
     if os.path.exists(fname):
         tmp = pickle.load(open(fname, 'rb'))
         source_sents = tmp['source_sentences']
@@ -122,7 +131,7 @@ def change_output_highlight(source_sent_choice):
     return
 
 def change_paper(selected_papers_radio):
-    # change the paper to show
+    # change the paper to show based on the paper selected
     fname = 'paper_info.pkl'
     if os.path.exists(fname):
         tmp = pickle.load(open(fname, 'rb'))
@@ -130,7 +139,7 @@ def change_paper(selected_papers_radio):
         display_title = '[ %0.3f ] %s'%(aff_score, title)
         if display_title == selected_papers_radio:
             #print('changing paper')
-            return title, abstract, aff_score
+            return title, abstract, aff_score # update title, abstract, and affinity score fields
         else:
             return
 
@@ -150,7 +159,9 @@ with gr.Blocks() as demo:
     author_id_input.change(fn=update_name, inputs=author_id_input, outputs=name)
     with gr.Row():
         compute_btn = gr.Button('Search Similar Papers from the Reviewer')
-
+
+    ### PAPER INFORMATION
+
     # show multiple papers in radio check box to select from
     with gr.Row():
         selected_papers_radio = gr.Radio(
@@ -159,7 +170,7 @@ with gr.Blocks() as demo:
             label='Selected Top Papers from the Reviewer'
         )
 
-    ### PAPER INFORMATION
+    # selected paper information
    with gr.Row(visible=False) as title_row:
         with gr.Column(scale=3):
             paper_title = gr.Textbox(label='Title', interactive=False)
@@ -183,6 +194,9 @@ with gr.Blocks() as demo:
         with gr.Column(scale=3): # highlighted text from paper
             highlight = gr.components.Interpretation(paper_abstract)
 
+    ### EVENT LISTENERS
+
+    # retrieve similar papers
     compute_btn.click(
         fn=get_similar_paper,
         inputs=[
@@ -199,6 +213,7 @@ with gr.Blocks() as demo:
         ]
     )
 
+    # get highlights
     explain_btn.click(
         fn=get_highlights,
         inputs=[
@@ -209,12 +224,14 @@ with gr.Blocks() as demo:
         outputs=source_sentences
     )
 
+    # change highlight based on selected sentences from submission
     source_sentences.change(
         fn=change_output_highlight,
         inputs=source_sentences,
         outputs=highlight
     )
 
+    # change paper to show based on selected papers
     selected_papers_radio.change(
         fn=change_paper,
         inputs=selected_papers_radio,
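
Note: the reworked get_similar_paper returns one gr.update per output component, and compute_btn.click matches them to outputs positionally. Below is a minimal, self-contained sketch of that Gradio pattern; the component names and stand-in values are illustrative, not this app's actual layout.

import gradio as gr

def search(abstract_text):
    # stand-in results; the real app computes these with compute_document_score
    titles = ['[ 0.912 ] Paper A', '[ 0.877 ] Paper B']
    sentences = ['First sentence.', 'Second sentence.']
    return (
        gr.update(choices=titles, interactive=True, visible=True),  # radio of papers
        gr.update(choices=sentences, interactive=True),             # submission sentences
        gr.update(visible=True),                                    # reveal a hidden row
    )

with gr.Blocks() as demo:
    abstract_in = gr.Textbox(label='Abstract')
    btn = gr.Button('Search')
    papers = gr.Radio(choices=[], label='Papers', visible=False)
    sents = gr.Radio(choices=[], label='Sentences')
    with gr.Row(visible=False) as hidden_row:
        gr.Textbox(label='Details')
    # the i-th returned update is applied to the i-th output component
    btn.click(fn=search, inputs=abstract_in, outputs=[papers, sents, hidden_row])

if __name__ == '__main__':
    demo.launch()

Returning gr.update objects rather than raw values lets one callback both repopulate choices and toggle the visibility of rows that start hidden, which is why the multi-line tuple return reads more clearly than the old one-liner.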
input_format.py CHANGED
@@ -94,19 +94,4 @@ def get_introduction(text):
     pass
 
 def get_conclusion(text):
-    pass
-
-
-if __name__ == '__main__':
-    def run_sample():
-        url = 'https://arxiv.org/abs/2105.06506'
-        text = get_text_from_url(url)
-        assert(text[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods')
-
-        text2 = get_text_from_url('https://arxiv.org/pdf/2105.06506.pdf')
-        assert(text2[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods')
-
-        # text = get_text_from_url('https://arxiv.org/paetseths.pdf')
-
-    # test the code
-    run_sample()
+    pass
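
Note: this commit drops the inline __main__ smoke test from input_format.py. If that check is still wanted, it can live in a standalone script instead; here is a sketch built from the removed lines, assuming get_text_from_url keeps its current signature and return format (the filename is hypothetical).

# save as e.g. test_input_format.py next to input_format.py (hypothetical filename)
from input_format import get_text_from_url

def run_sample():
    text = get_text_from_url('https://arxiv.org/abs/2105.06506')
    assert text[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods'

    text2 = get_text_from_url('https://arxiv.org/pdf/2105.06506.pdf')
    assert text2[0].split('\n')[0] == 'Sanity Simulations for Saliency Methods'

if __name__ == '__main__':
    run_sample()
    print('ok')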
score.py CHANGED
@@ -5,16 +5,16 @@ import torch
 import numpy as np
 
 def compute_sentencewise_scores(model, query_sents, candidate_sents):
+    # TODO make this more general for different types of models
     # list of sentences from query and candidate
-
     q_v, c_v = get_embedding(model, query_sents, candidate_sents)
+
     return util.cos_sim(q_v, c_v)
 
 def get_embedding(model, query_sents, candidate_sents):
-
     q_v = model.encode(query_sents)
     c_v = model.encode(candidate_sents)
-
+
     return q_v, c_v
 
 def get_top_k(score_mat, K=3):
@@ -30,6 +30,10 @@ def get_top_k(score_mat, K=3):
     return picked_sent, picked_scores
 
 def get_words(sent):
+    """
+    Input: list of sentences
+    Output: list of words per sentence, a flat list of all words, and the word index where each sentence starts
+    """
     words = []
     sent_start_id = [] # keep track of the word index where the new sentence starts
     counter = 0
@@ -48,8 +52,10 @@ def get_words(sent):
     return words, all_words, sent_start_id
 
 def get_match_phrase(w1, w2):
-    # list of words for query and candidate as input
-    # return the word list and binary mask of matching phrases
+    """
+    Input: list of words for query and candidate text
+    Output: word list and binary mask of matching phrases between the inputs
+    """
     # POS tags that should be considered for matching phrase
     include = [
         'JJ',
@@ -80,6 +86,9 @@ def get_match_phrase(w1, w2):
     return mask2
 
 def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
+    """
+    Mark the words that are highlighted, both in terms of sentences and phrases
+    """
     num_query_sent = sent_ids.shape[0]
     num_words = len(all_words)
 
@@ -121,6 +130,9 @@ def mark_words(query_sents, words, all_words, sent_start_id, sent_ids, sent_scores):
     return output
 
 def get_highlight_info(model, text1, text2, K=None):
+    """
+    Get highlight information from two texts
+    """
     sent1 = sent_tokenize(text1) # query
     sent2 = sent_tokenize(text2) # candidate
     if K is None: # if K is not set, select based on the length of the candidate
@@ -128,15 +140,15 @@ def get_highlight_info(model, text1, text2, K=None):
     score_mat = compute_sentencewise_scores(model, sent1, sent2)
 
     sent_ids, sent_scores = get_top_k(score_mat, K=K)
-    #print(sent_ids, sent_scores)
     words2, all_words2, sent_start_id2 = get_words(sent2)
-    #print(all_words1, sent_start_id1)
     info = mark_words(sent1, words2, all_words2, sent_start_id2, sent_ids, sent_scores)
 
     return sent_ids, sent_scores, info
 
-## Document-level operations
+### Document-level operations
+
 def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
+    # compute document scores for each paper
 
     # concatenate title and abstract
     title_abs = []
@@ -146,12 +158,11 @@ def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
 
     num_docs = len(title_abs)
     no_iter = int(np.ceil(num_docs / batch))
-
-    # preprocess the input
     scores = []
     with torch.no_grad():
-        # batch
+        # batch
         for i in range(no_iter):
+            # preprocess the input
             inputs = tokenizer(
                 [query] + title_abs[i*batch:(i+1)*batch],
                 padding=True,
@@ -175,7 +186,7 @@ def predict_docscore(doc_model, tokenizer, query, titles, abstracts, batch=20):
 
     return scores
 
-def compute_overall_score(doc_model, tokenizer, query, papers, batch=5):
+def compute_document_score(doc_model, tokenizer, query, papers, batch=5):
     scores = []
     titles = []
     abstracts = []
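
Note: for orientation, a rough sketch of the sentence-level scoring path that compute_sentencewise_scores and get_top_k implement, written against sentence-transformers directly. The encoder name and the texts are placeholders, and get_top_k's exact selection logic in score.py may differ from the plain top-k shown here.

import torch
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('all-MiniLM-L6-v2')  # placeholder encoder

query_sents = ['We propose a saliency evaluation benchmark.']
candidate_sents = [
    'This paper studies saliency methods.',
    'We introduce a new dataset.',
    'Results show strong correlations.',
]

q_v = model.encode(query_sents)      # (num_query, dim)
c_v = model.encode(candidate_sents)  # (num_cand, dim)
score_mat = util.cos_sim(q_v, c_v)   # (num_query, num_cand) cosine similarities

# pick the K best candidate sentences for each query sentence
K = 2
picked_scores, picked_sent = torch.topk(score_mat, K, dim=1)
print(picked_sent, picked_scores)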