# -*- coding: utf-8 -*-
"""preprocess.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1cs4nwWCMLuQOS1f9V6xbAG1kKtasrPs2

# Preprocessing
"""

# from google.colab import drive
# drive.mount('/content/drive')

import numpy as np
import pandas as pd
import json
from itertools import islice  # for slicing and dicing JSON records
import os

# def get_data(json_filename = 'arxiv-metadata-oai-snapshot.json', data_root = '/content/drive/MyDrive/카이스트/23봄/CS372/project'):
#     with open(data_root + '/' + 'dataset' + '/' + json_filename, "rb") as f:
#         for line in f:
#             yield line

def get_data(json_filename='arxiv-metadata-oai-snapshot.json'):
    # Stream the arXiv metadata file line by line from the directory of this script
    script_directory = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(script_directory, json_filename)
    with open(json_path, "rb") as f:
        for line in f:
            yield line

data_gen = get_data()

def get_records(data_gen, chunksize=500):
    # Parse the next `chunksize` JSON lines into dictionaries
    return [json.loads(record) for record in islice(data_gen, chunksize)]

records_per_chunk = 250000
data_records = get_records(data_gen, records_per_chunk)

def split_records(data_records, num_profiles=100, random_state=42):
    # Shuffle the records and split off the first `num_profiles` as the train set
    np.random.seed(random_state)
    np.random.shuffle(data_records)
    train_records, test_records = data_records[:num_profiles], data_records[num_profiles:]
    return train_records, test_records

# Splitting the fetched records into train and test records
train_records, test_records = split_records(data_records, num_profiles=500)

# Utility method to generate a dataframe from a list of dictionaries
def get_dataframe(list_of_dicts, columns=None):
    data = pd.DataFrame(list_of_dicts)
    if columns:
        data.columns = columns
    return data

# Generating dataframes for train and test records
train_df = get_dataframe(train_records)
test_df = get_dataframe(test_records)

# Utility method to keep only the features we need
def filter_features(data, features):
    return data[features]

# Filtering the dataframes for the features we selected
features = ['title', 'categories', 'abstract', 'update_date']
train_df = filter_features(train_df, features)
test_df = filter_features(test_df, features)

# define the corpus to pull from
train_corpus = train_df['abstract'].head(10000)
test_corpus = test_df['abstract'].head(10000)

train_df.head()

test_df.head()

# train_df.to_csv('/content/drive/MyDrive/카이스트/23봄/CS372/project/dataset/train_df.csv', index = False)
# test_df.to_csv('/content/drive/MyDrive/카이스트/23봄/CS372/project/dataset/test_df.csv', index = False)

"""## Removing unnecessary words"""

import nltk
nltk.download('book')  # includes the 'punkt' tokenizer and the POS tagger used below
from nltk.book import *
from nltk.tokenize import sent_tokenize, word_tokenize

def tokenize_POS(paragraph):
    words = word_tokenize(paragraph)
    tagged_words = nltk.pos_tag(words)

    # Remove function-word POS tags (conjunctions, determiners, prepositions,
    # pronouns, modals, wh-words) and keep the content words
    excluded_tags = ['CC', 'DT', 'IN', 'TO', 'PRP', 'PRP$', 'MD', 'WP', 'WP$', 'WRB']
    filtered_words = [word for word, pos in tagged_words if pos not in excluded_tags]

    return ' '.join(filtered_words)

# define the corpus to pull from
train_corpus = train_df['abstract'].head(10000)
test_corpus = test_df['abstract'].head(10000)

train_corpus[0]

train_corpus = pd.Series([tokenize_POS(abstract) for abstract in train_corpus])

train_corpus[0]

# Keep a tokenized copy of the train dataframe; .copy() prevents the original
# train_df (whose raw abstracts are shown to the user) from being overwritten
train_df_doc = train_df.copy()
train_df_doc['abstract'] = train_df_doc['abstract'].apply(tokenize_POS)
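# Quick illustrative check of the POS filter; the sentence below is made up for
# demonstration only and is not part of the dataset. The tagger should drop the
# pronoun, determiners, and prepositions, leaving roughly
# "propose new method classification research papers ."
print(tokenize_POS("We propose a new method for the classification of research papers."))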
"""# TF-IDF"""

import time  # for getting the runtime of cells
from sklearn.feature_extraction.text import TfidfVectorizer  # for building word representations
from sklearn.metrics.pairwise import cosine_similarity  # for getting similarity metrics

def get_recommendations_TFIDF(abstract):
    # Apply the same POS filtering to the query abstract as to the corpus
    abstract = tokenize_POS(abstract)
    corpus = pd.concat([train_corpus, pd.Series([abstract])], ignore_index=True)

    # Initialize an instance of tf-idf Vectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Generate the tf-idf vectors for the corpus
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

    # compute the cosine similarity matrix
    cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

    # Get the pairwise similarity scores of the query (the last row) against the
    # training abstracts, excluding the query itself
    query_index = len(corpus) - 1
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[-1]) if i != query_index]
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # If the best match is a verbatim copy of the query abstract, recommend the
    # next one instead (mirroring the duplicate check in the Doc2Vec recommender)
    best = sim_scores[0]
    if train_corpus.iloc[best[0]].split() == abstract.split():
        best = sim_scores[1]

    paper_indices = best[0]
    title = train_df['title'].iloc[paper_indices]
    categories = train_df['categories'].iloc[paper_indices]
    abstract = train_df['abstract'].iloc[paper_indices]
    # Format similarity as a string with two decimal places and a percentage sign
    similarity = "{:.2f}%".format(best[1] * 100)

    return title, categories, abstract, similarity

get_recommendations_TFIDF('''
In this paper we consider permutations of sequences of partitions, obtaining\na result which parallels von Neumann's theorem on permutations of dense\nsequences and uniformly distributed sequences of points.\n
''')

"""# Doc2Vec"""

import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity

def train_doc2vec_model(corpus, vector_size=100, window=5, min_count=1, epochs=100):
    # create tagged document objects, one tag per abstract
    tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(corpus)]
    # words=word_tokenize(doc.lower())

    # initialize doc2vec model
    model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)

    # build vocabulary
    model.build_vocab(tagged_data)

    # train doc2vec model
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)

    return model

doc2vec_model = train_doc2vec_model(train_corpus)

def get_recommendations_Doc2Vec(abstract):
    # remove unnecessary words
    abstract = tokenize_POS(abstract)

    # use the Doc2Vec model trained above
    model = doc2vec_model

    # infer the vector for the given abstract
    abstract_vector = model.infer_vector(abstract.split())

    # get the two most similar abstracts
    most_similar = model.dv.most_similar([abstract_vector], topn=2)

    # Check if the first result is the input abstract itself; if so, use the second
    if train_df_doc['abstract'].iloc[int(most_similar[0][0])].split() == abstract.split():
        # print('True')
        paper_indices = int(most_similar[1][0])
        # Format similarity as a string with two decimal places and a percentage sign
        similarity = "{:.2f}%".format(most_similar[1][1] * 100)
    else:
        # print('False')
        paper_indices = int(most_similar[0][0])
        # Format similarity as a string with two decimal places and a percentage sign
        similarity = "{:.2f}%".format(most_similar[0][1] * 100)

    # Retrieve the details of the most similar abstract
    title = train_df['title'].iloc[paper_indices]
    categories = train_df['categories'].iloc[paper_indices]
    abstract = train_df['abstract'].iloc[paper_indices]

    return title, categories, abstract, similarity
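# Optional sanity check mirroring the TF-IDF call above: query the Doc2Vec
# recommender with the same sample abstract (illustrative only).
get_recommendations_Doc2Vec(
    "In this paper we consider permutations of sequences of partitions, "
    "obtaining a result which parallels von Neumann's theorem on permutations "
    "of dense sequences and uniformly distributed sequences of points."
)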
"""# Deploy"""

# !pip install gradio

import gradio as gr

def greet(paper):
    # Run both recommenders on the pasted abstract
    title_tfidf, categories_tfidf, abstract_tfidf, similarity_tfidf = get_recommendations_TFIDF(paper)
    title_doc2vec, categories_doc2vec, abstract_doc2vec, similarity_doc2vec = get_recommendations_Doc2Vec(paper)
    return title_tfidf, categories_tfidf, abstract_tfidf, similarity_tfidf, title_doc2vec, categories_doc2vec, abstract_doc2vec, similarity_doc2vec

title = '''SimSearch:\n A Similarity Search Tool for Research Paper Abstracts
using NLP'''

demo = gr.Interface(
    title=title,
    fn=greet,
    inputs=gr.Textbox(placeholder='Abstract Here'),
    outputs=[gr.Textbox(label='TFIDF Title'),
             gr.Textbox(label='TFIDF Categories'),
             gr.Textbox(label='TFIDF Abstract'),
             gr.Textbox(label='TFIDF Similarity'),
             gr.Textbox(label='Doc2Vec Title'),
             gr.Textbox(label='Doc2Vec Categories'),
             gr.Textbox(label='Doc2Vec Abstract'),
             gr.Textbox(label='Doc2Vec Similarity'),
             ],
    examples=[
        # ['''In this paper we consider permutations of sequences of partitions, obtaining\na result which parallels von Neumann's theorem on permutations of dense\nsequences and uniformly distributed sequences of points.\n'''],
        ['''A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of events.'''],
        ['''Most physical experiments are usually described repeated measurements random variables . experimental data registered on-line computers form time series outcomes . frequencies different outcomes are compared probabilities provided algorithms quantum theory ( QT ) . spite statistical predictions QT claim was made theory provided most complete description data underlying physical phenomena . claim be easily rejected fine structures , averaged out standard statistical descriptive analysis , were found time series experimental data . search structures one has use more subtle statistical tools which were developed study time series produced various stochastic processes . talk review tools . example show standard descriptive statistical analysis data is unable reveal fine structure simulated sample AR ( 2 ) stochastic process . emphasize once again violation Bell inequalities gives information completeness non locality QT . appropriate way test completeness quantum theory is search fine structures time series experimental data means purity tests studying autocorrelation partial autocorrelation functions .''']
    ]
)

# demo.launch(share=True, debug=True)
demo.launch(debug=True)
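# Optional sketch: compare the two recommenders on a few unseen test abstracts.
# Note that launch(debug=True) above blocks, so this loop only runs after the
# Gradio app is closed; the choice of three abstracts is arbitrary.
for sample_abstract in test_corpus.iloc[:3]:
    tfidf_title, _, _, tfidf_sim = get_recommendations_TFIDF(sample_abstract)
    d2v_title, _, _, d2v_sim = get_recommendations_Doc2Vec(sample_abstract)
    print('TF-IDF :', tfidf_sim, tfidf_title)
    print('Doc2Vec:', d2v_sim, d2v_title)
    print('-' * 60)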