"""preprocess.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1cs4nwWCMLuQOS1f9V6xbAG1kKtasrPs2

# Preprocessing
"""

import numpy as np
import pandas as pd
import json
from itertools import islice
import os


def get_data(json_filename='arxiv-metadata-oai-snapshot.json'):
    """Stream the arXiv metadata snapshot one JSON record (one line) at a time."""
    # __file__ assumes this runs as a script; in a notebook, point
    # script_directory at the folder that holds the snapshot file instead.
    script_directory = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(script_directory, json_filename)

    with open(json_path, "rb") as f:
        for line in f:
            yield line


data_gen = get_data()


def get_records(data_gen, chunksize=500):
    """Parse the next `chunksize` lines from the generator into dictionaries."""
    return [json.loads(record) for record in islice(data_gen, chunksize)]


records_per_chunk = 250000
data_records = get_records(data_gen, records_per_chunk)


def split_records(data_records, num_profiles=100, random_state=42):
    """Shuffle the records and split them: the first `num_profiles` become the
    training pool, the remainder is held out for testing."""
    np.random.seed(random_state)
    np.random.shuffle(data_records)
    train_records, test_records = data_records[:num_profiles], data_records[num_profiles:]
    return train_records, test_records


train_records, test_records = split_records(data_records, num_profiles=500)


def get_dataframe(list_of_dicts, columns=None):
    """Build a DataFrame from a list of record dicts; `columns` optionally
    restricts which columns are kept."""
    data = pd.DataFrame(list_of_dicts)
    if columns:
        data = data[columns]
    return data


train_df = get_dataframe(train_records)
test_df = get_dataframe(test_records)


def filter_features(data, features):
    """Keep only the columns listed in `features`."""
    return data[features]


features = ['title', 'categories', 'abstract', 'update_date']
train_df = filter_features(train_df, features)
test_df = filter_features(test_df, features)

# The training pool holds 500 abstracts, so head(10000) simply takes all of them.
train_corpus = train_df['abstract'].head(10000)
test_corpus = test_df['abstract'].head(10000)

train_df.head()

test_df.head()
"""## Removing unnecessary words""" |
|
|
|
import nltk |
|
nltk.download('book') |
|
from nltk.book import * |
|
from nltk.tokenize import sent_tokenize, word_tokenize |
|
|
|
def tokenize_POS(paragraph): |
|
words = word_tokenize(paragraph) |
|
tagged_words = nltk.pos_tag(words) |
|
|
|
|
|
excluded_tags = ['CC', 'DT', 'IN', 'TO', 'PRP', 'PRP$', 'MD', 'WP', 'WP$', 'WRB'] |
|
filtered_words = [word for word, pos in tagged_words if pos not in excluded_tags] |
|
|
|
return ' '.join(filtered_words) |
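
# A minimal, illustrative check of tokenize_POS (the sentence below is made up
# for demonstration): determiners, prepositions, and pronouns should be dropped,
# leaving only the content-bearing words.
sample_sentence = "We present a new method for the clustering of large citation graphs."
print(tokenize_POS(sample_sentence))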

train_corpus = train_df['abstract'].head(10000)
test_corpus = test_df['abstract'].head(10000)

train_corpus[0]

# Strip function words from every abstract in the training corpus.
train_corpus = pd.Series([tokenize_POS(abstract) for abstract in train_corpus])

train_corpus[0]

# Keep a separate copy whose abstracts are tokenized; train_df itself retains the
# original abstracts so they can be shown verbatim in the recommendations.
train_df_doc = train_df.copy()

train_df_doc['abstract'] = train_df_doc['abstract'].apply(tokenize_POS)
"""# TF-IDF""" |
|
|
|
import time |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
def get_recommendations_TFIDF(abstract): |
|
abstract = tokenize_POS(abstract) |
|
corpus = pd.concat([train_corpus, pd.Series(abstract)], ignore_index=True) |
|
|
|
|
|
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2)) |
|
|
|
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus) |
|
|
|
|
|
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) |
|
|
|
|
|
sim_scores = list(enumerate(cosine_sim[-1])) |
|
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) |
|
|
|
|
|
if corpus[int(sim_scores[0][0])].split() == abstract.split() and corpus[int(sim_scores[1][0])].split() == abstract.split(): |
|
print(corpus[int(sim_scores[0][0])].split() == abstract.split()) |
|
print(corpus[int(sim_scores[1][0])].split() == abstract.split()) |
|
paper_indices = int(sim_scores[2][0]) |
|
similarity = "{:.2f}%".format(sim_scores[2][1] * 100) |
|
elif sim_scores[0][0] == 500: |
|
paper_indices = int(sim_scores[1][0]) |
|
similarity = "{:.2f}%".format(sim_scores[1][1] * 100) |
|
else: |
|
paper_indices = int(sim_scores[0][0]) |
|
similarity = "{:.2f}%".format(sim_scores[0][1] * 100) |
|
|
|
title = train_df['title'].iloc[paper_indices] |
|
categories = train_df['categories'].iloc[paper_indices] |
|
abstract = train_df['abstract'].iloc[paper_indices] |
|
|
|
return title, categories, abstract, similarity |
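
# Illustrative usage only (not part of the original pipeline): query the TF-IDF
# recommender with the first held-out abstract and inspect what comes back.
sample_query = test_df['abstract'].iloc[0]
rec_title, rec_categories, rec_abstract, rec_similarity = get_recommendations_TFIDF(sample_query)
print(rec_title, rec_similarity)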

"""# Doc2Vec"""

from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def train_doc2vec_model(corpus, vector_size=100, window=5, min_count=1, epochs=100):
    """Train a Doc2Vec model on the (already tokenized) training corpus."""
    # Each abstract becomes a TaggedDocument whose tag is its position in the corpus.
    tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(corpus)]

    model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model


doc2vec_model = train_doc2vec_model(train_corpus)


def get_recommendations_Doc2Vec(abstract):
    """Recommend the most similar training paper to `abstract` using Doc2Vec embeddings."""
    abstract = tokenize_POS(abstract)
    model = doc2vec_model

    # Infer an embedding for the query and find its nearest training documents.
    abstract_vector = model.infer_vector(abstract.split())
    most_similar = model.dv.most_similar([abstract_vector], topn=2)

    if train_df_doc['abstract'].iloc[int(most_similar[0][0])].split() == abstract.split():
        # The best match is the query itself (it already exists in the corpus); use the runner-up.
        paper_indices = int(most_similar[1][0])
        similarity = "{:.2f}%".format(most_similar[1][1] * 100)
    else:
        paper_indices = int(most_similar[0][0])
        similarity = "{:.2f}%".format(most_similar[0][1] * 100)

    title = train_df['title'].iloc[paper_indices]
    categories = train_df['categories'].iloc[paper_indices]
    abstract = train_df['abstract'].iloc[paper_indices]
    return title, categories, abstract, similarity
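
# Illustrative usage only (not part of the original pipeline): run the first
# held-out abstract through the Doc2Vec recommender for comparison with the
# TF-IDF result above.
rec_title_d2v, rec_categories_d2v, rec_abstract_d2v, rec_similarity_d2v = get_recommendations_Doc2Vec(test_df['abstract'].iloc[0])
print(rec_title_d2v, rec_similarity_d2v)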

"""# Deploy"""

import gradio as gr


def greet(paper):
    """Run both recommenders on the pasted abstract and return their results."""
    title_tfidf, categories_tfidf, abstract_tfidf, similarity_tfidf = get_recommendations_TFIDF(paper)
    title_doc2vec, categories_doc2vec, abstract_doc2vec, similarity_doc2vec = get_recommendations_Doc2Vec(paper)
    return title_tfidf, categories_tfidf, abstract_tfidf, similarity_tfidf, title_doc2vec, categories_doc2vec, abstract_doc2vec, similarity_doc2vec


title = '''SimSearch:

A Similarity Search Tool for Research Paper Abstracts using NLP'''

demo = gr.Interface(
    title=title,
    fn=greet,
    inputs=gr.Textbox(placeholder='Abstract Here'),
    outputs=[gr.Textbox(label='TFIDF Title'),
             gr.Textbox(label='TFIDF Categories'),
             gr.Textbox(label='TFIDF Abstract'),
             gr.Textbox(label='TFIDF Similarity'),
             gr.Textbox(label='Doc2Vec Title'),
             gr.Textbox(label='Doc2Vec Categories'),
             gr.Textbox(label='Doc2Vec Abstract'),
             gr.Textbox(label='Doc2Vec Similarity'),
             ],
    # The second example appears to have already been passed through tokenize_POS
    # (function words removed); the first is a raw abstract.
    examples=[
        ['''A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of events.'''],
        ['''Most physical experiments are usually described repeated measurements random variables . experimental data registered on-line computers form time series outcomes . frequencies different outcomes are compared probabilities provided algorithms quantum theory ( QT ) . spite statistical predictions QT claim was made theory provided most complete description data underlying physical phenomena . claim be easily rejected fine structures , averaged out standard statistical descriptive analysis , were found time series experimental data . search structures one has use more subtle statistical tools which were developed study time series produced various stochastic processes . talk review tools . example show standard descriptive statistical analysis data is unable reveal fine structure simulated sample AR ( 2 ) stochastic process . emphasize once again violation Bell inequalities gives information completeness non locality QT . appropriate way test completeness quantum theory is search fine structures time series experimental data means purity tests studying autocorrelation partial autocorrelation functions .''']]
)

demo.launch(debug=True)