# simsearch / app.py
# ddiddu's picture
# add ngrams
# 9191e97
# -*- coding: utf-8 -*-
"""preprocess.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1cs4nwWCMLuQOS1f9V6xbAG1kKtasrPs2
# Preprocessing
"""
# from google.colab import drive
# drive.mount('/content/drive')
import numpy as np
import pandas as pd
import json
from itertools import islice # for slicing and dicing JSON records
import os
# def get_data(json_filename = 'arxiv-metadata-oai-snapshot.json', data_root = '/content/drive/MyDrive/카이스트/23봄/CS372/project'):
# with open(data_root + '/' + 'dataset' + '/' + json_filename, "rb") as f:
# for line in f:
# yield line
def get_data(json_filename='arxiv-metadata-oai-snapshot.json'):
    """Lazily yield the raw lines of a file located next to this script.

    Args:
        json_filename: File name, resolved against the directory that
            contains this script (an absolute path is used unchanged, as
            ``os.path.join`` discards the directory in that case).

    Yields:
        Each line of the file as ``bytes`` (the file is opened in binary
        mode), including its trailing newline when present.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    # Generator body: the file is only opened once iteration starts.
    with open(os.path.join(here, json_filename), "rb") as handle:
        yield from handle
# Lazy generator over the lines of the default snapshot file; nothing is read
# from disk until the generator is consumed.
data_gen = get_data()
def get_records(data_gen, chunksize=500):
    """Parse the next ``chunksize`` JSON documents from an iterator.

    Args:
        data_gen: Iterator yielding one JSON document per item
            (``str`` or ``bytes``). It is advanced by at most
            ``chunksize`` items; the remainder is left unconsumed.
        chunksize: Maximum number of items to parse.

    Returns:
        A list with the decoded documents, shorter than ``chunksize`` if
        the iterator runs out first.
    """
    head = islice(data_gen, chunksize)
    return list(map(json.loads, head))
# Upper bound on how many lines of the snapshot are parsed into memory
records_per_chunk = 250000
# Parse up to `records_per_chunk` JSON lines from the generator into a list
data_records = get_records(data_gen, records_per_chunk)
def split_records(data_records, num_profiles=100, random_state=42):
    """Shuffle the records reproducibly and split them into train and test sets.

    The shuffle uses a private ``RandomState`` seeded with ``random_state``,
    so the process-wide NumPy random generator is left untouched, and it
    operates on a shallow copy so the caller's sequence keeps its order.
    The resulting permutation is the same one that ``np.random.seed`` followed
    by ``np.random.shuffle`` would produce for the same seed.

    Args:
        data_records: Sequence of records to split.
        num_profiles: Number of shuffled records placed in the training set.
        random_state: Seed for the shuffle; ``None`` gives a fresh random order.

    Returns:
        A ``(train_records, test_records)`` tuple of lists: the first
        ``num_profiles`` shuffled records and all remaining ones.
    """
    shuffled = list(data_records)
    np.random.RandomState(random_state).shuffle(shuffled)
    return shuffled[:num_profiles], shuffled[num_profiles:]
# Splitting the fetched records into train and test records
# (num_profiles=500 is the number of records requested for the training set)
train_records, test_records = split_records(data_records, num_profiles=500)
# Utility method to generate dataframe from list of dictionaries
def get_dataframe(list_of_dicts, columns=None):
    """Build a DataFrame from a list of dictionaries, optionally renaming columns.

    Args:
        list_of_dicts: Records passed straight to ``pd.DataFrame``.
        columns: Optional replacement column labels (list, tuple, NumPy array,
            ``pd.Index``, ...). ``None`` or an empty sequence keeps the labels
            derived from the records.

    Returns:
        The resulting ``pd.DataFrame``.
    """
    frame = pd.DataFrame(list_of_dicts)
    # Explicit None/length test: plain truthiness raises for array-like labels
    # ("truth value of an array is ambiguous").
    if columns is not None and len(columns) > 0:
        frame.columns = columns
    return frame
# Generating dataframes for train and test records
# (column labels are taken from the record keys, since no `columns` are passed)
train_df = get_dataframe(train_records)
test_df = get_dataframe(test_records)
# Utility method to filter out certain features which are of use
def filter_features(data, features):
    """Return an independent copy of ``data`` restricted to ``features``.

    Args:
        data: Source ``pd.DataFrame``.
        features: Column label or list of column labels to keep.

    Returns:
        The selected column(s). The result is an explicit copy, so assigning
        to it later neither touches ``data`` nor raises pandas'
        SettingWithCopyWarning.
    """
    return data[features].copy()
# Filtering the train and test dataframes for the features we selected
features = ['title', 'categories', 'abstract', 'update_date']
train_df = filter_features(train_df, features)
test_df = filter_features(test_df, features)
# define the corpus to pull from (at most the first 10000 abstracts of each frame)
train_corpus = train_df['abstract'].head(10000)
test_corpus = test_df['abstract'].head(10000)
# Notebook leftovers: the results of these two calls are discarded
train_df.head()
test_df.head()
# train_df.to_csv('/content/drive/MyDrive/카이스트/23봄/CS372/project/dataset/train_df.csv', index = False)
# test_df.to_csv('/content/drive/MyDrive/카이스트/23봄/CS372/project/dataset/test_df.csv', index = False)
"""## Removing unnecessary words"""
import nltk
nltk.download('book')
from nltk.book import *
from nltk.tokenize import sent_tokenize, word_tokenize
def tokenize_POS(paragraph):
    """Strip low-information words from a text based on their POS tag.

    The text is split with ``word_tokenize``, tagged with ``nltk.pos_tag``,
    and every token whose tag is in the exclusion list is dropped.

    Args:
        paragraph: Text to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Tags considered unimportant for similarity search
    skipped_tags = ('CC', 'DT', 'IN', 'TO', 'PRP', 'PRP$', 'MD', 'WP', 'WP$', 'WRB')
    kept = []
    for token, tag in nltk.pos_tag(word_tokenize(paragraph)):
        if tag in skipped_tags:
            continue
        kept.append(token)
    return ' '.join(kept)
# define the corpus to pull from (at most the first 10000 abstracts of each frame)
train_corpus = train_df['abstract'].head(10000)
test_corpus = test_df['abstract'].head(10000)
# POS-filtered training corpus, re-indexed from 0 by the new Series
train_corpus = pd.Series([tokenize_POS(abstract) for abstract in train_corpus])
# Work on a copy: a plain assignment would only alias train_df, so filtering
# the abstracts below would also overwrite the original abstracts that the
# recommendation functions return to the user.
train_df_doc = train_df.copy()
train_df_doc['abstract'] = train_df_doc['abstract'].apply(tokenize_POS)
"""# TF-IDF"""
import time # for getting the runtime of cells
from sklearn.feature_extraction.text import TfidfVectorizer # for building word representations
from sklearn.metrics.pairwise import cosine_similarity # for getting similarity metrics
def get_recommendations_TFIDF(abstract):
    """Find the training paper most similar to ``abstract`` using TF-IDF.

    The query is POS-filtered with ``tokenize_POS``, appended to
    ``train_corpus`` and vectorised together with it (unigrams and bigrams).
    Training abstracts are ranked by cosine similarity to the query. The
    query's own row is never returned, and training abstracts whose tokens
    are identical to the query are skipped, so the result is the closest
    *different* paper whenever one exists.

    Args:
        abstract: Raw abstract text to search for.

    Returns:
        A ``(title, categories, abstract, similarity)`` tuple read from
        ``train_df`` for the selected paper; ``similarity`` is a string such
        as ``"12.34%"``.

    Raises:
        ValueError: If the training corpus is empty.
    """
    query_text = tokenize_POS(abstract)
    query_tokens = query_text.split()
    corpus = pd.concat([train_corpus, pd.Series(query_text)], ignore_index=True)
    # The query is appended last, so its row index follows from the corpus
    # size instead of a hard-coded training-set size.
    query_index = len(corpus) - 1

    # The vectorizer is fitted per request so the query's own terms are part
    # of the vocabulary.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

    # Only the query row is needed; a 2-D slice avoids building the full
    # N x N similarity matrix.
    scores = cosine_similarity(tfidf_matrix[query_index:], tfidf_matrix).ravel()
    # Stable sort keeps ties in corpus order, like sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind="stable")

    candidates = [int(i) for i in ranked if int(i) != query_index]
    if not candidates:
        raise ValueError("training corpus is empty; nothing to recommend")
    # Prefer the best match that is not a verbatim duplicate of the query;
    # fall back to the best match if every training abstract is a duplicate.
    paper_index = next(
        (i for i in candidates if corpus.iloc[i].split() != query_tokens),
        candidates[0],
    )
    # Format similarity as a percentage string with two decimal places
    similarity = "{:.2f}%".format(scores[paper_index] * 100)

    title = train_df['title'].iloc[paper_index]
    categories = train_df['categories'].iloc[paper_index]
    abstract = train_df['abstract'].iloc[paper_index]
    return title, categories, abstract, similarity
"""# Doc2Vec"""
import time
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
def train_doc2vec_model(corpus, vector_size=100, window=5, min_count=1, epochs=100):
    """Train a Doc2Vec model on a corpus of whitespace-tokenisable texts.

    Each document is split on whitespace and tagged with its position in
    ``corpus`` (as a string), so a tag can be converted back to a row number.

    Args:
        corpus: Iterable of document strings.
        vector_size: Forwarded to ``Doc2Vec``.
        window: Forwarded to ``Doc2Vec``.
        min_count: Forwarded to ``Doc2Vec``.
        epochs: Forwarded to ``Doc2Vec``; also the number of training epochs.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    documents = []
    for position, text in enumerate(corpus):
        documents.append(TaggedDocument(words=text.split(), tags=[str(position)]))

    hyperparameters = dict(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
    )
    model = Doc2Vec(**hyperparameters)
    # Vocabulary must exist before training starts
    model.build_vocab(documents)
    model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)
    return model
# Train the Doc2Vec model once at import time, using the default
# hyperparameters, so get_recommendations_Doc2Vec can reuse it for every query
doc2vec_model = train_doc2vec_model(train_corpus)
def get_recommendations_Doc2Vec(abstract):
    """Find the training paper most similar to ``abstract`` using Doc2Vec.

    The query is POS-filtered, embedded with the module-level
    ``doc2vec_model`` and compared against the trained document vectors.
    If the best hit has exactly the same tokens as the query, the runner-up
    is returned instead.

    Args:
        abstract: Raw abstract text to search for.

    Returns:
        A ``(title, categories, abstract, similarity)`` tuple read from
        ``train_df`` for the selected paper; ``similarity`` is a string such
        as ``"12.34%"``.
    """
    query_tokens = tokenize_POS(abstract).split()
    query_vector = doc2vec_model.infer_vector(query_tokens)
    # Two neighbours are enough: the best hit plus a fallback
    neighbours = doc2vec_model.dv.most_similar([query_vector], topn=2)

    best_tag = neighbours[0][0]
    best_tokens = train_df_doc['abstract'].iloc[int(best_tag)].split()
    # Skip the top hit when it is the query itself
    rank = 1 if best_tokens == query_tokens else 0

    tag, score = neighbours[rank]
    row = int(tag)
    similarity = f"{score * 100:.2f}%"
    return (
        train_df['title'].iloc[row],
        train_df['categories'].iloc[row],
        train_df['abstract'].iloc[row],
        similarity,
    )
"""# Deploy"""
# !pip install gradio
import gradio as gr
def greet(paper):
    """Run both recommenders on ``paper`` and return their results side by side.

    Args:
        paper: Abstract text entered by the user.

    Returns:
        An 8-tuple: title, categories, abstract and similarity from the
        TF-IDF recommender, followed by the same four fields from Doc2Vec.
    """
    tfidf_title, tfidf_categories, tfidf_abstract, tfidf_similarity = get_recommendations_TFIDF(paper)
    tfidf_fields = (tfidf_title, tfidf_categories, tfidf_abstract, tfidf_similarity)
    d2v_title, d2v_categories, d2v_abstract, d2v_similarity = get_recommendations_Doc2Vec(paper)
    doc2vec_fields = (d2v_title, d2v_categories, d2v_abstract, d2v_similarity)
    return tfidf_fields + doc2vec_fields
# Heading shown at the top of the web page
title = '''SimSearch:\n
A Similarity Search Tool for Research Paper Abstracts using NLP'''
# One text input (the abstract) and eight text outputs, in the order returned
# by greet(). Outputs use gr.Textbox like the input: the legacy `gr.outputs`
# namespace is deprecated and no longer available in newer Gradio releases.
demo = gr.Interface(
    title=title,
    fn=greet,
    inputs=gr.Textbox(placeholder='Abstract Here'),
    outputs=[
        gr.Textbox(label='TFIDF Title'),
        gr.Textbox(label='TFIDF Categories'),
        gr.Textbox(label='TFIDF Abstract'),
        gr.Textbox(label='TFIDF Similarity'),
        gr.Textbox(label='Doc2Vec Title'),
        gr.Textbox(label='Doc2Vec Categories'),
        gr.Textbox(label='Doc2Vec Abstract'),
        gr.Textbox(label='Doc2Vec Similarity'),
    ],
    examples=[
        # ['''In this paper we consider permutations of sequences of partitions, obtaining\na result which parallels von Neumann's theorem on permutations of dense\nsequences and uniformly distributed sequences of points.\n'''],
        ['''A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of events.'''],
        ['''Most physical experiments are usually described repeated measurements random variables . experimental data registered on-line computers form time series outcomes . frequencies different outcomes are compared probabilities provided algorithms quantum theory ( QT ) . spite statistical predictions QT claim was made theory provided most complete description data underlying physical phenomena . claim be easily rejected fine structures , averaged out standard statistical descriptive analysis , were found time series experimental data . search structures one has use more subtle statistical tools which were developed study time series produced various stochastic processes . talk review tools . example show standard descriptive statistical analysis data is unable reveal fine structure simulated sample AR ( 2 ) stochastic process . emphasize once again violation Bell inequalities gives information completeness non locality QT . appropriate way test completeness quantum theory is search fine structures time series experimental data means purity tests studying autocorrelation partial autocorrelation functions .'''],
    ],
)
# demo.launch(share=True, debug=True)
demo.launch(debug=True)