"""preprocess.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1cs4nwWCMLuQOS1f9V6xbAG1kKtasrPs2

# Preprocessing
"""

import numpy as np
import pandas as pd
import json
from itertools import islice
import os


def get_data(json_filename='arxiv-metadata-oai-snapshot.json'):
    """Stream the arXiv metadata snapshot one JSON record (one line) at a time."""
    # __file__ assumes this runs as a script; in a notebook, point
    # script_directory at the folder that holds the snapshot file instead.
    script_directory = os.path.dirname(os.path.abspath(__file__))
    json_path = os.path.join(script_directory, json_filename)

    with open(json_path, "rb") as f:
        for line in f:
            yield line


data_gen = get_data()


def get_records(data_gen, chunksize=500):
    """Parse the next `chunksize` lines from the generator into dictionaries."""
    return [json.loads(record) for record in islice(data_gen, chunksize)]


records_per_chunk = 250000
data_records = get_records(data_gen, records_per_chunk)


def split_records(data_records, num_profiles=100, random_state=42):
    """Shuffle the records and split them: the first `num_profiles` become the
    training pool, the remainder is held out for testing."""
    np.random.seed(random_state)
    np.random.shuffle(data_records)
    train_records, test_records = data_records[:num_profiles], data_records[num_profiles:]
    return train_records, test_records


train_records, test_records = split_records(data_records, num_profiles=500)


def get_dataframe(list_of_dicts, columns=None):
    """Build a DataFrame from a list of record dicts; `columns` optionally
    restricts which columns are kept."""
    data = pd.DataFrame(list_of_dicts)
    if columns:
        data = data[columns]
    return data


train_df = get_dataframe(train_records)
test_df = get_dataframe(test_records)


def filter_features(data, features):
    """Keep only the columns listed in `features`."""
    return data[features]


features = ['title', 'categories', 'abstract', 'update_date']
train_df = filter_features(train_df, features)
test_df = filter_features(test_df, features)

# The training pool holds 500 abstracts, so head(10000) simply takes all of them.
train_corpus = train_df['abstract'].head(10000)
test_corpus = test_df['abstract'].head(10000)

train_df.head()

test_df.head()
"""## Removing unnecessary words""" |
|
|
|
import nltk |
|
nltk.download('book') |
|
from nltk.book import * |
|
from nltk.tokenize import sent_tokenize, word_tokenize |
|
|
|
def tokenize_POS(paragraph): |
|
words = word_tokenize(paragraph) |
|
tagged_words = nltk.pos_tag(words) |
|
|
|
|
|
excluded_tags = ['CC', 'DT', 'IN', 'TO', 'PRP', 'PRP$', 'MD', 'WP', 'WP$', 'WRB'] |
|
filtered_words = [word for word, pos in tagged_words if pos not in excluded_tags] |
|
|
|
return ' '.join(filtered_words) |
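
# A minimal, illustrative check of tokenize_POS (the sentence below is made up
# for demonstration): determiners, prepositions, and pronouns should be dropped,
# leaving only the content-bearing words.
sample_sentence = "We present a new method for the clustering of large citation graphs."
print(tokenize_POS(sample_sentence))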

train_corpus = train_df['abstract'].head(10000)
test_corpus = test_df['abstract'].head(10000)

train_corpus[0]

# Strip function words from every abstract in the training corpus.
train_corpus = pd.Series([tokenize_POS(abstract) for abstract in train_corpus])

train_corpus[0]

# Keep a separate copy whose abstracts are tokenized; train_df itself retains the
# original abstracts so they can be shown verbatim in the recommendations.
train_df_doc = train_df.copy()

train_df_doc['abstract'] = train_df_doc['abstract'].apply(tokenize_POS)
"""# TF-IDF""" |
|
|
|
import time |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
def get_recommendations_TFIDF(abstract): |
|
abstract = tokenize_POS(abstract) |
|
corpus = pd.concat([train_corpus, pd.Series(abstract)], ignore_index=True) |
|
|
|
|
|
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2)) |
|
|
|
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus) |
|
|
|
|
|
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix) |
|
|
|
|
|
sim_scores = list(enumerate(cosine_sim[-1])) |
|
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) |
|
|
|
|
|
if corpus[int(sim_scores[0][0])].split() == abstract.split() and corpus[int(sim_scores[1][0])].split() == abstract.split(): |
|
print(corpus[int(sim_scores[0][0])].split() == abstract.split()) |
|
print(corpus[int(sim_scores[1][0])].split() == abstract.split()) |
|
paper_indices = int(sim_scores[2][0]) |
|
similarity = "{:.2f}%".format(sim_scores[2][1] * 100) |
|
elif sim_scores[0][0] == 500: |
|
paper_indices = int(sim_scores[1][0]) |
|
similarity = "{:.2f}%".format(sim_scores[1][1] * 100) |
|
else: |
|
paper_indices = int(sim_scores[0][0]) |
|
similarity = "{:.2f}%".format(sim_scores[0][1] * 100) |
|
|
|
title = train_df['title'].iloc[paper_indices] |
|
categories = train_df['categories'].iloc[paper_indices] |
|
abstract = train_df['abstract'].iloc[paper_indices] |
|
|
|
return title, categories, abstract, similarity |
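
# Illustrative usage only (not part of the original pipeline): query the TF-IDF
# recommender with the first held-out abstract and inspect what comes back.
sample_query = test_df['abstract'].iloc[0]
rec_title, rec_categories, rec_abstract, rec_similarity = get_recommendations_TFIDF(sample_query)
print(rec_title, rec_similarity)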

"""# Doc2Vec"""

from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def train_doc2vec_model(corpus, vector_size=100, window=5, min_count=1, epochs=100):
    """Train a Doc2Vec model on the (already tokenized) training corpus."""
    # Each abstract becomes a TaggedDocument whose tag is its position in the corpus.
    tagged_data = [TaggedDocument(words=doc.split(), tags=[str(i)]) for i, doc in enumerate(corpus)]

    model = Doc2Vec(vector_size=vector_size, window=window, min_count=min_count, epochs=epochs)
    model.build_vocab(tagged_data)
    model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
    return model


doc2vec_model = train_doc2vec_model(train_corpus)


def get_recommendations_Doc2Vec(abstract):
    """Recommend the most similar training paper to `abstract` using Doc2Vec embeddings."""
    abstract = tokenize_POS(abstract)
    model = doc2vec_model

    # Infer an embedding for the query and find its nearest training documents.
    abstract_vector = model.infer_vector(abstract.split())
    most_similar = model.dv.most_similar([abstract_vector], topn=2)

    if train_df_doc['abstract'].iloc[int(most_similar[0][0])].split() == abstract.split():
        # The best match is the query itself (it already exists in the corpus); use the runner-up.
        paper_indices = int(most_similar[1][0])
        similarity = "{:.2f}%".format(most_similar[1][1] * 100)
    else:
        paper_indices = int(most_similar[0][0])
        similarity = "{:.2f}%".format(most_similar[0][1] * 100)

    title = train_df['title'].iloc[paper_indices]
    categories = train_df['categories'].iloc[paper_indices]
    abstract = train_df['abstract'].iloc[paper_indices]
    return title, categories, abstract, similarity
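
# Illustrative usage only (not part of the original pipeline): run the first
# held-out abstract through the Doc2Vec recommender for comparison with the
# TF-IDF result above.
rec_title_d2v, rec_categories_d2v, rec_abstract_d2v, rec_similarity_d2v = get_recommendations_Doc2Vec(test_df['abstract'].iloc[0])
print(rec_title_d2v, rec_similarity_d2v)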

"""# Deploy"""

import gradio as gr


def greet(paper):
    """Run both recommenders on the pasted abstract and return their results."""
    title_tfidf, categories_tfidf, abstract_tfidf, similarity_tfidf = get_recommendations_TFIDF(paper)
    title_doc2vec, categories_doc2vec, abstract_doc2vec, similarity_doc2vec = get_recommendations_Doc2Vec(paper)
    return title_tfidf, categories_tfidf, abstract_tfidf, similarity_tfidf, title_doc2vec, categories_doc2vec, abstract_doc2vec, similarity_doc2vec


title = '''SimSearch:

A Similarity Search Tool for Research Paper Abstracts using NLP'''

demo = gr.Interface(
    title=title,
    fn=greet,
    inputs=gr.Textbox(placeholder='Abstract Here'),
    outputs=[gr.Textbox(label='TFIDF Title'),
             gr.Textbox(label='TFIDF Categories'),
             gr.Textbox(label='TFIDF Abstract'),
             gr.Textbox(label='TFIDF Similarity'),
             gr.Textbox(label='Doc2Vec Title'),
             gr.Textbox(label='Doc2Vec Categories'),
             gr.Textbox(label='Doc2Vec Abstract'),
             gr.Textbox(label='Doc2Vec Similarity'),
             ],
    # The second example appears to have already been passed through tokenize_POS
    # (function words removed); the first is a raw abstract.
    examples=[
        ['''A fully differential calculation in perturbative quantum chromodynamics is presented for the production of massive photon pairs at hadron colliders. All next-to-leading order perturbative contributions from quark-antiquark, gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as all-orders resummation of initial-state gluon radiation valid at next-to-next-to-leading logarithmic accuracy. The region of phase space is specified in which the calculation is most reliable. Good agreement is demonstrated with data from the Fermilab Tevatron, and predictions are made for more detailed tests with CDF and DO data. Predictions are shown for distributions of diphoton pairs produced at the energy of the Large Hadron Collider (LHC). Distributions of the diphoton pairs from the decay of a Higgs boson are contrasted with those produced from QCD processes at the LHC, showing that enhanced sensitivity to the signal can be obtained with judicious selection of events.'''],
        ['''Most physical experiments are usually described repeated measurements random variables . experimental data registered on-line computers form time series outcomes . frequencies different outcomes are compared probabilities provided algorithms quantum theory ( QT ) . spite statistical predictions QT claim was made theory provided most complete description data underlying physical phenomena . claim be easily rejected fine structures , averaged out standard statistical descriptive analysis , were found time series experimental data . search structures one has use more subtle statistical tools which were developed study time series produced various stochastic processes . talk review tools . example show standard descriptive statistical analysis data is unable reveal fine structure simulated sample AR ( 2 ) stochastic process . emphasize once again violation Bell inequalities gives information completeness non locality QT . appropriate way test completeness quantum theory is search fine structures time series experimental data means purity tests studying autocorrelation partial autocorrelation functions .''']]
)

demo.launch(debug=True)