PatentSolver

Build error

App Files Files Community

PatentSolver / Word2vec /run.py

RedBaron5

Duplicate from xin/PatentSolver

c13b805 almost 2 years ago

raw

history blame contribute delete

2.23 kB

	#!/usr/bin/env python3
	# -- coding: utf-8 --
	# @File : test_sentence_similarity.py
	# @Author: nixin
	# @Date : 2019-03-06

	import numpy as np
	from scipy import spatial
	from gensim.models import word2vec
	import pandas as pd



	# load the trained word vector model
	model = word2vec.Word2Vec.load('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/trained_word2vec.model')
	index2word_set = set(model.wv.index2word)

	def avg_feature_vector(sentence, model, num_features, index2word_set):
	words = sentence.split()
	feature_vec = np.zeros((num_features, ), dtype='float32')
	n_words = 0
	for word in words:
	if word in index2word_set:
	n_words += 1
	feature_vec = np.add(feature_vec, model[word])
	if (n_words > 0):
	feature_vec = np.divide(feature_vec, n_words)
	return feature_vec

	#read problem file
	problem_corpus = pd.read_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/data_problem_corpus/problem_corpus_sample_cleaned.csv')
	problem_corpus = problem_corpus.head(100)

	target_problem = 'strategic cleavage of such a target rna will destroy its ability to direct synthesis of an encoded protein'
	target_domain = 'A'

	# remove the same domain's problems
	problem_corpus = problem_corpus[problem_corpus.Domain != 'A']


	# choose the time range
	problem_corpus = problem_corpus[problem_corpus['publication_year'].between(2015, 2017)]


	value=[]
	for each_problem in problem_corpus['First part Contradiction']:
	s1_afv = avg_feature_vector(target_problem, model=model, num_features=100, index2word_set=index2word_set)
	s2_afv = avg_feature_vector(each_problem, model=model, num_features=100, index2word_set=index2word_set)
	sim_value = format( 1 - spatial.distance.cosine(s1_afv, s2_afv), '.2f')
	value.append(sim_value)

	problem_corpus[['similarity_value', 'target_problem']] = value, target_problem

	print(problem_corpus)

	# set similarity threshold
	problem_corpus_final = problem_corpus[problem_corpus.similarity_value>= '0.8']
	# print(problem_corpus.columns())

	problem_corpus_final.to_csv('/Users/nixin/PycharmProjects/PatentSolver_demonstrator/Word2vec/simialrity_result/test.csv', index=False)
	print(problem_corpus_final)