#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @File : 7.demo_app.py
# @Author: nixin
# @Date : 2021/11/27
from PIL import Image
import time
import datetime
from scipy import spatial
from gensim.models import word2vec
from keras.models import load_model
from LSTM.config import siamese_config
from LSTM.inputHandler import create_test_data, word_embed_meta_data
from simpletransformers.question_answering import QuestionAnsweringModel
from functools import partial
from functions import *
from skcriteria import Data, MAX, MIN
from skcriteria.madm import simple, closeness
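# NOTE: `from functions import *` above is assumed to provide the project helpers
# used below (patentinput, clean_patent, count_patent, convert_df, poolcontext,
# init, single_process_scraper, XMLtoTEXT, save_uploadedfile, extractor,
# extract_info_text, input_domain, choosing_month_period, avg_feature_vector,
# create_download_zip, creat_query_id, csv_to_json, json_to_csv, QA_prediction)
# as well as the common modules used everywhere. The explicit imports below are
# a readability hedge and are assumed to match what `functions` already pulls in.
import os
import csv
import glob
import multiprocessing as mp
import pandas as pd
import streamlit as st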
#===================#
# Streamlit code
#===================#
# st.title('PatentSolver')
st.markdown("<h1 style='text-align: center; color: orange;'>PatentSolver</h1>", unsafe_allow_html=True)
image = Image.open('profile.png')
col1, mid, col2 = st.columns([50, 10, 30])
with col1:
    st.header('Achieve inventive ideas from U.S. Patents.')
with col2:
    st.image(image, width=150)
st.write('This demo app aims to explore latent inventive solutions from U.S. patents across different domains.')
st.write('Click the button in the top left corner to start.')
st.caption('Built on natural language processing techniques for semantic similarity computation, question answering, and multiple criteria decision analysis,'
           ' this demo app is finally here.')
st.caption('Introduction video: https://youtu.be/asDsOCuFprQ')
st.caption('Please try it out and send us feedback (nxnixin at gmail.com) since it is still very young :)')
add_selectbox = st.sidebar.selectbox(
    "Which function would you like to choose?",
    ('Start from the following options', "1. Patent details scraper", "2. Prepare patents (.txt) ",
     "3. Extract problems from patents", "4. Similar problem extractor",
     "5. Problem-solution matching", "6. Inventive solutions ranking")
)
#===================#
# Function 1
#===================#
if add_selectbox == '1. Patent details scraper':
    # st.title('PatentSolver_patent details')
    app_target = "To scrape details of the given U.S. patents"
    st.subheader(app_target)
    # user types the inputs
    user_input_patent_number = st.text_input('Type patent number')
    st.caption('1. Use "," to separate multiple patent numbers. 2. Please delete previous inputs '
               'when changing or adding patents. 3. Google Patents search: https://patents.google.com/ '
               '4. E.g. US10393039B2, US9533047, US8755039B2')
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    # ~~~              prepare patents              ~~~ #
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
    if st.button('Run'):
        with st.spinner('Wait for it...'):
            start_time = time.time()
            list_of_patents = patentinput(user_input_patent_number)
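            # `patentinput` (from functions) is assumed to split the comma-separated
            # input into a cleaned list of patent numbers, e.g.
            # 'US10393039B2, US9533047' -> ['US10393039B2', 'US9533047'].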
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            # ~~~ Parameters for data_patent_details file ~~~ #
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            path_to_data = "data_patent_details/"  #### don't forget to change
            ## Create csv file to store the data_patent_details from the patent runs
            # (1) Specify column order of patents
            # (2) Create csv if it does not exist in the data_patent_details path
            data_column_order = ['inventor_name',
                                 'assignee_name_orig',
                                 'assignee_name_current',
                                 'pub_date',
                                 'priority_date',
                                 'grant_date',
                                 'filing_date',
                                 'forward_cite_no_family',
                                 'forward_cite_yes_family',
                                 'backward_cite_no_family',
                                 'backward_cite_yes_family',
                                 'patent',
                                 'url',
                                 'abstract_text']
            # delete the previous csv file (if any), then write a fresh one with the header row
            if 'edison_patents.csv' in os.listdir(path_to_data):
                os.remove(path_to_data + 'edison_patents.csv')
            with open(path_to_data + 'edison_patents.csv', 'w', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(data_column_order)
            ########### Run pool process #############
            if __name__ == "__main__":
                ## Create a lock to prevent collisions when processes try to write to the same file
                l = mp.Lock()
                ## Use a pool of workers where the number of processes is equal to
                ## the number of cpus - 1
                with poolcontext(processes=mp.cpu_count() - 1, initializer=init, initargs=(l,)) as pool:
                    pool.map(partial(single_process_scraper, path_to_data_file=path_to_data + 'edison_patents.csv',
                                     data_column_order=data_column_order),
                             list_of_patents)
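            # `poolcontext` and `init` come from functions; a minimal sketch of the
            # assumed implementation (a contextmanager around mp.Pool, with `init`
            # stashing the lock in a module-level global for the worker processes):
            #
            #   from contextlib import contextmanager
            #
            #   @contextmanager
            #   def poolcontext(*args, **kwargs):
            #       pool = mp.Pool(*args, **kwargs)
            #       try:
            #           yield pool
            #       finally:
            #           pool.terminate()
            #
            #   def init(lock):
            #       global write_lock  # used by single_process_scraper
            #       write_lock = lock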
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            # ~~~       clean raw data_patent_details       ~~~ #
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            ## read the Google Patents scraper's results
            table = pd.read_csv('data_patent_details/edison_patents.csv')
            # clean raw patent results
            results = clean_patent(table)
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            # ~~~              count number                 ~~~ #
            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
            results = count_patent(results)
            st.success('Done!')
            st.write("Process is finished within %s seconds" % round(time.time() - start_time, 2))
            # show and download the results
            st.dataframe(results)
            csv_data = convert_df(results)  # to download results (renamed so the csv module is not shadowed)
            st.download_button(
                label="Download",
                data=csv_data,
                file_name='results.csv',
                mime='text/csv',
            )
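            # `convert_df` (from functions) is assumed to follow the usual Streamlit
            # download pattern, roughly:
            #
            #   @st.cache
            #   def convert_df(df):
            #       return df.to_csv(index=False).encode('utf-8')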
#===================#
# Function 2
#===================#
elif add_selectbox == '2. Prepare patents (.txt) ':
    file_path_saved = 'patent_text/'
    app_target = "To convert patent (.xml) files to patent (.txt) files"
    st.subheader(app_target)
    st.caption(
        'Please first choose "Patent Grant Full Text Data (No Images)" from https://developer.uspto.gov/product/patent-grant-full-text-dataxml to download the U.S. patents (.xml) you want.')
    uploaded_files = st.file_uploader("Choose U.S. patent files", type='XML', accept_multiple_files=True)
    if st.button('Run'):
        with st.spinner('Wait for it...'):
            start_time = time.time()
            # clear files left over from previous runs, then convert the uploads
            previous_files = glob.glob('patent_text/*')
            if len(previous_files) == 0:
                print("Directory is empty")
            else:
                print("Directory is not empty")
                for each in previous_files:
                    os.remove(each)  # remove previous files
            for uploaded_file in uploaded_files:
                XMLtoTEXT(patent_xml=uploaded_file, saved_file_path=file_path_saved)
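            # `XMLtoTEXT` (from functions) is assumed to parse one uploaded USPTO
            # full-text XML file and write the extracted patents as .txt files
            # into `saved_file_path`.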
            path = os.listdir('patent_text/')
            st.write(path)
            st.success('Done!')
            st.write("Process is finished within %s seconds" % round(time.time() - start_time, 2))
            # download patents (txt) as a zip file
            create_download_zip(zip_directory='patent_text',
                                zip_path='zip_file/',
                                filename='US_patents')
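            # A minimal sketch of the assumed `create_download_zip` helper
            # (zip the directory, then offer it via st.download_button):
            #
            #   import shutil
            #
            #   def create_download_zip(zip_directory, zip_path, filename):
            #       archive = shutil.make_archive(zip_path + filename, 'zip', zip_directory)
            #       with open(archive, 'rb') as fp:
            #           st.download_button('Download ZIP', fp, file_name=filename + '.zip',
            #                              mime='application/zip')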
#===================#
# Function 3
#===================#
elif add_selectbox == '3. Extract problems from patents':
    app_target = "To extract problems from patents"
    st.subheader(app_target)
    st.caption('Please choose one or several patents (from Function 2).')
    uploaded_files = st.file_uploader("Choose U.S. patents", type='txt', accept_multiple_files=True)
    print(uploaded_files)
    # check whether the folder is empty
    if len(os.listdir('Data/input/US_patents')) == 0:
        print("Directory is empty")
    else:
        print("Directory is not empty")
        files = glob.glob('Data/input/US_patents/*')
        for each in files:
            os.remove(each)  # remove previous files
    # save uploaded files into the folder (Data/input/US_patents)
    if uploaded_files is not None:
        for f in uploaded_files:
            save_uploadedfile(f)
    if st.button('Extract'):
        with st.spinner('Wait for it...'):
            start_time = time.time()
            extractor('US_patents')  # extract problems from this folder (Data/input/US_patents)
            st.success('Done!')
            st.write("Process is finished within %s seconds" % round(time.time() - start_time, 2))
            table = extract_info_text()
            st.dataframe(table)
            csv_data = convert_df(table)  # to download problem results
            st.download_button(
                label="Download",
                data=csv_data,
                file_name='results.csv',
                mime='text/csv',
            )
#===================#
# Function 4
#===================#
elif add_selectbox == '4. Similar problem extractor':
    app_target = "To extract similar problems from U.S. patents in different domains"
    st.subheader(app_target)
    st.caption('Please type one target problem you want from Function 3.')
    # user types the inputs
    user_input_patent_sentence = st.text_input('Type one patent problem sentence')
    # choose patent domain
    select_domain = st.selectbox('Which domain does it belong to?',
                                 ['A (Human necessities)', 'B (Performing operations; transporting)',
                                  'C (Chemistry; metallurgy)', 'D (Textiles; paper)', 'E (Fixed constructions)',
                                  'F (Mechanical engineering; lighting; heating; weapons; blasting engines or pumps)',
                                  'G (Physics)', 'H (Electricity)'])
    user_input_domain = input_domain(select_domain)  # get the domain label, e.g. A, B, C
    # choose one of the trained models
    select_model = st.selectbox('Which model do you want?',
                                ['IDM-Similar', 'SAM-IDM'])
    st.caption('1. IDM-Similar is based on Word2vec neural networks \n 2. SAM-IDM is based on LSTM neural networks')
    # choose the time period for the compared problems
    choose_time_range = st.date_input("Time Period", [datetime.date(2019, 5, 1), datetime.date(2019, 5, 31)])
    start = datetime.datetime.combine(choose_time_range[0], datetime.datetime.min.time())  # receive the input of start time
    end = datetime.datetime.combine(choose_time_range[1], datetime.datetime.min.time())  # receive the input of end time
    st.caption('1. A longer time period results in a longer waiting time; one month is suggested. \n '
               '2. The problem sample corpus covers 2006-2020, so please choose a period within this range. ')
    start_year = int(start.strftime("%Y"))
    start_month = int(start.strftime("%m"))
    end_year = int(end.strftime("%Y"))
    end_month = int(end.strftime("%m"))
    if select_model == 'IDM-Similar':
        select_threshold = st.slider('Similarity Threshold:', 0.6, 1.0, 0.8)
    else:
        select_threshold = st.slider('Similarity Threshold:', 0.1, 1.0, 0.2)
    if select_model == 'IDM-Similar':  # user chooses IDM-Similar
        if st.button('Run'):
            with st.spinner('Wait for it...'):
                start_time = time.time()
                ################################
                # IDM-Similar model (Word2vec)
                ################################
                # load the trained word vector model
                model = word2vec.Word2Vec.load('Word2vec/trained_word2vec.model')
                index2word_set = set(model.wv.index2word)
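                # `avg_feature_vector` (from functions) is assumed to be the standard
                # mean-of-word-vectors sentence embedding (gensim < 4 API), roughly:
                #
                #   import numpy as np
                #
                #   def avg_feature_vector(sentence, model, num_features, index2word_set):
                #       feature_vec = np.zeros((num_features,), dtype='float32')
                #       n_words = 0
                #       for word in sentence.split():
                #           if word in index2word_set:
                #               n_words += 1
                #               feature_vec = np.add(feature_vec, model.wv[word])
                #       if n_words > 0:
                #           feature_vec = np.divide(feature_vec, n_words)
                #       return feature_vec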
                # read problem patent corpus
                problem_corpus = pd.read_csv('data_problem_corpus/problem_corpus_full_cleaned.csv')
                # problem_corpus = problem_corpus.head(500)
                print('--------------------')
                print(problem_corpus.columns)
                print('--------------------')
                target_problem = user_input_patent_sentence
                target_domain = user_input_domain
                # remove the same domain's problems
                problem_corpus = problem_corpus[problem_corpus.Domain != target_domain]
                # choose the month period
                problem_corpus = choosing_month_period(problem_corpus=problem_corpus, start_year=start_year,
                                                       end_year=end_year, start_month=start_month, end_month=end_month)
                print(problem_corpus)
                print(problem_corpus.columns)
                print('=======')
                # compute the similarity value (the target vector is fixed, so compute it once)
                s1_afv = avg_feature_vector(target_problem, model=model, num_features=100, index2word_set=index2word_set)
                value_1 = []
                for each_problem in problem_corpus['First part Contradiction']:
                    s2_afv = avg_feature_vector(each_problem, model=model, num_features=100, index2word_set=index2word_set)
                    sim_value = format(1 - spatial.distance.cosine(s1_afv, s2_afv), '.2f')
                    value_1.append(sim_value)
                print("++++++++++")
                # assign the two new columns separately (a tuple assignment would misalign them)
                problem_corpus['similarity_value_1'] = value_1
                problem_corpus['target_problem'] = target_problem
                value_2 = []
                for each_problem in problem_corpus['Second part Contradiction']:
                    s2_afv = avg_feature_vector(each_problem, model=model, num_features=100, index2word_set=index2word_set)
                    sim_value = format(1 - spatial.distance.cosine(s1_afv, s2_afv), '.2f')
                    value_2.append(sim_value)
                problem_corpus['similarity_value_2'] = value_2
                print("++++++++++")
                print(problem_corpus)
                print(problem_corpus.columns)
                print("++++++++++")
                problem_corpus_1 = problem_corpus[['patent_number', 'Domain', 'First part Contradiction', 'publication_date',
                                                   'publication_year', 'publication_month', 'label', 'similarity_value_1',
                                                   'target_problem']]
                problem_corpus_1 = problem_corpus_1.rename(columns={'First part Contradiction': 'problem',
                                                                    'similarity_value_1': 'similarity_value'})
                problem_corpus_2 = problem_corpus[['patent_number', 'Domain', 'Second part Contradiction', 'publication_date',
                                                   'publication_year', 'publication_month', 'label', 'similarity_value_2',
                                                   'target_problem']]
                problem_corpus_2 = problem_corpus_2.rename(columns={'Second part Contradiction': 'problem',
                                                                    'similarity_value_2': 'similarity_value'})
                problem_corpus_final = pd.concat([problem_corpus_1, problem_corpus_2], ignore_index=True, sort=False)
                print(problem_corpus_final)
                print(problem_corpus_final.columns)
                print(type(select_threshold))
                print(select_threshold)
                problem_corpus_final.to_csv('result_test.csv', index=False)
                print('=================')
                # keep the results whose similarity value meets the threshold
                # (compare as floats; the values were formatted as strings above)
                problem_corpus_final = problem_corpus_final[
                    problem_corpus_final['similarity_value'].astype(float) >= select_threshold]
                problem_corpus_final = problem_corpus_final[['patent_number', 'Domain', 'problem', 'similarity_value', 'target_problem']]
                # drop duplicate rows
                problem_corpus_final = problem_corpus_final.drop_duplicates(ignore_index=True)
                problem_corpus_final.to_csv('Word2vec/simialrity_result/test.csv', index=False)
                print(problem_corpus_final)
                st.success('Done!')
                st.write("Process is finished within %s seconds" % round(time.time() - start_time, 2))
                # show results
                st.dataframe(problem_corpus_final)
                csv_data = convert_df(problem_corpus_final)  # to download results
                st.download_button(
                    label="Download",
                    data=csv_data,
                    file_name='results.csv',
                    mime='text/csv',
                )
    # ==================
    else:  # select_model == 'SAM-IDM'
        if st.button('Run'):
            with st.spinner('Wait for it...'):
                start_time = time.time()
                ################################
                # SAM-IDM model (LSTM)
                ################################
                df = pd.read_csv('LSTM/sample_data.csv')
                print(df.head())
                sentences1 = list(df['sentences1'])
                sentences2 = list(df['sentences2'])
                tokenizer, embedding_matrix = word_embed_meta_data(sentences1 + sentences2, siamese_config['EMBEDDING_DIM'])
                model = load_model("LSTM/choosed_checkpoit/lstm_50_50_0.17_0.25.h5",
                                   custom_objects=None, compile=False)
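                # `create_test_data` (from LSTM.inputHandler) is assumed to tokenize
                # and pad each sentence pair to MAX_SEQUENCE_LENGTH and return the two
                # padded sequences plus a "leaks" feature vector; the Siamese LSTM then
                # outputs a similarity score in [0, 1] for each pair.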
                problem_corpus = pd.read_csv('data_problem_corpus/problem_corpus_full_cleaned.csv')
                target_problem = user_input_patent_sentence
                target_domain = user_input_domain
                # remove the same domain's problems
                problem_corpus = problem_corpus[problem_corpus.Domain != target_domain]
                # choose the month period
                problem_corpus = choosing_month_period(problem_corpus=problem_corpus, start_year=start_year,
                                                       end_year=end_year, start_month=start_month, end_month=end_month)
                problem_corpus.reset_index(drop=True, inplace=True)  # reset the index of the dataframe (must do this step)
                print(problem_corpus)
                print(problem_corpus.columns)
                print('=======')
                # read the specific column
                column1 = problem_corpus['First part Contradiction']
                print(type(column1))
                print(column1.head())
                print('++++++++++++++++')
                for i in range(0, len(problem_corpus)):
                    ss1 = column1[i]
                    ss2 = target_problem
                    test_sentence_pairs = [(ss1, ss2)]
                    test_data_x1, test_data_x2, leaks_test = create_test_data(tokenizer, test_sentence_pairs,
                                                                              siamese_config['MAX_SEQUENCE_LENGTH'])
                    pred = model.predict([test_data_x1, test_data_x2, leaks_test], batch_size=1000, verbose=2).ravel()
                    problem_corpus.loc[i, 'similarity_value_1'] = pred[0]  # one pair, so take the scalar
                # ==========
                column2 = problem_corpus['Second part Contradiction']
                for i in range(0, len(problem_corpus)):
                    ss1 = column2[i]
                    ss2 = target_problem
                    test_sentence_pairs = [(ss1, ss2)]
                    test_data_x1, test_data_x2, leaks_test = create_test_data(tokenizer, test_sentence_pairs,
                                                                              siamese_config['MAX_SEQUENCE_LENGTH'])
                    pred = model.predict([test_data_x1, test_data_x2, leaks_test], batch_size=1000, verbose=2).ravel()
                    problem_corpus.loc[i, 'similarity_value_2'] = pred[0]  # one pair, so take the scalar
                problem_corpus['target_problem'] = target_problem
                problem_corpus = problem_corpus.round({'similarity_value_1': 2, 'similarity_value_2': 2})  # round to 2 decimal places
                print(problem_corpus.head())
                print(problem_corpus.columns)
                problem_corpus_1 = problem_corpus[['patent_number', 'Domain', 'First part Contradiction', 'publication_date',
                                                   'publication_year', 'publication_month', 'label', 'similarity_value_1',
                                                   'target_problem']]
                problem_corpus_1 = problem_corpus_1.rename(columns={'First part Contradiction': 'problem',
                                                                    'similarity_value_1': 'similarity_value'})
                problem_corpus_2 = problem_corpus[['patent_number', 'Domain', 'Second part Contradiction', 'publication_date',
                                                   'publication_year', 'publication_month', 'label', 'similarity_value_2',
                                                   'target_problem']]
                problem_corpus_2 = problem_corpus_2.rename(columns={'Second part Contradiction': 'problem',
                                                                    'similarity_value_2': 'similarity_value'})
                problem_corpus_final = pd.concat([problem_corpus_1, problem_corpus_2], ignore_index=True, sort=False)
                print(problem_corpus_final)
                print(problem_corpus_final.columns)
                print(type(select_threshold))
                print(select_threshold)
                print('=================')
                # keep the results whose similarity value meets the threshold
                problem_corpus_final = problem_corpus_final[problem_corpus_final['similarity_value'] >= select_threshold]
                problem_corpus_final = problem_corpus_final[['patent_number', 'Domain', 'problem', 'similarity_value', 'target_problem']]
                # drop duplicate rows
                problem_corpus_final = problem_corpus_final.drop_duplicates(ignore_index=True)
                print(problem_corpus_final)
                st.success('Done!')
                st.write("Process is finished within %s seconds" % round(time.time() - start_time, 2))
                # show results
                st.dataframe(problem_corpus_final)
                csv_data = convert_df(problem_corpus_final)  # to download results
                st.download_button(
                    label="Download",
                    data=csv_data,
                    file_name='results.csv',
                    mime='text/csv',
                )
    # future work: add a function for users to provide their own dataset
#===================#
# Function 5
#===================#
elif add_selectbox == '5. Problem-solution matching':
    # st.title('PatentSolver_inventive solution matching')
    app_target = "To provide latent inventive solutions for the target problem"
    st.subheader(app_target)
    st.caption('Please use the similar problem results from Function 4. ')
    st.caption('The IDM-Matching model behind this function is based on XLNet neural networks.')
    uploaded_file = st.file_uploader("upload your similar problem file", type='csv')
    if uploaded_file is not None:
        # choose GPU
        select_GPU = st.selectbox('Do you have GPU(s)?',
                                  ['No', 'Yes'])
        st.caption('1. We don\'t provide GPUs because of the cost. \n 2. Please choose Yes when you run it on your own '
                   'GPU; it will greatly accelerate the process.')
        # use real booleans so the value can be passed straight to the model below
        if select_GPU == 'No':
            use_cuda = False
        else:
            use_cuda = True
        if st.button('Run'):
            with st.spinner('Wait for it...'):
                start_time = time.time()
                data = pd.read_csv(uploaded_file)
                data = creat_query_id(data)
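                # `creat_query_id` (sic, from functions) is assumed to attach a unique
                # question id to every similar-problem row so the QA predictions can be
                # merged back on 'id' below.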
                context_infor = pd.read_csv('data_problem_corpus/problem_corpus_full_cleaned.csv')
                context_infor = context_infor[['patent_number', 'Context']]
                # get context table
                final_context = pd.merge(data, context_infor, on=['patent_number'])
                final_context.to_csv('data_context/context_information.csv', index=False)
                print('++++++++++++')
                print(final_context.head())
                print(final_context.columns)
                csv_file = 'data_context/context_information.csv'
                json_file = 'data_context/context_information.json'
                csv_to_json(csv_file, json_file)  # convert context.csv to context.json
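                # `csv_to_json` is assumed to emit the SQuAD-style structure that
                # simpletransformers' QuestionAnsweringModel expects for prediction,
                # roughly:
                #   [{'context': <patent Context>,
                #     'qas': [{'id': <query id>, 'question': <target problem>}]}, ...]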
                prediction_file = 'data_context/context_information.json'
                prediction_output = 'data_context/QA_result.json'
                model = QuestionAnsweringModel('xlnet', 'trained_xlnet_model',
                                               use_cuda=use_cuda)  # honour the GPU choice made above
                QA_prediction(prediction_file, prediction_output, model)  # predict solutions with the QA system
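                # A minimal sketch of the assumed `QA_prediction` helper (model.predict
                # takes the SQuAD-style list and returns answer predictions; exact
                # return shape depends on the simpletransformers version):
                #
                #   import json
                #
                #   def QA_prediction(prediction_file, prediction_output, model):
                #       with open(prediction_file) as f:
                #           to_predict = json.load(f)
                #       answers = model.predict(to_predict)
                #       with open(prediction_output, 'w') as f:
                #           json.dump(answers, f)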
                input_file = 'data_context/QA_result.json'
                output_file = 'data_context/QA_result.csv'
                json_to_csv(input_file, output_file)
                similarity_result = pd.read_csv('data_context/context_information.csv')
                id_result = pd.read_csv('data_context/QA_result.csv')
                final_result = similarity_result.merge(id_result, on=['id'], how='left')
                print(final_result.head())
                final_result = final_result[
                    ['target_problem', 'problem', 'similarity_value', 'patent_number', 'Domain', 'answer']]
                final_result = final_result.rename(
                    columns={'problem': 'similar_problem', 'answer': 'latent_inventive_solutions'})
                final_result.to_csv('data_context/QA_result_final.csv', index=False)
                st.dataframe(final_result)
                csv_data = convert_df(final_result)  # to download solution results
                st.download_button(
                    label="Download",
                    data=csv_data,
                    file_name='results.csv',
                    mime='text/csv',
                )
                st.success('Done!')
                st.write("Process is finished within %s seconds" % round(time.time() - start_time, 2))
#===================#
# Function 6
#===================#
elif add_selectbox == '6. Inventive solutions ranking':
    # st.title('PatentSolver_rank latent inventive solutions')
    app_target = "To rank latent inventive solutions"
    st.subheader(app_target)
    st.caption('Please use the problem-solution results from Function 5. ')
    st.caption('The PatRIS model behind this function is based on TOPSIS, a multiple criteria decision analysis approach.')
    uploaded_file = st.file_uploader("upload your problem-solution file", type='csv')
    if uploaded_file is not None:
        if st.button('Run'):
            st.write('Weight assignments:')
            col1, col2, col3, col4, col5, col6 = st.columns(6)
            col1.metric('IN', '0.1')
            col2.metric('FCNF', '0.3')
            col3.metric('FCYF', '0.1')
            col4.metric('BCNF', '0.1')
            col5.metric('BCYF', '0.1')
            col6.metric('SV', '0.3')
            with st.expander('See explanation'):
                st.write('Inventive solutions ranking features: \n'
                         '- IN (inventor_name): the number of inventors involved in the patent.\n'
                         '- FCNF (forward_cite_no_family): forward citations that are not family-to-family cites.\n'
                         '- FCYF (forward_cite_yes_family): forward citations that are family-to-family cites.\n'
                         '- BCNF (backward_cite_no_family): backward citations that are not family-to-family cites.\n'
                         '- BCYF (backward_cite_yes_family): backward citations that are family-to-family cites.\n'
                         '- SV (similarity_value): similarity value between similar pairwise problems.\n')
            with st.spinner('Wait for it...'):
                start_time = time.time()
                df = pd.read_csv(uploaded_file)
                print(df.columns)
                patent_number = list(df['patent_number'])  # take the patent numbers
                print(patent_number)
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
                # ~~~ Parameters for data_patent_details file ~~~ #
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
                path_to_data = "MCDA/data/"  #### don't forget to change
                ## Create csv file to store the data_patent_details from the patent runs
                # (1) Specify column order of patents
                # (2) Create csv if it does not exist in the data_patent_details path
                data_column_order = ['inventor_name',
                                     'assignee_name_orig',
                                     'assignee_name_current',
                                     'pub_date',
                                     'priority_date',
                                     'grant_date',
                                     'filing_date',
                                     'forward_cite_no_family',
                                     'forward_cite_yes_family',
                                     'backward_cite_no_family',
                                     'backward_cite_yes_family',
                                     'patent',
                                     'url',
                                     'abstract_text']
                # delete the previous csv file (if any), then write a fresh one with the header row
                if 'edison_patents.csv' in os.listdir(path_to_data):
                    os.remove(path_to_data + 'edison_patents.csv')
                with open(path_to_data + 'edison_patents.csv', 'w', newline='') as file:
                    writer = csv.writer(file)
                    writer.writerow(data_column_order)
                ########### Run pool process #############
                if __name__ == "__main__":
                    ## Create a lock to prevent collisions when processes try to write to the same file
                    l = mp.Lock()
                    ## Use a pool of workers where the number of processes is equal to
                    ## the number of cpus - 1
                    with poolcontext(processes=mp.cpu_count() - 1, initializer=init, initargs=(l,)) as pool:
                        pool.map(partial(single_process_scraper, path_to_data_file=path_to_data + 'edison_patents.csv',
                                         data_column_order=data_column_order),
                                 patent_number)
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
                # ~~~       clean raw data_patent_details       ~~~ #
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
                ## read the Google Patents scraper's results
                table = pd.read_csv('MCDA/data/edison_patents.csv')
                # clean raw patent results
                results = clean_patent(table)
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
                # ~~~              count number                 ~~~ #
                # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
                results = count_patent(results)
                print(results.columns)
                results.to_csv('MCDA/data/cleaned_count_patents.csv', index=False)
                results_show = results[['patent_number', 'inventor_name', 'count_inventor_name',
                                        'assignee_name_orig', 'count_assignee_name', 'assignee_name_current',
                                        'count_assignee_name_current', 'forward_cite_no_family',
                                        'count_forward_cite_no_family', 'forward_cite_yes_family',
                                        'count_forward_cite_yes_family', 'backward_cite_no_family',
                                        'count_backward_cite_no_family', 'backward_cite_yes_family',
                                        'count_backward_cite_yes_family']]
                st.write('Related patent details:')
                st.dataframe(results_show)  # show patent count details
                print(len(df))
                print('==========')
                # remove null solutions
                solutions = df[df['latent_inventive_solutions'] != '[]']
                print(len(solutions))
                count = results_show[['patent_number', 'count_inventor_name', 'count_forward_cite_no_family',
                                      'count_forward_cite_yes_family', 'count_backward_cite_no_family',
                                      'count_backward_cite_yes_family']]
                count = pd.merge(count, solutions[['patent_number', 'similarity_value']], on='patent_number')
                st.write('Solutions ranking criteria:')
                st.dataframe(count)  # show ranking criteria details
                print('=======')
                print(count.columns)
                ## every criterion is a benefit criterion (bigger is better)
                criteria_data = Data(count.iloc[:, 1:7], [MAX, MAX, MAX, MAX, MAX, MAX],
                                     anames=count['patent_number'],
                                     cnames=count.columns[1:7],
                                     weights=[0.1, 0.3, 0.1, 0.1, 0.1, 0.3])  ## assign weights to attributes
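                # TOPSIS (legacy scikit-criteria <= 0.2 API used here) ranks each
                # alternative by its relative closeness to the ideal solution:
                #   closeness = d_anti_ideal / (d_ideal + d_anti_ideal)
                # where d_* are the distances to the (anti-)ideal alternative;
                # higher closeness means a better-ranked solution.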
                print(criteria_data)
                print('++++++++')
                print('==========')
                dm = closeness.TOPSIS(
                    mnorm="sum")  # normalize the alternative matrix by sum (divide every value by the sum of its criterion)
                dec = dm.decide(criteria_data)
                print(dec)
                print("Ideal:", dec.e_.ideal)
                print("Anti-Ideal:", dec.e_.anti_ideal)
                print("Closeness:", dec.e_.closeness)  ## print each alternative's closeness value
                count['rank_topsis'] = dec.e_.closeness
                count = count.sort_values(by='rank_topsis', ascending=False)
                print(count.columns)
                print(count)
                print(len(count))
                rank = list(range(1, len(count) + 1))
                print(rank)
                count['rank'] = rank
                print(count)
                print(count.columns)
                count = count[['rank', 'patent_number', 'count_inventor_name', 'count_forward_cite_no_family',
                               'count_forward_cite_yes_family', 'count_backward_cite_no_family',
                               'count_backward_cite_yes_family', 'similarity_value']]
                final = pd.merge(count, df, on=['patent_number', 'similarity_value'])
                final = final[
                    ['target_problem', 'latent_inventive_solutions', 'rank', 'similar_problem', 'similarity_value',
                     'Domain', 'patent_number', 'count_inventor_name',
                     'count_forward_cite_no_family', 'count_forward_cite_yes_family',
                     'count_backward_cite_no_family', 'count_backward_cite_yes_family']]
                print('+++++')
                print(final.columns)
                st.write('Inventive solutions ranking results according to TOPSIS:')
                st.dataframe(final)
                st.success('Done!')
                st.write("Process is finished within %s seconds" % round(time.time() - start_time, 2))
                csv_data = convert_df(final)  # to download solution results
                st.download_button(
                    label="Download",
                    data=csv_data,
                    file_name='results.csv',
                    mime='text/csv',
                )