# PatentSolver / functions.py
# ~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~ Import libraries ~~~ #
# ~~~~~~~~~~~~~~~~~~~~~~~~ #
# Google Scraper Class #
from google_patent_scraper import scraper_class
# Context Manager #
from contextlib import contextmanager
# Writing/Reading
import csv
import numpy as np
import pandas as pd
# clean patent #
import re
# Multiprocessing #
import multiprocessing as mp
# parse xml to text
from bs4 import BeautifulSoup as bs
# zip folder to download
import shutil
import base64
import streamlit as st
import os
# extract problems
from App.bin import constants
from App.bin.InputHandler import InputHandler
from App.bin.PatentHandler import PatentHandler
from App.bin.CorpusProcessor import CorpusProcessor
import json
from pandas import json_normalize
import glob
# ~~~~~~~~~~~~~~~~~~~ #
# ~~~~ Functions ~~~~ #
# ~~~~~~~~~~~~~~~~~~~ #
def single_process_scraper(patent, path_to_data_file, data_column_order):
    """Scrapes a single Google patent using the google scraper class.
    The function does not return a value; instead it appends the scraped
    patent details as one row of the csv file given by path_to_data_file.
    Inputs:
        patent (str)           : patent number including the country prefix
        path_to_data_file (str): absolute path of the csv file the patent
                                 details are appended to
        data_column_order      : column names, in the order they are saved
                                 in the csv file
    Note: to prevent write collisions the function acquires the global
    `lock` created by init(); initialize worker processes with init().
    """
    # ~ Initialize scraper class ~ #
    scraper = scraper_class()
    # ~ Scrape single patent ~ #
    err, soup, url = scraper.request_single_patent(patent)
    # If the scrape succeeded -> parse the page and append it to the csv file
    # Else -> print an error statement
    if err == 'Success':
        patent_parsed = scraper.get_scraped_data(soup, url, patent)
        # Save the parsed patent details to the csv file, holding the
        # multiprocessing lock to prevent collisions between workers
        with lock:
            with open(path_to_data_file, 'a', newline='') as ofile:
                writer = csv.DictWriter(ofile, fieldnames=data_column_order)
                writer.writerow(patent_parsed)
    else:
        print('Patent {0} has error code {1}'.format(patent, err))
# Context manager around mp.Pool so it accepts keyword arguments
# and is always terminated when the block exits
@contextmanager
def poolcontext(*args, **kwargs):
    pool = mp.Pool(*args, **kwargs)
    yield pool
    pool.terminate()
def init(l):
"""Creates lock object that is global, for use in sharing
across processes
"""
global lock
lock = l
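# A minimal usage sketch tying the three helpers above together. The patent
# numbers, csv path, and column list below are hypothetical placeholders:
#
#   from functools import partial
#   columns = ['url', 'patent', 'inventor_name']  # assumed column order
#   with open('details.csv', 'w', newline='') as ofile:
#       csv.DictWriter(ofile, fieldnames=columns).writeheader()
#   with poolcontext(processes=4, initializer=init, initargs=(mp.Lock(),)) as pool:
#       pool.map(partial(single_process_scraper,
#                        path_to_data_file='details.csv',
#                        data_column_order=columns),
#                ['US7654321B2', 'US8765432B2'])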
def patentinput(patent_string):
    """Split the user's comma-separated patent numbers into a list,
    removing any spaces the user typed.
    """
    patent_string = patent_string.replace(" ", "")  # remove spaces the user typed
    return patent_string.split(",")
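# e.g. patentinput("US7654321B2, US8765432B2") -> ['US7654321B2', 'US8765432B2']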
def clean_patent(table):
    """Clean the raw patent details scraped from the website."""
    list_inventor_name = np.array([])  # empty array to collect cleaned values
    inventor_name = table['inventor_name']
for line in inventor_name:
new_line = re.sub(r'"inventor_name":', '', line)
new_line = re.sub(r'\{|\}|\[|\]|\"', '', new_line)
# print(new_line)
list_inventor_name = np.append(list_inventor_name, new_line)
new_table_inventor_name = pd.DataFrame(list_inventor_name, columns=['inventor_name'])
# new_table.to_csv('saved_data/cleaned_patent_details')
##clean assignee_name_orig feature
list_assignee_name = np.array([])
assignee_name = table['assignee_name_orig']
for line in assignee_name:
        new_line = re.sub(r'"assignee_name":', '', line)  # the JSON key is "assignee_name" even for this column
new_line = re.sub(r'\{|\}|\[|\]|\"', '', new_line)
list_assignee_name = np.append(list_assignee_name, new_line)
new_table_assignee_name = pd.DataFrame(list_assignee_name, columns=['assignee_name_orig'])
# print(new_table_assignee_name)
#
##clean assignee_name_current feature
list_assignee_name_current = np.array([])
assignee_name_current = table['assignee_name_current']
for line in assignee_name_current:
new_line = re.sub(r'("assignee_name":)|(\\n\s\s)|(\{|\}|\[|\]|\")', '', line)
list_assignee_name_current = np.append(list_assignee_name_current, new_line)
new_table_assignee_name_current = pd.DataFrame(list_assignee_name_current, columns=['assignee_name_current'])
# print(new_table_assignee_name_current)
#
##clean forward_cite_no_family feature
list_forward_cite_no_family = np.array([])
forward_cite_no_family = table['forward_cite_no_family']
for line in forward_cite_no_family:
new_line = re.sub(
r'("patent_number":)|(\\n)|(\{|\}|\[|\]|\")|(priority_date)|(:)|(pub_date)|(\d{4}-\d{2}-\d{2})', '', line)
new_line = re.sub(r'\s\,\s', '', new_line)
list_forward_cite_no_family = np.append(list_forward_cite_no_family, new_line)
new_table_forward_cite_no_family = pd.DataFrame(list_forward_cite_no_family, columns=['forward_cite_no_family'])
# print(new_table_forward_cite_no_family)
#
##clean forward_cite_yes_family feature
list_forward_cite_yes_family = np.array([])
forward_cite_yes_family = table['forward_cite_yes_family']
for line in forward_cite_yes_family:
new_line = re.sub(
r'("patent_number":)|(\\n)|(\{|\}|\[|\]|\")|(priority_date)|(:)|(pub_date)|(\d{4}-\d{2}-\d{2})', '', line)
new_line = re.sub(r'\s\,\s', '', new_line)
list_forward_cite_yes_family = np.append(list_forward_cite_yes_family, new_line)
new_table_forward_cite_yes_family = pd.DataFrame(list_forward_cite_yes_family, columns=['forward_cite_yes_family'])
# print(new_table_forward_cite_yes_family)
##clean backward_cite_no_family feature
list_backward_cite_no_family = np.array([])
backward_cite_no_family = table['backward_cite_no_family']
for line in backward_cite_no_family:
new_line = re.sub(
r'("patent_number":)|(\\n)|(\{|\}|\[|\]|\")|(priority_date)|(:)|(pub_date)|(\d{4}-\d{2}-\d{2})', '', line)
new_line = re.sub(r'\s\,\s', '', new_line)
list_backward_cite_no_family = np.append(list_backward_cite_no_family, new_line)
new_table_backward_cite_no_family = pd.DataFrame(list_backward_cite_no_family, columns=['backward_cite_no_family'])
# print(new_table_backward_cite_no_family)
##clean backward_cite_yes_family feature
list_backward_cite_yes_family = np.array([])
backward_cite_yes_family = table['backward_cite_yes_family']
for line in backward_cite_yes_family:
new_line = re.sub(
r'("patent_number":)|(\\n)|(\{|\}|\[|\]|\")|(priority_date)|(:)|(pub_date)|(\d{4}-\d{2}-\d{2})', '', line)
new_line = re.sub(r'\s\,\s', '', new_line)
list_backward_cite_yes_family = np.append(list_backward_cite_yes_family, new_line)
new_table_backward_cite_yes_family = pd.DataFrame(list_backward_cite_yes_family,
columns=['backward_cite_yes_family'])
# print(new_table_backward_cite_yes_family)
##rename url feature
list_patent_number = np.array([])
patent_number = table['url']
for line in patent_number:
list_patent_number = np.append(list_patent_number, line)
new_table_patent_number = pd.DataFrame(list_patent_number, columns=['patent_number'])
# print(new_table_patent_number)
##rename patent feature
list_patent_link = np.array([])
patent_link = table['patent']
for line in patent_link:
list_patent_link = np.append(list_patent_link, line)
new_table_patent_link = pd.DataFrame(list_patent_link, columns=['patent_link'])
# print(new_table_patent_link)
    ##rename abstract_text feature
    list_abstract_text = np.array([])
    abstract_text = table['abstract_text']
    for line in abstract_text:
        list_abstract_text = np.append(list_abstract_text, line)
    new_table_abstract_text = pd.DataFrame(list_abstract_text, columns=['abstract_text'])
    # print(new_table_abstract_text)
###################################
## concatenate all of sub dataframes to the final results
results = pd.concat([new_table_patent_number, table[['pub_date', 'priority_date', 'grant_date', 'filing_date']],
new_table_inventor_name, new_table_assignee_name, new_table_assignee_name_current,
new_table_forward_cite_no_family, new_table_forward_cite_yes_family,
new_table_backward_cite_yes_family, new_table_backward_cite_no_family, new_table_patent_link,
new_table_abstract_text], axis=1)
return results
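# A minimal sketch of how clean_patent is typically fed (the csv name is
# assumed; the file is written beforehand by single_process_scraper):
#
#   raw = pd.read_csv('details.csv')
#   cleaned = clean_patent(raw)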
def count_patent(patent_table):
    """Count the entries of the multi-valued patent features."""
    ##count the number of assignee_name feature
assignee_name = pd.DataFrame(patent_table['assignee_name_orig'])
count_assignee_name = assignee_name.applymap(lambda x: str.count(x, ',') + 1)
count_assignee_name = count_assignee_name.rename(columns={'assignee_name_orig': 'count_assignee_name'})
# print(count_assignee_name)
##count the number of inventor_name feature
inventor_name = pd.DataFrame(patent_table['inventor_name'])
count_inventor_name = inventor_name.applymap(lambda x: str.count(x, ',') + 1)
count_inventor_name = count_inventor_name.rename(columns={'inventor_name': 'count_inventor_name'})
# print(count_inventor_name)
##count the number of assignee_name_current feature
assignee_name_current = pd.DataFrame(patent_table['assignee_name_current'])
# print(assignee_name_current)
    ##replace NaN with 0, otherwise count the comma-separated entries
    assignee_name_current_replace_NaN = lambda x: 0 if pd.isnull(x) else str.count(x, ',') + 1
count_assignee_name_current = assignee_name_current.applymap(assignee_name_current_replace_NaN)
count_assignee_name_current = count_assignee_name_current.rename(
columns={'assignee_name_current': 'count_assignee_name_current'})
# print(count_assignee_name_current)
##count forward_cite_no_family
forward_cite_no_family = pd.DataFrame(patent_table['forward_cite_no_family'])
forward_cite_no_family_replace_NaN = lambda x: int(0) if pd.isnull(x) else str.count(x, ',')
count_forward_cite_no_family = forward_cite_no_family.applymap(forward_cite_no_family_replace_NaN)
count_forward_cite_no_family = count_forward_cite_no_family.rename(
columns={'forward_cite_no_family': 'count_forward_cite_no_family'})
# print(count_forward_cite_no_family)
##count forward_cite_yes_family
forward_cite_yes_family = pd.DataFrame(patent_table['forward_cite_yes_family'])
forward_cite_yes_family_replace_NaN = lambda x: int(0) if pd.isnull(x) else str.count(x, ',')
count_forward_cite_yes_family = forward_cite_yes_family.applymap(forward_cite_yes_family_replace_NaN)
count_forward_cite_yes_family = count_forward_cite_yes_family.rename(
columns={'forward_cite_yes_family': 'count_forward_cite_yes_family'})
# print(count_forward_cite_yes_family)
##count backward_cite_no_family
backward_cite_no_family = pd.DataFrame(patent_table['backward_cite_no_family'])
backward_cite_no_family_replace_NaN = lambda x: int(0) if pd.isnull(x) else str.count(x, ',')
count_backward_cite_no_family = backward_cite_no_family.applymap(backward_cite_no_family_replace_NaN)
count_backward_cite_no_family = count_backward_cite_no_family.rename(
columns={'backward_cite_no_family': 'count_backward_cite_no_family'})
# print(count_backward_cite_no_family)
##count backward_cite_yes_family
backward_cite_yes_family = pd.DataFrame(patent_table['backward_cite_yes_family'])
backward_cite_yes_family_replace_NaN = lambda x: int(0) if pd.isnull(x) else str.count(x, ',')
count_backward_cite_yes_family = backward_cite_yes_family.applymap(backward_cite_yes_family_replace_NaN)
count_backward_cite_yes_family = count_backward_cite_yes_family.rename(
columns={'backward_cite_yes_family': 'count_backward_cite_yes_family'})
# print(count_backward_cite_yes_family)
    ##concatenate the dataframes into the final cleaned dataset
results = pd.concat([patent_table[['patent_number', 'pub_date', 'priority_date',
'grant_date', 'filing_date', 'inventor_name']], count_inventor_name,
patent_table[['assignee_name_orig']], count_assignee_name,
patent_table[['assignee_name_current']], count_assignee_name_current,
patent_table[['forward_cite_no_family']], count_forward_cite_no_family,
patent_table[['forward_cite_yes_family']], count_forward_cite_yes_family,
patent_table[['backward_cite_no_family']], count_backward_cite_no_family,
patent_table[['backward_cite_yes_family']], count_backward_cite_yes_family,
patent_table[['patent_link', 'abstract_text']]], axis=1)
return results
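# Sketch of the clean -> count pipeline (the csv paths are placeholders):
#
#   counted = count_patent(clean_patent(pd.read_csv('details.csv')))
#   counted.to_csv('counted_patent_details.csv', index=False)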
def XMLtoTEXT(patent_xml, saved_file_path):
    # parse the xml document
    tree = bs(patent_xml, "html.parser")
# get title
print('Title:')
title = tree.find_all("invention-title")
patent_title = title[0].text
print(patent_title)
# get number
print("Patent number:")
patent_number = tree.find_all('doc-number')
patent_number = 'US' + patent_number[0].text
patent_number_new = re.sub(r'US0', 'US', patent_number)
print(patent_number_new)
# get domain
print('Domain:')
domain = tree.find_all('classification-level')
patent_domain = domain[0].text
print(patent_domain)
# get date of publication
print("Publication date:")
date = tree.find_all("date")
patent_pubdate = date[0].text
print(patent_pubdate)
# get abstract
print('Abstract:')
ab = tree.find_all("abstract")
patent_abstract = ab[0].text
print(patent_abstract)
# get claim
print('Claims:')
claims = tree.find_all("claim-text")
for claim in claims:
print(claim.text)
# get description
print('Description:')
description = tree.find_all('description')
for des in description:
print(des.text)
# save file to the place
with open(saved_file_path + patent_number_new + '.txt', 'w') as text_file:
text_file.write("Patent title" + '\n' + patent_title +
'\n' * 2 + "Patent number" + '\n' +
patent_number_new + '\n' * 2 + "Domain" + '\n' + patent_domain + '\n' * 2 + "Publication date" + '\n' + patent_pubdate
+ '\n' * 2 + "Abstract" + '\n' + patent_abstract
                        + '\n' * 2 + 'Claims' + '\n')  # save patent title, number, domain, publication date, abstract
for claim in claims:
text_file.write(claim.text + '\n')
text_file.write('\n' + 'Description' + '\n')
for des in description:
text_file.write('\n' + des.text + '\n')
return text_file
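# Example (file name assumed): convert one downloaded XML document into the
# plain-text layout that extract_info_text() later parses:
#
#   with open('US07654321.xml') as xml_file:
#       XMLtoTEXT(xml_file.read(), 'Data/input/US_patents/')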
# to download patents (.txt) as a zip file
def create_download_zip(zip_directory, zip_path, filename):
    """
    zip_directory (str): path to the directory you want to zip
    zip_path (str): where to save the zip file
    filename (str): base name of the created archive
    """
    shutil.make_archive(zip_path + filename, 'zip', zip_directory)
    with open(zip_path + filename + '.zip', 'rb') as f:
        st.download_button(
            label='Download',
            data=f,
            file_name='patent.zip',
            mime='application/zip'
        )
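# Example (paths are placeholders): zip the generated patent texts and offer
# them for download as Data/patents_txt.zip:
#
#   create_download_zip('Data/input/US_patents/', 'Data/', 'patents_txt')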
# save input files (txt) into the folder
def save_uploadedfile(uploadedfile):
    with open(os.path.join('Data/input/US_patents/', uploadedfile.name), 'wb') as f:
        f.write(uploadedfile.getbuffer())
# return st.success('Saved File:{}'.format(uploadedfile.name))
# to extract problems from patents
def extractor(folder):
    input_folder = constants.DATA_INPUT + folder
    files_extension = "*.txt"
iInput = InputHandler(input_folder, files_extension)
input_data = iInput.get_input()
pretreat_data = PatentHandler(input_data)
clean_patent_data = pretreat_data.pretreat_data()
process_data = CorpusProcessor(clean_patent_data, input_folder, files_extension)
processed_data = process_data.process_corpus()
# convert json to dataframe
with open('Data/graphs/US_patents/graph.json') as json_data:
data = json.load(json_data)
concept_df = json_normalize(data['problem_graph'], sep="_")
concept_df = concept_df[['concept_sentence', 'concept_source', 'concept_type']]
problem_df = concept_df.rename(columns={"concept_sentence": "problem", 'concept_source': 'patent_number',
'concept_type': 'type'})
# choose problems
problem_new = problem_df.loc[problem_df['type'] == 'problem']
print(problem_new)
    new_table_test = problem_new['patent_number'].apply(
        lambda x: re.search(r'(?<=US_patents\/).*?(?=\.txt)', x).group())
# assign patent number to the corresponding feature
problem_results = problem_new.assign(patent_number=new_table_test)
print(problem_results[['problem', 'patent_number']])
problem_results = problem_results[['patent_number', 'problem']]
    problem_results.to_csv('data_problem/problem.csv', index=False)
@st.cache
def convert_df(df):
# IMPORTANT: Cache the conversion to prevent computation on every rerun
return df.to_csv().encode('utf-8')
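# Typical pairing with a Streamlit download button (df is any DataFrame):
#
#   st.download_button('Download CSV', convert_df(df),
#                      file_name='problems.csv', mime='text/csv')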
def extract_info_text():
    new = pd.DataFrame(columns=['title', 'patent_number', 'domain', 'publication_date'])
    # use glob to get all the txt files in the folder
    path = 'Data/input/US_patents'
    txt_files = glob.glob(os.path.join(path, "*.txt"))
    for f in txt_files:
        df = pd.read_csv(f, sep='\n', header=None, names=['content'])
        print(df)
        # extract patent information from the fixed text layout written by
        # XMLtoTEXT (blank lines are skipped, so each value follows its header row)
        row = pd.DataFrame([{'patent_number': df.iloc[3, 0], 'title': df.iloc[1, 0],
                             'domain': df.iloc[5, 0], 'publication_date': df.iloc[7, 0]}])
        new = pd.concat([new, row], ignore_index=True)  # DataFrame.append is deprecated
print(new)
problem = pd.read_csv('data_problem/problem.csv')
final = pd.merge(problem, new, on='patent_number', how='left')
return final
def input_domain(user_input_domain):
if user_input_domain == 'A (Human necessities)':
domain = 'A'
elif user_input_domain == 'B (Performing operations; transporting)':
domain = 'B'
elif user_input_domain == 'C (Chemistry; metallurgy)':
domain = 'C'
elif user_input_domain == 'D (Textiles; paper)':
domain = 'D'
elif user_input_domain == 'E (Fixed constructions)':
domain = 'E'
    elif user_input_domain == 'F (Mechanical engineering; lighting; heating; weapons; blasting engines or pumps)':
        domain = 'F'
elif user_input_domain == 'G (Physics)':
domain = 'G'
elif user_input_domain == 'H (Electricity)':
domain = 'H'
return domain
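# Note: every option string starts with its CPC letter, so an equivalent
# (assumed) shortcut would be `domain = user_input_domain[0]`.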
# filter the problem corpus to the month period the user chose
def choosing_month_period(problem_corpus, start_year, end_year, start_month, end_month):
    problem_corpus = problem_corpus[problem_corpus['publication_year'].between(start_year, end_year)].copy()
if start_year != end_year: # 2014- 2015 #2014- 2016
if start_month == end_month: # /01/ /01/
if end_year == start_year + 1: # 2014/03/01 - 2015/03/01 #2014/01/01 - 2015/01/23 #2014/12/01 - 2015/12/23
problem_corpus.loc[(problem_corpus['publication_year'] == start_year) & (
problem_corpus['publication_month'].between(start_month, 12)), 'label'] = 'true'
problem_corpus.loc[(problem_corpus['publication_year'] == end_year) & (
problem_corpus['publication_month'].between(1, end_month)), 'label'] = 'true'
elif end_year > start_year + 1: # 2014/01/01 - 2016/01/23 #2014/12/01 - 2016/12/23 # 2014/03/01 - 2016/03/01
if start_month == 1: # 2014/01/01 - 2016/01/23
problem_corpus.loc[(
problem_corpus['publication_year'] == end_year) & (
problem_corpus['publication_month'].between(
end_month + 1, 12)), 'label'] = 'false'
problem_corpus.loc[(problem_corpus.label != 'false'), 'label'] = 'true'
elif start_month == 12: # 2014/12/01 - 2016/12/23
problem_corpus.loc[(
problem_corpus['publication_year'] == start_year) & (
problem_corpus['publication_month'].between(
1, start_month - 1)), 'label'] = 'false'
problem_corpus.loc[(problem_corpus.label != 'false'), 'label'] = 'true'
else: # 2014/03/01 - 2016/03/01
problem_corpus.loc[(
problem_corpus['publication_year'] == start_year) & (
problem_corpus['publication_month'].between(
1, start_month - 1)), 'label'] = 'false'
problem_corpus.loc[(
problem_corpus['publication_year'] == end_year) & (
problem_corpus['publication_month'].between(
end_month + 1, 12)), 'label'] = 'false'
problem_corpus.loc[(problem_corpus.label != 'false'), 'label'] = 'true'
if start_month > end_month: # /03/ /01/
if end_year == start_year + 1: # 2014/12/01 - 2015/03/01 #2014/02/01 - 2015/01/23
problem_corpus.loc[(problem_corpus['publication_year'] == start_year) & (
problem_corpus['publication_month'].between(start_month, 12)), 'label'] = 'true'
problem_corpus.loc[(problem_corpus['publication_year'] == end_year) & (
problem_corpus['publication_month'].between(1, end_month)), 'label'] = 'true'
elif end_year > start_year + 1: # 2014/12/01 - 2016/03/01 #2014/02/01 - 2016/01/23
problem_corpus.loc[(
problem_corpus['publication_year'] == start_year) & (
problem_corpus['publication_month'].between(
1, start_month - 1)), 'label'] = 'false'
problem_corpus.loc[(
problem_corpus['publication_year'] == end_year) & (
problem_corpus['publication_month'].between(
end_month + 1, 12)), 'label'] = 'false'
problem_corpus.loc[(problem_corpus.label != 'false'), 'label'] = 'true'
if start_month < end_month: # /01/ /03/
if end_year == start_year + 1: # 2014/01/01 - 2015/12/01 #2014/02/01 - 2015/11/23
problem_corpus.loc[(problem_corpus['publication_year'] == start_year) & (
problem_corpus['publication_month'].between(start_month, 12)), 'label'] = 'true'
problem_corpus.loc[(problem_corpus['publication_year'] == end_year) & (
problem_corpus['publication_month'].between(1, end_month)), 'label'] = 'true'
elif end_year > start_year + 1: # 2014/01/01 - 2016/12/01 #2014/02/01 - 2016/11/23
                if start_month == 1 and end_month == 12:  # 2014/01/01 - 2016/12/01
problem_corpus['label'] = 'true'
elif start_month == 1: # 2014/01/01 - 2016/03/01 #2014/01/01 - 2016/11/01
problem_corpus.loc[(problem_corpus['publication_year'] == end_year) & (problem_corpus[
'publication_month'].between(
end_month + 1, 12)), 'label'] = 'false'
problem_corpus.loc[(problem_corpus.label != 'false'), 'label'] = 'true'
elif end_month == 12: # 2014/02/01 - 2016/12/01 #2015/02/01 - 2016/12/01
problem_corpus.loc[(problem_corpus['publication_year'] == start_year) & (problem_corpus[
'publication_month'].between(
1, start_month - 1)), 'label'] = 'false'
problem_corpus.loc[(problem_corpus.label != 'false'), 'label'] = 'true'
else: # 2014/02/01 - 2016/11/23
problem_corpus.loc[(problem_corpus['publication_year'] == start_year) & (problem_corpus[
'publication_month'].between(
1, start_month - 1)), 'label'] = 'false'
problem_corpus.loc[(problem_corpus['publication_year'] == end_year) & (problem_corpus[
'publication_month'].between(
end_month + 1, 12)), 'label'] = 'false'
problem_corpus.loc[(problem_corpus.label != 'false'), 'label'] = 'true'
    else:  # start_year == end_year, e.g. 2012-2012
        problem_corpus = problem_corpus[problem_corpus['publication_year'] == start_year].copy()
if start_month != end_month: # 2014/03/01 - 2014/05/01 2014/01/01 - 2014/05/01 2014/03/01 - 2014/12/01
problem_corpus.loc[problem_corpus['publication_month'].between(start_month, end_month), 'label'] = 'true'
else: # 2014/03/01 - 2014/03/20 #2014/01/01 - 2014/01/20
problem_corpus.loc[problem_corpus['publication_month'] == start_month, 'label'] = 'true'
problem_corpus = problem_corpus.loc[problem_corpus['label'] == 'true']
    problem_corpus = problem_corpus[['patent_number', 'Domain', 'First part Contradiction',
                                     'Second part Contradiction', 'publication_date', 'publication_year',
                                     'publication_month', 'label']]
return problem_corpus
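# Example: keep problems published between 2014/03 and 2016/03 (the
# problem_corpus DataFrame and its columns are assumed to exist as above):
#
#   filtered = choosing_month_period(problem_corpus, 2014, 2016, 3, 3)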
# for IDM-Similar model (word2vec)
def avg_feature_vector(sentence, model, num_features, index2word_set):
words = sentence.split()
feature_vec = np.zeros((num_features, ), dtype='float32')
n_words = 0
for word in words:
if word in index2word_set:
n_words += 1
feature_vec = np.add(feature_vec, model[word])
    if n_words > 0:
feature_vec = np.divide(feature_vec, n_words)
return feature_vec
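# A minimal similarity sketch on top of avg_feature_vector (a gensim
# KeyedVectors object is assumed; the sentences and 300 dims are placeholders):
#
#   from scipy.spatial.distance import cosine
#   index2word_set = set(model.index_to_key)  # model.index2word in gensim < 4
#   v1 = avg_feature_vector('engine cooling problem', model, 300, index2word_set)
#   v2 = avg_feature_vector('overheating of the motor', model, 300, index2word_set)
#   similarity = 1 - cosine(v1, v2)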
def creat_query_id(dataset):
    # create a query for each extracted problem
    question = []
    for each in dataset['problem']:
        new = "What is the solution for the problem that " + each + "?"
        question.append(new)
    dataset['question'] = question
    # create an id: the csv's unnamed index column becomes the id
    data = dataset.rename(columns={'Unnamed: 0': 'id'})
    return data
def csv_to_json(csv_file, json_file):
    results = []
    with open(csv_file) as ifile:
        csvReader = csv.DictReader(ifile)
        for row in csvReader:
            results.append({
                'context': row['Context'],
                'qas': [{'id': row['id'], 'question': row['question']}]
            })
    # write the data to a json file
    with open(json_file, 'w') as jsonFile:
        jsonFile.write(json.dumps(results, indent=4))
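# The resulting json has the SQuAD-like shape that QA_prediction below feeds
# to the model, e.g.:
#   [{"context": "...", "qas": [{"id": "0", "question": "..."}]}]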
def QA_prediction(prediction_file, prediction_output, model):
    with open(prediction_file, 'r') as pre_file:
        temp = json.load(pre_file)
    predictions = model.predict(temp)
    with open(prediction_output, 'w') as json_file:
        json_file.write(json.dumps(predictions, indent=4))
    print(predictions)
def json_to_csv(input_file, output_file):
result = pd.read_json(input_file)
print(result.head())
    result_answer = result.iloc[0]  # the first row holds the answer records
print(result_answer.head())
print(len(result_answer))
df = pd.DataFrame(index=np.arange(len(result_answer)), columns=['id', 'answer'])
print(df)
for i in range(len(result_answer)):
line = result_answer[i]
print(line)
df.iloc[i, 0] = line['id']
df.iloc[i, 1] = line['answer']
print(df.head())
df.to_csv(output_file, index=False)
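# End-to-end QA sketch (the file names and the loaded `model` are assumed):
#
#   csv_to_json('data_problem/qa_input.csv', 'data_problem/qa_input.json')
#   QA_prediction('data_problem/qa_input.json', 'data_problem/answers.json', model)
#   json_to_csv('data_problem/answers.json', 'data_problem/answers.csv')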