PatentSolver

Build error

File size: 29,443 Bytes

c13b805


# ~~~~~~~~~~~~~~~~~~~~~~~~ #
# ~~~ Import libraries ~~~ #
# ~~~~~~~~~~~~~~~~~~~~~~~~ #

# Google Scraper Class #
from google_patent_scraper import scraper_class

# Context Manager #
from contextlib import contextmanager

# Writing/Reading
import csv
import numpy as np
import pandas as pd

# clean patent #
import re

# Multiprocessing #
import multiprocessing as mp

# parse xml to text
from bs4 import BeautifulSoup as bs

# zip folder to download
import shutil
import base64
import streamlit as st
import os

# extract problems
from App.bin import constants
from App.bin.InputHandler import InputHandler
from App.bin.PatentHandler import PatentHandler
from App.bin.CorpusProcessor import CorpusProcessor
import json
from pandas import json_normalize
import glob



# ~~~~~~~~~~~~~~~~~~~ #
# ~~~~ Functions ~~~~ #
# ~~~~~~~~~~~~~~~~~~~ #

def single_process_scraper(patent,path_to_data_file,data_column_order):
    """Scrape one Google patent and append its parsed record to a csv file.

    Nothing is returned: when the request succeeds one row is appended to
    ``path_to_data_file``; otherwise the error code is printed.

    The append is guarded by the module-level ``lock`` object. It is NOT a
    parameter of this function: it has to be installed beforehand (see
    ``init``), otherwise writing the row fails with a NameError.

    Inputs:
      patent (str) : patent number including country prefix
      path_to_data_file : path of the csv file the row is appended to
      data_column_order : csv column names, in the order they are written
    """
    google_scraper = scraper_class()

    # The scraper reports its outcome as a status string plus the page data.
    status, soup, url = google_scraper.request_single_patent(patent)

    if status != 'Success':
        print('Patent {0} has error code {1}'.format(patent,status))
        return

    parsed_row = google_scraper.get_scraped_data(soup,url,patent)

    # Hold the shared lock for the whole open/append/close cycle so rows
    # written by concurrent workers cannot interleave.
    with lock, open(path_to_data_file,'a',newline='') as csv_handle:
        csv.DictWriter(csv_handle, fieldnames=data_column_order).writerow(parsed_row)

# Allow pool to accept keyword arguments
@contextmanager
def poolcontext(*args, **kwargs):
    """Context manager that yields an ``mp.Pool`` and always terminates it.

    All positional and keyword arguments are forwarded unchanged to
    ``mp.Pool``. The pool is terminated when the ``with`` block exits,
    including when the block raises, so worker processes are not left
    behind after an error.
    """
    pool = mp.Pool(*args, **kwargs)
    try:
        yield pool
    finally:
        # Runs on normal exit and on exceptions raised inside the with-body.
        pool.terminate()

def init(l):
    """Store ``l`` in the module-level name ``lock``.

    ``single_process_scraper`` reads that global when it writes a row, so
    this has to run in every process that scrapes (presumably as the pool
    initializer -- confirm against the caller).
    """
    # Rebinding a module global from inside a function needs the declaration.
    global lock
    lock = l


def patentinput(patent_string):
    """Turn the user's comma-separated patent numbers into a list.

    Every space character the user typed is dropped first, then the text is
    split on commas. Empty items are kept, so an empty input yields ``['']``.
    """
    without_spaces = "".join(patent_string.split(" "))
    return without_spaces.split(",")

# Characters left over from the JSON-like encoding of the scraped fields.
_JSON_PUNCTUATION = r'\{|\}|\[|\]|\"'
# Everything removed from a citation field except the patent numbers/commas.
_CITATION_NOISE = (r'("patent_number":)|(\\n)|(\{|\}|\[|\]|\")|(priority_date)|(:)|(pub_date)'
                   r'|(\d{4}-\d{2}-\d{2})')


def _strip_patterns(values, patterns):
    """Return ``values`` as a list with every regex in ``patterns`` removed, in order."""
    cleaned = []
    for text in values:
        for pattern in patterns:
            text = re.sub(pattern, '', text)
        cleaned.append(text)
    return cleaned


def clean_patent(table):
    """Clean the raw patent details scraped from the website.

    The JSON-like text columns are reduced to plain comma-separated text by
    deleting keys, brackets, braces and quotes (and, for the four citation
    columns, the date keys, colons and ``YYYY-MM-DD`` dates). ``url`` is
    returned as ``patent_number`` and ``patent`` as ``patent_link``; the date
    columns and ``abstract_text`` are passed through unchanged.

    Rows are matched by position: the result always has a fresh 0..n-1
    index, whatever index ``table`` carries, so a table with a non-default
    index no longer yields misaligned, NaN-padded rows.

    Parameters:
      table (pd.DataFrame) : must contain the columns ``url``, ``patent``,
          ``pub_date``, ``priority_date``, ``grant_date``, ``filing_date``,
          ``inventor_name``, ``assignee_name_orig``,
          ``assignee_name_current``, ``forward_cite_no_family``,
          ``forward_cite_yes_family``, ``backward_cite_no_family``,
          ``backward_cite_yes_family`` and ``abstract_text``. The regex-cleaned
          columns must hold strings.

    Returns:
      pd.DataFrame with the cleaned columns; ``table`` itself is not modified.

    Raises:
      KeyError if a required column is missing; TypeError if a regex-cleaned
      column holds a non-string value.
    """
    # Positional alignment: drop whatever index the caller's frame carries.
    source = table.reset_index(drop=True)

    def scrub(column, *patterns):
        return _strip_patterns(source[column], patterns)

    # Citations are cleaned in two passes: strip the noise, then remove the
    # " , " separators that the first pass leaves behind.
    citation_passes = (_CITATION_NOISE, r'\s\,\s')

    results = pd.DataFrame({
        'patent_number': list(source['url']),
        'pub_date': source['pub_date'],
        'priority_date': source['priority_date'],
        'grant_date': source['grant_date'],
        'filing_date': source['filing_date'],
        'inventor_name': scrub('inventor_name', r'"inventor_name":', _JSON_PUNCTUATION),
        'assignee_name_orig': scrub('assignee_name_orig', r'"assignee_name":', _JSON_PUNCTUATION),
        'assignee_name_current': scrub(
            'assignee_name_current', r'("assignee_name":)|(\\n\s\s)|(\{|\}|\[|\]|\")'),
        'forward_cite_no_family': scrub('forward_cite_no_family', *citation_passes),
        'forward_cite_yes_family': scrub('forward_cite_yes_family', *citation_passes),
        # NOTE: "yes" precedes "no" for backward citations; kept because
        # downstream code may rely on this column order.
        'backward_cite_yes_family': scrub('backward_cite_yes_family', *citation_passes),
        'backward_cite_no_family': scrub('backward_cite_no_family', *citation_passes),
        'patent_link': list(source['patent']),
        'abstract_text': source['abstract_text'],
    })

    return results


def count_patent(patent_table):
    """Add comma-based count columns next to the cleaned patent features.

    For each counted column a ``count_*`` column is inserted right after it:

      * ``inventor_name`` and ``assignee_name_orig``: number of commas + 1.
        These two columns must not contain nulls.
      * ``assignee_name_current``: number of commas + 1, or 0 for a null.
      * the four ``*_cite_*_family`` columns: number of commas (no +1),
        or 0 for a null.

    Parameters:
      patent_table (pd.DataFrame) : a frame with the columns produced by
          ``clean_patent``.

    Returns:
      pd.DataFrame with the original columns interleaved with their counts.

    Raises:
      TypeError if a counted value is neither a string nor (where allowed)
      a null.
    """

    def tally(column, new_name, offset, nulls_as_zero):
        # Series.map is used instead of DataFrame.applymap, which is
        # deprecated in recent pandas releases.
        def count_commas(value):
            if nulls_as_zero and pd.isnull(value):
                return 0
            return str.count(value, ',') + offset

        return patent_table[column].map(count_commas).rename(new_name)

    pieces = [
        patent_table[['patent_number', 'pub_date', 'priority_date',
                      'grant_date', 'filing_date', 'inventor_name']],
        tally('inventor_name', 'count_inventor_name', 1, False),
        patent_table[['assignee_name_orig']],
        tally('assignee_name_orig', 'count_assignee_name', 1, False),
        patent_table[['assignee_name_current']],
        tally('assignee_name_current', 'count_assignee_name_current', 1, True),
    ]

    # Citation fields are counted by commas alone (no +1).
    for column in ('forward_cite_no_family', 'forward_cite_yes_family',
                   'backward_cite_no_family', 'backward_cite_yes_family'):
        pieces.append(patent_table[[column]])
        pieces.append(tally(column, 'count_' + column, 0, True))

    pieces.append(patent_table[['patent_link', 'abstract_text']])

    return pd.concat(pieces, axis=1)


def XMLtoTEXT(patent_xml, saved_file_path):
    """Extract the main fields of a patent XML document and save them as text.

    The document is parsed with BeautifulSoup's ``html.parser``. The first
    ``invention-title``, ``doc-number``, ``classification-level``, ``date``
    and ``abstract`` elements plus every ``claim-text`` and ``description``
    element are printed and then written to
    ``saved_file_path + <patent number> + '.txt'``.

    The file is written as UTF-8 explicitly, so the result does not depend on
    the platform's default encoding and can be read back by UTF-8 readers.

    Parameters:
      patent_xml : XML markup (or file object) accepted by BeautifulSoup
      saved_file_path (str) : prefix of the output path; it is concatenated
          as-is, so a directory must end with a path separator

    Returns:
      The file object that was written; it is already closed on return.

    Raises:
      IndexError if one of the single-valued elements is missing.
    """
    tree = bs(patent_xml, "html.parser")

    def first_text(tag_name):
        # Text of the first element with this tag; IndexError if absent.
        return tree.find_all(tag_name)[0].text

    print('Title:')
    patent_title = first_text("invention-title")
    print(patent_title)

    print("Patent number:")
    # The document number is prefixed with 'US'; 'US0' is then shortened to 'US'.
    patent_number_new = re.sub(r'US0', 'US', 'US' + first_text('doc-number'))
    print(patent_number_new)

    print('Domain:')
    patent_domain = first_text('classification-level')
    print(patent_domain)

    print("Publication date:")
    patent_pubdate = first_text("date")
    print(patent_pubdate)

    print('Abstract:')
    patent_abstract = first_text("abstract")
    print(patent_abstract)

    print('Claims:')
    claims = tree.find_all("claim-text")
    for claim in claims:
        print(claim.text)

    print('Description:')
    description = tree.find_all('description')
    for des in description:
        print(des.text)

    # Layout: "<label>\n<value>\n\n" per section, then claims and description.
    header_sections = (
        ("Patent title", patent_title),
        ("Patent number", patent_number_new),
        ("Domain", patent_domain),
        ("Publication date", patent_pubdate),
        ("Abstract", patent_abstract),
    )
    with open(saved_file_path + patent_number_new + '.txt', 'w', encoding='utf-8') as text_file:
        for label, value in header_sections:
            text_file.write(label + '\n' + value + '\n' * 2)
        text_file.write('Claims' + '\n')
        for claim in claims:
            text_file.write(claim.text + '\n')
        text_file.write('\n' + 'Description' + '\n')
        for des in description:
            text_file.write('\n' + des.text + '\n')

    return text_file


# to download patents (.txt) by zip file
def create_download_zip(zip_directory, zip_path, filename):
    """Zip a directory and offer the archive through a Streamlit download button.

        zip_directory (str): path to directory you want to zip

        zip_path (str): prefix of the location where the zip file is saved;
                        it is concatenated with ``filename`` as-is

        filename (str): base name (without ``.zip``) of the archive on disk.
                        The name offered to the downloading user is always
                        ``patent.zip``.

    """
    archive_path = shutil.make_archive(zip_path+filename, 'zip', zip_directory)

    with open(archive_path, 'rb') as f:
        st.download_button(
            label = 'Download',
            data = f,
            file_name='patent.zip',
            # 'zip' alone is not a valid MIME type; this is the registered one.
            mime= 'application/zip'
        )



# save input files (txt) into the folder
def save_uploadedfile(uploadedfile):
    """Write an uploaded file into ``Data/input/US_patents/``.

    Parameters:
      uploadedfile : object exposing ``name`` (file name supplied by the
          client) and ``getbuffer()`` (the file's bytes).

    Only the final component of ``uploadedfile.name`` is used, so a name such
    as ``../x.txt`` or an absolute path cannot place the file outside the
    target folder. The folder is created when it does not exist yet.
    """
    target_dir = 'Data/input/US_patents/'
    os.makedirs(target_dir, exist_ok=True)

    # The name comes from the client: strip any directory part before joining.
    safe_name = os.path.basename(uploadedfile.name)

    with open(os.path.join(target_dir, safe_name), 'wb') as f:
        f.write(uploadedfile.getbuffer())

# to extract problems from patents
def extractor (folder):
    """Run the problem-extraction pipeline on a folder of patent text files.

    The ``*.txt`` files under ``constants.DATA_INPUT + folder`` go through
    ``InputHandler``, ``PatentHandler`` and ``CorpusProcessor`` in that order.
    Afterwards ``Data/graphs/US_patents/graph.json`` is loaded (presumably
    written by the corpus processor -- confirm), its ``problem_graph`` entries
    of type ``problem`` are kept, and the columns ``patent_number`` and
    ``problem`` are saved to ``data_problem/problem.csv``. Returns nothing.
    """
    source_dir = constants.DATA_INPUT + folder
    file_pattern = "*." + 'txt'

    # Pipeline stages are constructed and run strictly in this order.
    raw_corpus = InputHandler(source_dir, file_pattern).get_input()
    cleaned_corpus = PatentHandler(raw_corpus).pretreat_data()
    CorpusProcessor(cleaned_corpus, source_dir, file_pattern).process_corpus()

    # Flatten the concept graph into a dataframe.
    with open('Data/graphs/US_patents/graph.json') as graph_file:
        graph = json.load(graph_file)

    concepts = json_normalize(graph['problem_graph'], sep="_")
    concepts = concepts[['concept_sentence', 'concept_source', 'concept_type']]
    concepts = concepts.rename(columns={"concept_sentence": "problem", 'concept_source': 'patent_number',
                                        'concept_type': 'type'})

    # Keep only the concepts typed as problems.
    problems = concepts.loc[concepts['type'] == 'problem']

    print(problems)

    def patent_id(source_path):
        # Part of the source path between "US_patents/" and the extension.
        return re.search(r'(?<=US_patents\/).*?(?=.txt)', source_path).group()

    problem_results = problems.assign(patent_number=problems['patent_number'].apply(patent_id))

    print(problem_results[['problem', 'patent_number']])
    problem_results[['patent_number', 'problem']].to_csv('data_problem/problem.csv',
                                                         index=False)

@st.cache
def convert_df(df):
    """Return ``df`` serialised as CSV text, encoded to UTF-8 bytes.

    The ``st.cache`` decorator keeps the conversion from being recomputed on
    every Streamlit rerun.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')


def extract_info_text():
    """Attach title, domain and publication date to the extracted problems.

    Every ``*.txt`` file in ``Data/input/US_patents`` is read as UTF-8 and its
    non-empty lines are indexed positionally: line 1 is the title, line 3 the
    patent number, line 5 the domain and line 7 the publication date (the
    even lines are the section labels). The collected records are left-joined
    onto ``data_problem/problem.csv`` by ``patent_number``.

    The lines are read directly instead of through ``pd.read_csv(sep='\\n')``
    and the frame is built once instead of through ``DataFrame.append``; both
    of those calls fail on recent pandas releases.

    Returns:
      pd.DataFrame : the problems with ``title``, ``domain`` and
      ``publication_date`` columns added (null where no text file matches).

    Raises:
      IndexError if a text file has fewer than eight non-empty lines.
    """
    columns = ['title', 'patent_number', 'domain', 'publication_date']

    # use glob to get all the txt files in the folder
    path = 'Data/input/US_patents'
    records = []
    for txt_path in glob.glob(os.path.join(path, "*.txt")):
        with open(txt_path, encoding='utf-8') as handle:
            stripped = (line.rstrip('\r\n') for line in handle)
            # Empty lines are skipped so the positions below stay fixed.
            content = [line for line in stripped if line]
        records.append({'title': content[1], 'patent_number': content[3],
                        'domain': content[5], 'publication_date': content[7]})

    new = pd.DataFrame(records, columns=columns)
    print(new)

    problem = pd.read_csv('data_problem/problem.csv')
    final = pd.merge(problem, new, on='patent_number', how='left')
    return final

# Display label -> one-letter domain code.
_DOMAIN_LABELS = {
    'A (Human necessities)': 'A',
    'B (Performing operations; transporting)': 'B',
    'C (Chemistry; metallurgy)': 'C',
    'D (Textiles; paper)': 'D',
    'E (Fixed constructions)': 'E',
    # The label historically lacks its closing parenthesis; both spellings
    # are accepted so existing callers keep working.
    'F (Mechanical engineering; lighting; heating; weapons; blasting engines or pumps': 'F',
    'F (Mechanical engineering; lighting; heating; weapons; blasting engines or pumps)': 'F',
    'G (Physics)': 'G',
    'H (Electricity)': 'H',
}


def input_domain(user_input_domain):
    """Map a domain display label to its one-letter code (``'A'`` .. ``'H'``).

    Parameters:
      user_input_domain (str) : one of the labels such as
          ``'A (Human necessities)'``.

    Returns:
      str : the single-letter domain code.

    Raises:
      ValueError if the label is not one of the known domains (previously
      this surfaced as an UnboundLocalError).
    """
    try:
        return _DOMAIN_LABELS[user_input_domain]
    except (KeyError, TypeError):
        raise ValueError('Unknown patent domain: {!r}'.format(user_input_domain)) from None

# the function for choosing month period that user choosed
def choosing_month_period(problem_corpus,start_year, end_year, start_month, end_month):
    """Keep the rows published between two (year, month) points, inclusive.

    A row is selected when its ``(publication_year, publication_month)`` lies
    between ``(start_year, start_month)`` and ``(end_year, end_month)``. This
    one comparison replaces the former per-case branches, including a branch
    that could never run because of operator precedence
    (``start_month == 1 & end_month == 12``).

    Parameters:
      problem_corpus (pd.DataFrame) : must contain ``patent_number``,
          ``Domain``, ``First part Contradiction``,
          ``Second part Contradiction``, ``publication_date``,
          ``publication_year`` and ``publication_month``.
      start_year, end_year, start_month, end_month : bounds of the period.

    Returns:
      pd.DataFrame : the selected rows (original index kept) restricted to
      the columns above plus a ``label`` column set to ``'true'``. An empty
      frame is returned when the period is empty or reversed. The input frame
      is not modified.
    """
    year = problem_corpus['publication_year']
    month = problem_corpus['publication_month']

    # Lexicographic comparison on (year, month) against each bound.
    not_before_start = (year > start_year) | ((year == start_year) & (month >= start_month))
    not_after_end = (year < end_year) | ((year == end_year) & (month <= end_month))

    # Work on a copy so the label is never written into a view of the input.
    selected = problem_corpus.loc[not_before_start & not_after_end].copy()
    selected['label'] = 'true'

    return selected[['patent_number', 'Domain', 'First part Contradiction',
                     'Second part Contradiction', 'publication_date', 'publication_year',
                     'publication_month', 'label']]

# for IDM-Similar model (word2vec)
def avg_feature_vector(sentence, model, num_features, index2word_set):
    """Average the model vectors of the sentence's known words.

    The sentence is split on whitespace; only words contained in
    ``index2word_set`` are looked up in ``model``. When no word is known, a
    zero vector of length ``num_features`` (float32) is returned.
    """
    known_words = [token for token in sentence.split() if token in index2word_set]

    total = np.zeros((num_features, ), dtype='float32')
    # Accumulate one vector at a time, in sentence order.
    for token in known_words:
        total = total + model[token]

    if not known_words:
        return total
    return total / len(known_words)

def creat_query_id(dataset):
    """Add a ``question`` column and expose ``Unnamed: 0`` as ``id``.

    Each value of ``dataset['problem']`` is wrapped into the question
    "What is the solution for the problem that <problem>?". The ``question``
    column is added to ``dataset`` itself; the returned frame is a renamed
    copy in which ``Unnamed: 0`` is called ``id``.
    """
    opening = "What is the solution for the problem that "
    dataset['question'] = [opening + problem + "?" for problem in dataset['problem']]

    return dataset.rename(columns={'Unnamed: 0': 'id'})

def csv_to_json (csv_file,json_file):
    """Convert a csv of questions into the JSON layout used for prediction.

    Each csv row (columns ``Context``, ``id``, ``question``) becomes
    ``{"context": ..., "qas": [{"id": ..., "question": ...}]}``. The list of
    these entries is written to ``json_file`` with an indent of 4.
    """
    with open(csv_file) as source:
        results = [
            {
                'context': row['Context'],
                'qas': [{'id': row['id'], 'question': row['question']}],
            }
            for row in csv.DictReader(source)
        ]

    # write data to a json file
    with open(json_file, 'w') as target:
        target.write(json.dumps(results, indent=4))



def QA_prediction(prediction_file, prediction_output, model):
    """Run ``model.predict`` on a JSON file and save the predictions.

    The content of ``prediction_file`` is parsed as JSON and handed to
    ``model.predict``; whatever it returns is written to
    ``prediction_output`` as JSON with an indent of 4 and printed.
    """
    with open(prediction_file, 'r') as pre_file:
        to_predict = json.load(pre_file)
        predictions = model.predict(to_predict)

    serialized = json.dumps(predictions, indent=4)
    with open(prediction_output, 'w') as json_file:
        json_file.write(serialized)
    print(predictions)

def json_to_csv(input_file, output_file):
    """Save the ``id``/``answer`` pairs of a prediction JSON file as csv.

    ``input_file`` is loaded with ``pd.read_json``; only its first row is
    used, and each cell of that row is expected to be a mapping with the
    keys ``id`` and ``answer``. The pairs are written to ``output_file``
    without the index column. Intermediate values are printed along the way.
    """
    predictions = pd.read_json(input_file)
    print(predictions.head())

    # First row only: one cell per answered question.
    answers = predictions.iloc[0][:]
    print(answers.head())
    total = len(answers)
    print(total)

    table = pd.DataFrame(index=np.arange(total), columns=['id', 'answer'])
    print(table)

    for position, entry in enumerate(answers):
        print(entry)
        table.iloc[position, 0] = entry['id']
        table.iloc[position, 1] = entry['answer']

    print(table.head())
    table.to_csv(output_file, index=False)