# PatentSolver/App/bin/PGProcessor.py
# RedBaron5's picture
# Duplicate from xin/PatentSolver
# c13b805
# -*- coding: utf-8 -*-
import json
import os
import re
import matplotlib.pyplot as plt
import numpy as np
from io import StringIO
from App4api.bin import constants
from collections import OrderedDict
from App4api.bin.InformationExtractor import InformationExtractor
from App4api.bin.ParameterExtractor import ParameterExtractor
from App4api.bin.TechnologyFinder import TechnologyFinder
class PGProcessor(object):
    """Build a problem graph from a corpus of JSON-encoded patents.

    Each patent is a JSON string; its description (when non-empty) is handed
    to ``InformationExtractor`` and the extracted concepts are merged into a
    single ``{"problem_graph": [...]}`` document.
    """

    def __init__(self, patents, input_folder, file_extension):
        """Store the corpus and the settings forwarded to the extractor.

        Args:
            patents: Iterable of JSON strings, one per patent.
            input_folder: Corpus folder path; its last path component names
                the sub-folder of ``constants.GRAPH_FOLDER`` that receives
                the generated graph.
            file_extension: Passed through unchanged to
                ``InformationExtractor``.
        """
        self.patents = patents
        self.input_folder = input_folder
        self.file_extension = file_extension
        print("Processing started")

    def process_corpus(self):
        """Extract concepts from every patent and write the problem graph.

        For each patent the keys ``number``, ``claims``, ``description`` and
        ``source`` are read. Only non-empty descriptions are actually
        analysed; patents that only have claims or only an abstract are
        counted and reported but not processed.

        Side effects:
            * Writes ``graph.json`` into
              ``constants.GRAPH_FOLDER + <basename of input_folder> + "/"``.
            * Prints a (French) summary of the corpus to stdout.

        Returns:
            dict: The parsed graph, with a single ``"problem_graph"`` key
            holding the list of extracted concepts.

        Raises:
            KeyError: If a patent lacks one of the keys read above.
            ValueError: If a patent, or the assembled graph, is not valid
                JSON (``json.JSONDecodeError`` is a ``ValueError``).
        """
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        count_concepts_solupart = 0
        count_concepts_problem = 0

        input_folder = self.input_folder
        file_extension = self.file_extension
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder + "/"

        # Raw JSON fragments returned by the extractor, one entry per patent.
        extracted_concepts = []

        for patent_file in self.patents:
            patent = json.loads(patent_file)
            number = patent['number']
            claims = patent['claims']
            description = patent['description']
            source = patent['source']

            if description != "":
                count_description += 1
                extractor = InformationExtractor(
                    description, input_folder, file_extension, number, source
                )
                # Keep the per-patent count in its own variable: unpacking
                # straight into the running total used to overwrite it, so
                # the report showed twice the last patent's count instead of
                # the sum over the corpus.
                # NOTE(review): the second value is presumably the number of
                # sentences analysed for this patent -- confirm against
                # InformationExtractor.get_from_description.
                output_json, sentences_number = extractor.get_from_description()
                if output_json != "":
                    extracted_concepts.append(output_json)
                total_sentences_number += sentences_number
            elif claims != "":
                count_claims += 1
                print('Processing claims')
            else:
                count_abstract += 1
                print("processing abstract")
            count_patent += 1

        # The fragments are spliced in as text rather than parsed one by one
        # because a fragment may hold several comma-separated objects.
        graph_document = '{"problem_graph": [%s]}' % ','.join(extracted_concepts)
        concepts_json = json.loads(graph_document)
        count_concepts = len(concepts_json['problem_graph'])

        # Every concept maps one or more names to an attribute dict; tally
        # the ones whose "type" marks a partial solution or a problem.
        for concepts in concepts_json.values():
            for concept in concepts:
                for attributes in concept.values():
                    concept_type = attributes.get("type")
                    if concept_type == "partialSolution":
                        count_concepts_solupart += 1
                    elif concept_type == "problem":
                        count_concepts_problem += 1

        json_write_to_file = json.dumps(
            concepts_json, sort_keys=False, indent=4, separators=(',', ': ')
        )
        with open(graph_folder + "graph.json", 'w') as json_graph:
            json_graph.write(json_write_to_file)

        print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (count_patent, count_abstract, count_claims, count_description))
        print("%s phrases ont été analysée(s)" % (total_sentences_number))
        print("%s concepts ont été trouvé(s) dont %s problèmes et %s solutions partielles" % (count_concepts, count_concepts_problem, count_concepts_solupart))

        return concepts_json