#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import hashlib
import json
import os
import re
from collections import OrderedDict
from io import StringIO

import Levenshtein
import matplotlib.pyplot as plt
import numpy as np

from App.bin import constants
from App.bin.InformationExtractor import InformationExtractor
from App.bin.InformationExtractor_Claims import InformationExtractorClaims
from App.bin.ParameterExtractor import ParameterExtractor
from App.bin.TechnologyFinder import TechnologyFinder

class CorpusProcessor(object):

    def __init__(self, patents, input_folder, file_extension):
        self.patents = patents
        self.input_folder = input_folder
        self.file_extension = file_extension
        print("Processing started")
    def make_graphic(self, sizes, text, colors, labels):
        col = [[i / 255. for i in c] for c in colors]
        fig, ax = plt.subplots()
        ax.axis('equal')
        width = 0.35
        kwargs = dict(colors=col, startangle=180)
        outside, _ = ax.pie(sizes, radius=1, pctdistance=1 - width / 2, labels=labels, **kwargs)
        plt.setp(outside, width=width, edgecolor='white')
        kwargs = dict(size=20, fontweight='bold', va='center')
        ax.text(0, 0, text, ha='center', **kwargs)
        plt.show()
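
    # Recursively prefixes every key of a (possibly nested) dict with the
    # patent number, e.g. change_keys({1: 'speed'}, 'ep1234567') returns
    # {'ep1234567-1': 'speed'}.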
    def change_keys(self, dictionnary, number):
        number = number + '-'
        if isinstance(dictionnary, dict):
            return dict((number + str(k), self.change_keys(v, number)) for k, v in dictionnary.items())
        else:
            return dictionnary
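
    # Full pipeline for Espacenet patents: extract measurable parameters and
    # problem/partial-solution concepts from each patent, assemble everything
    # into one "problem graph" JSON document, write it to disk and print
    # summary statistics.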
    def process_corpus(self):
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        count_concepts_solupart = 0
        count_concepts_problem = 0
        patents = self.patents
        input_folder = self.input_folder
        file_extension = self.file_extension
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
        extracted_concepts = []
        output_result = []
        parameters_graph = []
        reduced_content = []
        patent_corpus = []
        source_list = []
        parameters_list = []
        technologies_graph = []
        for patent_file in patents:
            output_json_claims = {}
            total_sentences_number_claims = 0
            if isinstance(patent_file, dict):
                patent_file = json.dumps(patent_file)
            read_patent = StringIO(patent_file)
            patent = json.load(read_patent)
            nNumber = patent['number']
            aAbstract = patent['abstract']
            cClaims = patent['claims']
            dDescription = patent['description']
            root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
            root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'
            if nNumber is not None:
                # Split the publication number into country code (CC),
                # serial number (NR) and kind code (KC), e.g. EP1234567A1
                match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', nNumber)
                CC = match.group(1)
                NR = re.sub(r'\s', '', match.group(2))
                KC = match.group(4)
                urlImg = root_img_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC
                urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
            # Find a more elegant way to do it
            patent_content = aAbstract + cClaims + dDescription
            patent_content = patent_content.splitlines()
            # for line in patent_content:
            #     line = self.dataCleaner(line)
            #     reduced_content.append(line)
            for line in patent_content:
                get_parameters = ParameterExtractor(line)
                parameters = get_parameters.extract_parameters()
                if parameters:
                    parameters_list.extend(parameters)
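
            # Prune the parameter list: drop one-word parameters contained in a
            # longer parameter, then near-duplicates (Levenshtein ratio >= 0.4)
            # when the list is large. Iterate over copies, since removing from
            # a list while iterating over it skips elements.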
            for i in list(parameters_list):
                for j in parameters_list:
                    if i != j and len(i.split()) == 1:
                        if j.find(i) > -1 and i in parameters_list:
                            parameters_list.remove(i)
            parameters_list = list(set(parameters_list))
            if len(parameters_list) > 50:
                for i in list(parameters_list):
                    for j in list(parameters_list):
                        if i != j:
                            comp = Levenshtein.ratio(i, j)
                            if comp >= .4 and i in parameters_list and j in parameters_list:
                                if len(i) > len(j):
                                    # print('{} is near duplicate of {}'.format(i, j))
                                    parameters_list.remove(i)
                for el in list(parameters_list):
                    if len(el.split()) == 1:
                        parameters_list.remove(el)
            parameters = dict(enumerate(parameters_list, 1))
            parameters = self.change_keys(parameters, nNumber.lower())
            source = input_folder + "/" + nNumber + file_extension.strip("*")
            parameters_array = OrderedDict({
                "concept": {
                    "source": source,
                    "valeurs": parameters,
                    "image": urlImg,
                    "pdf": urlPDF
                }
            })
            pParameters = json.dumps(parameters_array, sort_keys=True, indent=4, separators=(',', ': '))
            parameters_graph.append(pParameters)
| if dDescription !="" or cClaims!="": | |
| count_description +=1 | |
| extract_concepts = InformationExtractor(dDescription,input_folder, file_extension, nNumber ) | |
| output_json, total_sentences_number = extract_concepts.get_from_description() | |
| extract_concepts_claims = InformationExtractorClaims(cClaims,input_folder, file_extension, nNumber ) | |
| output_json_claims_result= extract_concepts_claims.main() | |
| if output_json_claims_result is not None: | |
| output_json_claims, total_sentences_number_claims = output_json_claims_result | |
| count_claims += 1 | |
| if output_json is not None: | |
| if type(output_json) is dict: | |
| output_json = json.dumps(output_json) | |
| extracted_concepts.append(output_json) | |
| total_sentences_number += total_sentences_number | |
| if output_json_claims is not None : | |
| if type(output_json_claims) is dict: | |
| output_json_claims = json.dumps(output_json_claims) | |
| extracted_concepts.append(output_json_claims) | |
| total_sentences_number += total_sentences_number_claims | |
| elif cClaims !="": | |
| count_claims +=1 | |
| print('Processing claims') | |
| else: | |
| count_abstract +=1 | |
| print("processing abstract") | |
            count_patent += 1
            # print(source)
            source_list.append(source)
            patent_corpus.append(reduced_content)
        patent_corpus = dict(zip(source_list, patent_corpus))
        '''
        get_patent_technologies = TechnologyFinder(patent_corpus)
        technologies = get_patent_technologies.get_technologies()
        for source_file, technologies_list in technologies.items():
            technologies_array = OrderedDict({
                "concept": {
                    "source": source_file,
                    "values": technologies_list
                }
            })
            tTechnologies = json.dumps(technologies_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
            technologies_graph.append(tTechnologies)
        '''
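
        # The per-patent fragments are already serialised JSON strings, so the
        # final document is assembled textually; the regex cleanup below
        # removes the stray commas left behind by empty fragment lists.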
        # print(type(extracted_concepts))
        header = '{'
        graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
        parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
        # technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
        footer = '}'
        # output_result.extend((header, graph, parameters_output, technologies_output, footer))
        output_result.extend((header, graph, parameters_output, footer))
        output_result = "".join(output_result)
        output_result = re.sub(r',{2,}', ',', output_result)
        output_result = re.sub(r'\},\]', '}]', output_result)
        concepts_json = json.loads(output_result)
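
        # Tally problems and partial solutions for the summary printout.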
        count_concepts = len(concepts_json['problem_graph'])
        for item, value in concepts_json.items():
            for element in value:
                for cle, valeur in element.items():
                    for k, v in valeur.items():
                        if k == "type" and v == "partialSolution":
                            count_concepts_solupart += 1
                        elif k == "type" and v == "problem":
                            count_concepts_problem += 1
        json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
        with open(graph_folder + "graph.json", 'w') as json_graph:
            json_graph.write(json_write_to_file)
        number_neutre = count_concepts - count_concepts_problem - count_concepts_solupart
        print("The corpus contained %s patents: %s abstracts, %s claims and %s descriptions" % (count_patent, count_abstract, count_claims, count_description))
        print("%s sentences were analysed" % total_sentences_number)
        print("%s concepts were found: %s problems, %s partial solutions and %s neutral" % (count_concepts, count_concepts_problem, count_concepts_solupart, number_neutre))
        # Display graphics
        first_color = (46, 204, 113)
        second_color = (245, 176, 65)
        # self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio", [first_color, second_color], ['Problems', 'Partial Solutions'])
        return json_write_to_file
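
    # Variant of process_corpus for patents already parsed from JSON files:
    # the same pipeline, keyed by the source filename, without the Espacenet
    # image/PDF links and with description and claims handled separately.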
    def process_corpus_json(self):
        count_abstract = 0
        count_claims = 0
        count_description = 0
        count_patent = 0
        total_sentences_number = 0
        count_concepts_solupart = 0
        count_concepts_problem = 0
        patents = self.patents
        input_folder = self.input_folder
        file_extension = self.file_extension
        project_folder = os.path.basename(os.path.normpath(input_folder))
        graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
        extracted_concepts = []
        output_result = []
        parameters_graph = []
        reduced_content = []
        patent_corpus = []
        source_list = []
        parameters_list = []
        technologies_graph = []
        for patent_file in patents:
            # if isinstance(patent_file, dict):
            patent_file = json.dumps(patent_file)
            read_patent = StringIO(patent_file)
            patent = json.load(read_patent)
            filename = patent['filename']
            nNumber = patent['number']
            aAbstract = patent['abstract']
            cClaims = patent['claims']
            dDescription = patent['description']
            # Find a more elegant way to do it
            patent_content = aAbstract + cClaims + dDescription
            patent_content = patent_content.splitlines()
            # for line in patent_content:
            #     line = self.dataCleaner(line)
            #     reduced_content.append(line)
            for line in patent_content:
                get_parameters = ParameterExtractor(line)
                parameters = get_parameters.extract_parameters()
                if parameters:
                    parameters_list.extend(parameters)
            # Same pruning as in process_corpus: iterate over copies, since
            # removing from a list while iterating over it skips elements.
            for i in list(parameters_list):
                for j in parameters_list:
                    if i != j and len(i.split()) == 1:
                        if j.find(i) > -1 and i in parameters_list:
                            parameters_list.remove(i)
            parameters_list = list(set(parameters_list))
            if len(parameters_list) > 50:
                for i in list(parameters_list):
                    for j in list(parameters_list):
                        if i != j:
                            comp = Levenshtein.ratio(i, j)
                            if comp >= .4 and i in parameters_list and j in parameters_list:
                                if len(i) > len(j):
                                    # print('{} is near duplicate of {}'.format(i, j))
                                    parameters_list.remove(i)
                for el in list(parameters_list):
                    if len(el.split()) == 1:
                        parameters_list.remove(el)
            print('{} {}'.format('Size:', len(parameters_list)))
            parameters = dict(enumerate(parameters_list, 1))
            parameters = self.change_keys(parameters, nNumber.lower())
            source = input_folder + "/" + nNumber + file_extension.strip("*")
            parameters_array = OrderedDict({
                "concept": {
                    "source": source,
                    "valeurs": parameters
                }
            })
            pParameters = json.dumps(parameters_array, sort_keys=True, indent=4, separators=(',', ': '))
            parameters_graph.append(pParameters)
            # if dDescription != "" and cClaims != "":
            if dDescription != "":
                count_description += 1
                extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, filename)
                output_json, total_sentences_number_d = extract_concepts.get_from_description()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    total_sentences_number += total_sentences_number_d
                # count_claims += 1
                # extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
                # output_json, total_sentences_number_c = extract_concepts.get_from_claims()
                # if output_json != "":
                #     extracted_concepts.append(output_json)
                # total_sentences_number_c += total_sentences_number_c
                # total_sentences_number = total_sentences_number_c + total_sentences_number_d
            elif cClaims != "":
                count_claims += 1
                extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
                output_json, total_sentences_number_c = extract_concepts.get_from_claims()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    total_sentences_number += total_sentences_number_c
            elif dDescription != "":
                # Unreachable: an empty description is already excluded by the first branch
                count_description += 1
                extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, nNumber)
                output_json, total_sentences_number_d = extract_concepts.get_from_description()
                if output_json != "":
                    extracted_concepts.append(output_json)
                    total_sentences_number += total_sentences_number_d
                count_claims += 1
            else:
                count_abstract += 1
                print("processing abstract")
            count_patent += 1
            # print(source)
            # source_list.append(source)
            # patent_corpus.append(reduced_content)
        # patent_corpus = dict(zip(source_list, patent_corpus))
        '''
        get_patent_technologies = TechnologyFinder(patent_corpus)
        technologies = get_patent_technologies.get_technologies()
        for source_file, technologies_list in technologies.items():
            technologies_array = OrderedDict({
                "concept": {
                    "source": source_file,
                    "values": technologies_list
                }
            })
            tTechnologies = json.dumps(technologies_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))
            technologies_graph.append(tTechnologies)
        '''
        header = '{'
        graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
        parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
        # technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
        footer = '}'
        # output_result.extend((header, graph, parameters_output, technologies_output, footer))
        output_result.extend((header, graph, parameters_output, footer))
        output_result = "".join(output_result)
        output_result = re.sub(r',{2,}', ',', output_result)
        output_result = re.sub(r'\},\]', '}]', output_result)
        concepts_json = json.loads(output_result)
        count_concepts = len(concepts_json['problem_graph'])
        for item, value in concepts_json.items():
            for element in value:
                for cle, valeur in element.items():
                    for k, v in valeur.items():
                        if k == "type" and v == "partialSolution":
                            count_concepts_solupart += 1
                        elif k == "type" and v == "problem":
                            count_concepts_problem += 1
        json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
        with open(graph_folder + "graph.json", 'w') as json_graph:
            json_graph.write(json_write_to_file)
| print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % ( | |
| count_patent, count_abstract, count_claims, count_description)) | |
| print("%s phrases ont été analysée(s)" % (total_sentences_number)) | |
| print("%s concepts ont été trouvé(s) dont %s problèmes et %s solutions partielles" % ( | |
| count_concepts, count_concepts_problem, count_concepts_solupart)) | |
| # Display graphics | |
| first_color = (46, 204, 113) | |
| second_color = (245, 176, 65) | |
| # self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",[first_color,second_color],['Problems','Partial Solutions']) | |
| return json_write_to_file |