Spaces:

xin
/

PatentSolver

Build error

App Files Files Community

PatentSolver / App /bin /CorpusProcessor.py

xin

initial commit

22738ca almost 4 years ago

raw

history blame contribute delete

19.5 kB

	#!/usr/bin/env python3
	# -- coding: utf-8 --


	import json
	import os
	import re
	import matplotlib.pyplot as plt
	import numpy as np
	import Levenshtein
	from io import StringIO
	from App.bin import constants
	import hashlib
	from collections import OrderedDict
	from App.bin.InformationExtractor import InformationExtractor
	from App.bin.ParameterExtractor import ParameterExtractor
	from App.bin.TechnologyFinder import TechnologyFinder
	from App.bin.InformationExtractor_Claims import InformationExtractorClaims

	class CorpusProcessor(object):

	def __init__(self, patents,input_folder, file_extension):
	self.patents = patents
	self.input_folder = input_folder
	self.file_extension = file_extension
	print("Processing started")


	def make_graphic (self, sizes, text, colors, labels):

	col = [[i / 255. for i in c] for c in colors]

	fig, ax = plt.subplots()
	ax.axis('equal')
	width = 0.35
	kwargs = dict(colors=col, startangle=180)
	outside, _ = ax.pie(sizes, radius=1, pctdistance=1 - width / 2, labels=labels, **kwargs)
	plt.setp(outside, width=width, edgecolor='white')

	kwargs = dict(size=20, fontweight='bold', va='center')
	ax.text(0, 0, text, ha='center', **kwargs)

	plt.show()

	def change_keys(self, dictionnary, number):
	number = number+'-'
	if type(dictionnary) is dict:
	return dict([(number+str(k) , self.change_keys(v, number)) for k, v in dictionnary.items()])
	else:
	return dictionnary

	def process_corpus(self):

	count_abstract = 0
	count_claims = 0
	count_description = 0
	count_patent = 0
	total_sentences_number =0
	count_concepts_solupart = 0
	count_concepts_problem = 0
	patents = self.patents
	input_folder = self.input_folder
	file_extension = self.file_extension
	project_folder = os.path.basename(os.path.normpath(input_folder))
	graph_folder = constants.GRAPH_FOLDER + project_folder+"/"
	extracted_concepts = []
	output_result = []
	parameters_graph = []
	reduced_content = []
	patent_corpus = []
	source_list = []
	parameters_list =[]
	technologies_graph =[]


	for patent_file in patents:
	output_json_claims ={}
	total_sentences_number_claims =0

	if type(patent_file) is dict:
	patent_file = json.dumps(patent_file)

	read_patent = StringIO(patent_file)
	patent = json.load(read_patent)
	nNumber = patent['number']
	aAbstract = patent['abstract']
	cClaims = patent['claims']
	dDescription = patent['description']

	root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
	root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

	if nNumber is not None:
	match = re.search('(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', nNumber)
	# CC for country code
	CC = match.group(1)
	# NR for Number
	NR = match.group(2)
	NR = re.sub(r'\s', '', NR)
	# KC for Kind code
	KC = match.group(4)

	urlImg = root_img_url + '&CC=' + CC + '&NR=' + NR + '&KC=' + KC
	urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'



	#Find a more elegant way to do it
	patent_content = aAbstract + cClaims + dDescription
	patent_content = patent_content.splitlines()
	# for line in patent_content:
	# line = self.dataCleaner(line)
	# reduced_content.append(line)

	for line in patent_content:
	get_parameters = ParameterExtractor(line)
	parameters = get_parameters.extract_parameters()
	if parameters:
	parameters_list.extend( parameters)
	for i in parameters_list:
	for j in parameters_list:
	if i != j and len(i.split()) == 1:
	if j.find(i) > -1 and i in parameters_list:

	parameters_list.remove(i)

	parameters_list=list(set(parameters_list))
	if len(parameters_list) > 50:
	for i in parameters_list:
	for j in parameters_list:
	if i!=j:
	comp = Levenshtein.ratio(i, j)
	if comp >=.4 and i in parameters_list and j in parameters_list:
	if len(i) > len(j):
	# print('{} is near duplicate of {}'.format(i, j))
	parameters_list.remove(i)

	for el in parameters_list:
	if len(el.split()) == 1:
	parameters_list.remove(el)

	parameters = dict(enumerate(parameters_list, 1))

	parameters = self.change_keys(parameters, nNumber.lower())



	source = input_folder+"/"+nNumber+file_extension.strip("*")

	parameters_array = OrderedDict({
	"concept": {
	"source": source,
	"valeurs": parameters,
	"image": urlImg,
	"pdf": urlPDF
	}

	})
	pParameters= json.dumps(parameters_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

	parameters_graph.append(pParameters)

	if dDescription !="" or cClaims!="":
	count_description +=1
	extract_concepts = InformationExtractor(dDescription,input_folder, file_extension, nNumber )
	output_json, total_sentences_number = extract_concepts.get_from_description()
	extract_concepts_claims = InformationExtractorClaims(cClaims,input_folder, file_extension, nNumber )
	output_json_claims_result= extract_concepts_claims.main()
	if output_json_claims_result is not None:
	output_json_claims, total_sentences_number_claims = output_json_claims_result

	count_claims += 1
	if output_json is not None:
	if type(output_json) is dict:
	output_json = json.dumps(output_json)
	extracted_concepts.append(output_json)
	total_sentences_number += total_sentences_number
	if output_json_claims is not None :
	if type(output_json_claims) is dict:
	output_json_claims = json.dumps(output_json_claims)
	extracted_concepts.append(output_json_claims)
	total_sentences_number += total_sentences_number_claims
	elif cClaims !="":
	count_claims +=1
	print('Processing claims')
	else:
	count_abstract +=1
	print("processing abstract")
	count_patent +=1


	#print(source)
	source_list.append(source)
	patent_corpus.append(reduced_content)
	patent_corpus = dict(zip(source_list, patent_corpus))
	'''
	get_patent_technologies = TechnologyFinder(patent_corpus)
	technologies = get_patent_technologies.get_technologies()


	for source_file, technologies_list in technologies.items():

	technologies_array = OrderedDict({
	"concept": {
	"source": source_file,
	"values": technologies_list
	}

	})
	tTechnologies = json.dumps(technologies_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

	technologies_graph.append(tTechnologies)
	'''
	print(type(extracted_concepts))
	header = '{'
	graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
	parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
	#technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
	footer = '}'
	#output_result.extend((header, graph, parameters_output,technologies_output, footer ))
	output_result.extend((header, graph, parameters_output, footer))

	output_result = "".join(output_result)
	output_result = re.sub(r'\,{2,}', ',', output_result)
	output_result = re.sub(r'\}\,\]', '}]', output_result)


	# exit()
	# print(output_result)
	concepts_json = json.loads(output_result)

	# concepts_json = json.loads(concepts_json)


	count_concepts = len(concepts_json['problem_graph'])
	for item, value in concepts_json.items():
	#if cle == "type" and value =="partialSolution":
	# print ("yes")
	for element in value:
	for cle, valeur in element.items():
	for k,v in valeur.items():
	if k == "type" and v =="partialSolution":
	count_concepts_solupart += 1
	elif k == "type" and v =="problem":
	count_concepts_problem += 1
	json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
	#print(concepts_json.keys())

	# original code
	with open(graph_folder+"graph.json", 'w') as json_graph:

	# with open(graph_folder + 'graph.json', 'w') as json_graph:
	json_graph.write(json_write_to_file)
	number_neutre = count_concepts - count_concepts_problem - count_concepts_solupart
	print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (count_patent, count_abstract, count_claims, count_description))
	print("%s phrases ont été analysée(s)" % (total_sentences_number))
	print("%s concepts ont été trouvé(s) dont %s problèmes, %s solutions partielles et %s neutres" % (count_concepts, count_concepts_problem, count_concepts_solupart, number_neutre))

	#Display graphics
	first_color = (46, 204, 113)
	second_color = (245, 176, 65)
	#self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",[first_color,second_color],['Problems','Partial Solutions'])
	return json_write_to_file

	def process_corpus_json(self):

	count_abstract = 0
	count_claims = 0
	count_description = 0
	count_patent = 0
	total_sentences_number = 0
	count_concepts_solupart = 0
	count_concepts_problem = 0
	patents = self.patents
	input_folder = self.input_folder
	file_extension = self.file_extension
	project_folder = os.path.basename(os.path.normpath(input_folder))
	graph_folder = constants.GRAPH_FOLDER + project_folder + "/"
	extracted_concepts = []
	output_result = []
	parameters_graph = []
	reduced_content = []
	patent_corpus = []
	source_list = []
	parameters_list = []
	technologies_graph = []
	for patent_file in patents:
	# print(type(patent_file))

	#if type(patent_file) is dict:
	patent_file = json.dumps(patent_file)

	read_patent = StringIO(patent_file)
	patent = json.load(read_patent)
	# print(type(patent))
	filename = patent['filename']
	nNumber = patent['number']
	aAbstract = patent['abstract']
	cClaims = patent['claims']
	dDescription = patent['description']

	# Find a more elegant way to do it
	patent_content = aAbstract + cClaims + dDescription
	patent_content = patent_content.splitlines()
	# for line in patent_content:
	# line = self.dataCleaner(line)
	# reduced_content.append(line)

	for line in patent_content:
	get_parameters = ParameterExtractor(line)
	parameters = get_parameters.extract_parameters()
	if parameters:
	parameters_list.extend(parameters)
	for i in parameters_list:
	for j in parameters_list:
	if i != j and len(i.split()) == 1:
	if j.find(i) > -1 and i in parameters_list:

	parameters_list.remove(i)

	parameters_list = list(set(parameters_list))

	if len(parameters_list) > 50:
	for i in parameters_list:
	for j in parameters_list:
	if i!=j:
	comp = Levenshtein.ratio(i, j)
	if comp >=.4 and i in parameters_list and j in parameters_list:
	if len(i) > len(j):
	# print('{} is near duplicate of {}'.format(i, j))
	parameters_list.remove(i)

	for el in parameters_list:
	if len(el.split()) == 1:
	parameters_list.remove(el)





	print('{} {}'.format('Taille: ', len(parameters_list)))


	parameters = dict(enumerate(parameters_list, 1))

	parameters = self.change_keys(parameters, nNumber.lower())

	source = input_folder + "/" + nNumber + file_extension.strip("*")

	parameters_array = OrderedDict({
	"concept": {
	"source": source,
	"valeurs": parameters
	}

	})
	pParameters = json.dumps(parameters_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

	parameters_graph.append(pParameters)

	#if dDescription != "" and cClaims!="":
	if dDescription != "":
	count_description += 1
	extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, filename)
	output_json, total_sentences_number_d = extract_concepts.get_from_description()
	if output_json != "":
	extracted_concepts.append(output_json)
	total_sentences_number += total_sentences_number_d
	#count_claims += 1
	#extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
	#output_json, total_sentences_number_c = extract_concepts.get_from_claims()
	#if output_json != "":
	#extracted_concepts.append(output_json)
	#total_sentences_number_c += total_sentences_number_c
	#total_sentences_number = total_sentences_number_c+total_sentences_number_d

	elif cClaims != "":
	count_claims += 1
	extract_concepts = InformationExtractor(cClaims, input_folder, file_extension, nNumber)
	output_json, total_sentences_number = extract_concepts.get_from_claims()
	if output_json != "":
	extracted_concepts.append(output_json)
	total_sentences_number += total_sentences_number
	elif dDescription != "":
	count_description += 1
	extract_concepts = InformationExtractor(dDescription, input_folder, file_extension, nNumber)
	output_json, total_sentences_number = extract_concepts.get_from_description()
	if output_json != "":
	extracted_concepts.append(output_json)
	total_sentences_number += total_sentences_number
	count_claims += 1

	else:
	count_abstract += 1
	print("processing abstract")
	count_patent += 1

	# print(source)
	# source_list.append(source)
	# patent_corpus.append(reduced_content)
	# patent_corpus = dict(zip(source_list, patent_corpus))
	'''
	get_patent_technologies = TechnologyFinder(patent_corpus)
	technologies = get_patent_technologies.get_technologies()


	for source_file, technologies_list in technologies.items():

	technologies_array = OrderedDict({
	"concept": {
	"source": source_file,
	"values": technologies_list
	}

	})
	tTechnologies = json.dumps(technologies_array, sort_keys=OrderedDict, indent=4, separators=(',', ': '))

	technologies_graph.append(tTechnologies)
	'''

	header = '{'
	graph = '"problem_graph": [%s],' % ','.join(extracted_concepts)
	parameters_output = '"parameters": [%s]' % ','.join(parameters_graph)
	# technologies_output = '"technologies": [%s]' % ','.join(technologies_graph)
	footer = '}'
	# output_result.extend((header, graph, parameters_output,technologies_output, footer ))
	output_result.extend((header, graph, parameters_output, footer))

	output_result = "".join(output_result)
	output_result = re.sub(r'\,{2,}', ',', output_result)
	output_result = re.sub(r'\}\,\]', '}]', output_result)
	concepts_json = json.loads(output_result)

	count_concepts = len(concepts_json['problem_graph'])
	for item, value in concepts_json.items():
	# if cle == "type" and value =="partialSolution":
	# print ("yes")
	for element in value:
	for cle, valeur in element.items():
	for k, v in valeur.items():
	if k == "type" and v == "partialSolution":
	count_concepts_solupart += 1
	elif k == "type" and v == "problem":
	count_concepts_problem += 1
	json_write_to_file = json.dumps(concepts_json, sort_keys=False, indent=4, separators=(',', ': '))
	# print(concepts_json.keys())
	with open(graph_folder + "graph.json", 'w') as json_graph:
	json_graph.write(json_write_to_file)

	print("Le corpus contenait %s brevets dont %s abstract, %s revendications et %s descriptions" % (
	count_patent, count_abstract, count_claims, count_description))
	print("%s phrases ont été analysée(s)" % (total_sentences_number))
	print("%s concepts ont été trouvé(s) dont %s problèmes et %s solutions partielles" % (
	count_concepts, count_concepts_problem, count_concepts_solupart))

	# Display graphics
	first_color = (46, 204, 113)
	second_color = (245, 176, 65)
	# self.make_graphic([count_concepts_problem, count_concepts_solupart], "Ratio",[first_color,second_color],['Problems','Partial Solutions'])
	return json_write_to_file