# -*- coding: utf-8 -*-
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
import nltk
nltk.download('all')

import os
import re
import json
import hashlib
import Levenshtein
import uuid

from App.bin import constants
from collections import OrderedDict
from nltk import word_tokenize
from App.bin.SharpClassifier import SharpClassifier
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.SentenceClassifier import SentenceClassifier
from App.bin.ParameterExtractor import ParameterExtractor
class InformationExtractor(object):
    patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
    sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
    sentence_finder._params.abbrev_types.update(patent_abbreviations)
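    # Note: the class-level attributes above are shared by every instance; the
    # punkt model is loaded once and extended with the abbreviations read from
    # the ASSETS folder so those tokens are not treated as sentence boundaries.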

    def __init__(self, section, input_folder, file_extension, file_name):
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name
        print("Extracting problem graph")
    # @staticmethod
    def discardLines(self, line, lexic):
        # Return the line unless it contains a word from the exclusion lexicon,
        # in which case None is returned (the line is discarded).
        with open(constants.ASSETS + lexic) as m:
            exclusion_list = m.read().splitlines()
        if not any(word in line for word in exclusion_list):
            return line

    def selectLines(self, line, lexic):
        # Return the line only if it contains a word from the inclusion lexicon.
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        if any(word in line for word in inclusion_list):
            return line
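    # Hypothetical illustration: if the exclusion lexicon contains "figure",
    # discardLines("as shown in figure 2, ...", "exclusionList") returns None,
    # whereas selectLines(...) only passes lines containing an inclusion word.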
    def last_cleansing(self, concept):
        # Normalise an extracted clause: lower-case it, drop a trailing period
        # and strip leading discourse cues left over from the sentence splits.
        concept = str(concept)
        concept = concept.lower()
        if concept.endswith("."):
            concept = concept.strip(".")
        concept = re.sub(r'^consequently ', '', concept)
        concept = re.sub(r'^such ', '', concept)
        concept = re.sub(r'^said ', '', concept)
        concept = re.sub(r'^\s+', '', concept)
        concept = re.sub(r'^it is worth noting that ', '', concept)
        concept = re.sub(r'^example of ', '', concept)
        concept = re.sub(r'^since ', '', concept)
        concept = re.sub(r'^\( |\)$', '', concept)
        return concept
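    # Illustration: last_cleansing("Consequently the pump overheats.")
    # returns "the pump overheats".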
    # def get_from_claims(self):
    #
    #     section = self.section
    #     content = []
    #     sentence_finder = InformationExtractor.sentence_finder
    #     sentences = sentence_finder.tokenize(section.strip())
    #     with open(constants.ASSETS + "getFromClaims") as concept:
    #         # next(concept)
    #         included_words = concept.read().splitlines()
    #     include_link_pattern = re.compile('|'.join(included_words))
    def get_from_description(self):
        previous_polarity = ''
        noise_trash = []
        content = []
        include_links = []
        output_content = []
        ex_output_content = []
        output_result = []
        output_linked_content = []
        output_inter_content = []
        uniq_output_linked_content = []
        ex_output_content_linked = []

        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension
        projectFolder = os.path.basename(os.path.normpath(input_folder))
        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")

        # Node ids are prefixed with a random UUID; the md5 digest of the file
        # name was computed here originally but is superseded by the UUID.
        # graphItemIdValue = hashlib.md5(file_name.encode()).hexdigest()
        graphItemIdValue = str(uuid.uuid4())

        t_sline = ""
        t_sline_ex = []
        compt_Id = 30
        compt_Id_ex = 40
        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

        # Build the Espacenet image / PDF links from the publication file name.
        urlImg = root_img_url
        urlPDF = root_pdf_url
        if file_name is not None:
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            if match is not None:
                # CC for country code
                CC = match.group(1)
                # NR for Number
                NR = match.group(2)
                NR = re.sub(r'\s', '', NR)
                # KC for Kind code
                KC = match.group(4)
                urlImg = root_img_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC
                urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
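        # Hypothetical example: a file_name of "EP1234567A1" yields CC="EP",
        # NR="1234567A1" and KC="A1", which are interpolated into urlImg / urlPDF.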
        sentence_finder = InformationExtractor.sentence_finder
        # section = self.dataCleaner(section)
        # print(section)
        sentences = sentence_finder.tokenize(section.strip())

        with open(constants.ASSETS + "includeLinks") as concept:
            # next(concept)
            included_words = concept.read().splitlines()
        include_link_pattern = re.compile('|'.join(included_words))

        # open examplification word file
        with open(constants.ASSETS + "examplificationclues") as examplif:
            # next(concept)
            exam_words = examplif.read().splitlines()
        examplif_word_pattern = re.compile('|'.join(exam_words))

        description_sentences_number = len(sentences)
        number_of_words = 0
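        # The description is then processed sentence by sentence: sentences that
        # match the exclusion lexicon are dropped, and the survivors are routed
        # by comma count and by the inclusion / exemplification patterns below.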
        for sentence in sentences:
            # with open(constants.DATA + 'sentences.txt', 'a', encoding='utf8') as file_handler:
            #     for item in sentences:
            #         file_handler.write("{}\n".format(item))
            number_of_word = len(nltk.word_tokenize(sentence))
            number_of_words += number_of_word
            sentenced = self.discardLines(sentence, "exclusionList")
            if sentenced is not None:
                content.append(sentenced)
                # print("origine=> " + sentence)

        total_sentences_number = len(sentences)
        # mean_sentence_length = int(round(number_of_words / total_sentences_number))
        # print(mean_sentence_length)
        # Iterate over a snapshot of content, since lines containing commas are
        # removed from the list inside the loop.
        for line in list(content):
            line = self.selectLines(line, "inclusionList")
            if line is not None:
                if re.match(include_link_pattern, line):
                    include_links.append(line)
                    # print(line)
                if line.count(',') == 0:
                    output_content.append(line)
                    # content.remove(line)
                if line.count(',') > 0:
                    output_inter_content.append(line)
                    content.remove(line)
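        # Sentences remaining in content are now treated as "exemplification"
        # candidates: clauses that still contain exactly one comma are collected
        # as linked pairs (ex_output_content_linked), the rest as standalone
        # concepts (ex_output_content).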
        for s in content:
            # print(s, file_name)
            sentence = self.discardLines(s, "FilterS")
            if sentence is not None:
                if s.count(',') <= 2 and re.match(examplif_word_pattern, s.lower()):
                    s = str(s)
                    cs = s.lower()
                    cs = re.sub(examplif_word_pattern, '', cs)
                    cs = re.sub('which', 'this/these', cs)
                    cs = re.sub(r'\.$', '', cs)
                    # print(s)
                    if cs.count(',') == 1 and cs.count('such as') == 0:
                        ex_output_content_linked.append(cs)
                    else:
                        ex_output_content.append(cs)
                elif s.count(',') == 1:
                    s = str(s)
                    s = s.lower()
                    s = self.selectLines(s, "OneCommaDiscriminator")
                    if s is not None:
                        # s = re.sub('which', 'this/these', s)
                        # print(s)
                        s = re.sub(r'^thus, ', '', s)
                        s = re.sub(r'^preferably, ', '', s)
                        s = re.sub(r'^conventional ', '', s)
                        s = re.sub(r'^in particular, ', '', s)
                        s = re.sub(r'^specifically, ', '', s)
                        s = re.sub(r'^as necessary, ', '', s)
                        s = re.sub(', which', ',this/these', s)
                        s = re.sub(r'\.$', '', s)
                        if s.count(',') == 1:
                            ex_output_content_linked.append(s)
                        else:
                            ex_output_content.append(s)
                else:
                    pass

        print(len(ex_output_content_linked))
        ex_output_content_linked = list(set(ex_output_content_linked))
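        # Each linked sentence is split on commas into clauses; the first clause
        # is emitted with an "enfants" (child) id pointing at the next node and
        # later clauses carry a "parents" id, so consecutive compt_Id_ex values
        # chain the clauses of one sentence together.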
        for line in ex_output_content_linked:
            line = line.lower()
            if 'figure' not in line:
                # if line.count(',') <= 1:
                t_sline_ex = line.strip().split(',')
                # print("outpib" + str(t_sline_ex))
                for concept in t_sline_ex:
                    # print("outpib" + str(concept))
                    parameters_list = []
                    compteur = 0
                    compt_Id_ex += 1
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    # print(tagged)
                    tags = [word for word, pos in tagged
                            if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
                    if len(tags) < 1:
                        continue
                    # classifyT = SentenceClassifier(concept)
                    # polarite = classifyT.classifySentence()
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)
                    # if polarite == 'neutre':
                    #     classify = SentenceClassifier(concept)
                    #     polarite = classify.classifySentence()
                    #     print(concept)
                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()
                    parameters_list.extend(parameters)
                    # parameters_list = ", ".join(parameters_list)
                    # print("Index is: ")
                    # print(t_sline_ex.index(concept))
                    # print(concept)
                    clean_concept = self.last_cleansing(concept)
                    # if polarite == 'neutre':
                    #     words = word_tokenize(clean_concept)
                    #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
                    #     noise_trash.append(hit)
                    validity = self.discardLines(concept, 'referencing_indices')
                    if t_sline_ex.index(concept) == 0 and validity is not None:
                        previous_polarity = polarite
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "enfants": graphItemIdValue + str(compt_Id_ex + 1),
                                "id": graphItemIdValue + str(compt_Id_ex),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    else:
                        print("Previous polarity is : " + str(previous_polarity))
                        if previous_polarity == 'partialSolution' or validity is None:
                            continue
                        else:
                            compteur += 1
                            values = OrderedDict({
                                "concept": {
                                    "type": polarite,
                                    "parents": graphItemIdValue + str(compt_Id_ex - 1),
                                    "id": graphItemIdValue + str(compt_Id_ex),
                                    "sentence": clean_concept,
                                    "source": output_file_name,
                                    "parameters": parameters_list,
                                    "image": urlImg,
                                    "pdf": urlPDF
                                }
                            })
                    json_string_linked = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                    output_result.append(json_string_linked)

        # for line in output_content:
        #     print("include=> " + line)

        # just examplification sentences
        # make a function of that
        ex_output_content = list(set(ex_output_content))
        for concept in ex_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged
                    if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id_ex += 1
            # classify = SentenceClassifier(sline)
            # polarite = classify.classifySentence()
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)
            # if polarite == 'neutre':
            #     classify = SentenceClassifier(concept)
            #     polarite = classify.classifySentence()
            #     print(sline)
            # if polarite == 'partialSolution':
            #     print(sline)
            # Insert a classifier here
            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()
            clean_concept = self.last_cleansing(concept)
            parameters_list.extend(parameters)
            # if polarite == 'neutre':
            #     words = word_tokenize(clean_concept)
            #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
            #     noise_trash.append(hit)
            # parameters_list = ", ".join(parameters_list)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id_ex),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)
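        # Sentences that matched the "includeLinks" lexicon are now normalised:
        # leading connectives (however / if / when / since) are stripped so the
        # remaining clause can later be emitted as a standalone statement.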
        for line in include_links:
            # print(line)
            # Put in lower case to improve matching
            line = line.lower()
            if re.match(r'however', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'however|,', '', line)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'however', line) and line.count(',') > 1:
                sline = re.sub(r'^however,?(\s\w+)\s*, that ', '', line)
                # sline = re.sub(r'however,.+, that ', '', sline)
                sline = re.sub(r'^however,?(\s\w+)+\s(above), ', '', sline)
                sline = re.sub(r'^however,?\s\w+ed(\s\w+)+,\s*', '', sline)
                sline = re.sub(r'^however,?\sif\s(desired|said)\s*,\s', '', sline)
                sline = re.sub(r'^however,?\s(it)\s(will be appreciated)\s*,\s(that)+\s*', '', sline)
                sline = re.sub(r'^however,?\s(as|if|because|when|since)\s*(?!is)', '', sline)
                sline = re.sub(r'^however,?\s*', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'if', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'^if\s?(and when|not|desired|necessary)\s?,?\s*', '', line)
                sline = re.sub(r'^if,?\s*', '', sline)
                sline = re.sub(r'^if ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
                    # print(sline)
            if re.match(r'when', line):
                line = str(line)
                line = line.lower()
                sline = re.sub(r'^when\s*', '', line)
                sline = re.sub(r'^when,?\s*', '', sline)
                sline = re.sub(r'^when ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'(^since)|(^\w+\s?,\s?since\s?)', line):
                sline = re.sub(r'^since', '', line)
                sline = re.sub(r'^\w+\s?,\s?since\s?', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
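        # Comma-free "if ..." sentences kept earlier get the same treatment; the
        # stripped variants are added back to output_content for the final pass.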
        # Iterate over a snapshot, because stripped variants are appended to
        # output_content while it is being traversed.
        for line in list(output_content):
            line = line.lower()
            if re.match(r'if', line):
                line = str(line)
                sline = re.sub(r'^if ', '', line)
                if sline not in output_linked_content:
                    output_content.append(sline)
                    # output_content.remove(line)
        uniq_output_linked_content = list(set(output_linked_content))
        for line in uniq_output_linked_content:
            # print("long sentences => " + line)
            # print(line)
            line = line.lower()
            # Skip sentences that refer to figures instead of removing them from
            # the list while it is being iterated.
            if 'figure' in line:
                continue
            sline = re.sub(r'^\s+', '', line)
            sline = re.sub(r'^\d+\.+$', '', sline)
            if sline.count(',') <= 1:
                t_sline = tuple(sline.strip().split(', '))
                # print("outpib" + str(t_sline))
                for concept in t_sline:
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged
                            if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
                    if len(tags) < 1:
                        continue
                    parameters_list = []
                    compteur = 0
                    compt_Id += 1
                    # classifyT = SentenceClassifier(concept)
                    # polarite = classifyT.classifySentence()
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged if pos.startswith('V') or pos == 'JJR']
                    if len(tags) < 1:
                        continue
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)
                    # if polarite == 'neutre':
                    #     classify = SentenceClassifier(concept)
                    #     polarite = classify.classifySentence()
                    #     print(concept)
                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()
                    parameters_list.extend(parameters)
                    # parameters_list = ", ".join(parameters_list)
                    clean_concept = self.last_cleansing(concept)
                    validity = self.discardLines(concept, 'referencing_indices')
                    # if polarite == 'neutre':
                    #     words = word_tokenize(clean_concept)
                    #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
                    #     noise_trash.append(hit)
                    if t_sline.index(concept) == 0 and validity is not None:
                        previous_polarity = polarite
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "enfants": graphItemIdValue + str(compt_Id + 1),
                                "id": graphItemIdValue + str(compt_Id),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    else:
                        print("Previous polarity is : " + str(previous_polarity))
                        if previous_polarity == 'partialSolution' or validity is None:
                            continue
                        else:
                            compteur += 1
                            values = OrderedDict({
                                "concept": {
                                    "type": polarite,
                                    "parents": graphItemIdValue + str(compt_Id - 1),
                                    "id": graphItemIdValue + str(compt_Id),
                                    "sentence": clean_concept,
                                    "source": output_file_name,
                                    "parameters": parameters_list,
                                    "image": urlImg,
                                    "pdf": urlPDF
                                }
                            })
                    json_string_linked = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                    output_result.append(json_string_linked)
        uniq_output_content = list(set(output_content))
        # Drop near-duplicates: when two sentences have a Levenshtein similarity
        # above 0.7, keep only the longer one (iterate over snapshots so the
        # removals do not disturb the loops).
        for s in list(uniq_output_content):
            for y in list(uniq_output_content):
                if s != y and s in uniq_output_content and y in uniq_output_content:
                    result = Levenshtein.ratio(s, y)
                    if result > .7:
                        # print(s + " :IS SIMILAR TO: " + y)
                        if len(s) > len(y):
                            uniq_output_content.remove(y)
                        elif len(y) > len(s):
                            uniq_output_content.remove(s)
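        # Hypothetical example: Levenshtein.ratio("the pump overheats quickly",
        # "the pump overheats") is roughly 0.8, above the 0.7 threshold, so only
        # the longer sentence is kept.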
        for concept in uniq_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged
                    if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id += 1
            sline = re.sub(r'^if ', '', concept)
            sline = re.sub(r'^(if|preferably) ', '', sline)
            sline = re.sub(r'^\s+?said ', '', sline)
            # classify = SentenceClassifier(sline)
            # polarite = classify.classifySentence()
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)
            # if polarite == 'neutre':
            #     classify = SentenceClassifier(sline)
            #     polarite = classify.classifySentence()
            #     print(sline)
            # if polarite == 'partialSolution':
            #     print(sline)
            # Insert a classifier here
            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()
            parameters_list.extend(parameters)
            # parameters_list = ", ".join(parameters_list)
            clean_concept = self.last_cleansing(sline)
            # if polarite == 'neutre':
            #     words = word_tokenize(clean_concept)
            #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
            #     noise_trash.append(hit)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)
        output_result = list(set(output_result))
        output_json = ",".join(output_result)
        return output_json, total_sentences_number
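
# Minimal usage sketch (the path and publication name below are placeholders;
# the App.bin assets and NLTK models must be available locally):
#
#   extractor = InformationExtractor(description_text, "/data/project", "*.txt", "EP1234567A1")
#   graph_json, n_sentences = extractor.get_from_description()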