# -*- coding: utf-8 -*-
# java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer --port 8080
import nltk
nltk.download('all')

import os
import re
import json
import hashlib
import Levenshtein
import uuid

from App.bin import constants
from collections import OrderedDict
from nltk import word_tokenize
from App.bin.SharpClassifier import SharpClassifier
from App.bin.ClassifierWithIncr import ClassifyWithIncr_it
from App.bin.SentenceClassifier import SentenceClassifier
from App.bin.ParameterExtractor import ParameterExtractor
class InformationExtractor(object):
    patent_abbreviations = open(constants.ASSETS + "abbreviation_sentence_splitter").read().split()
    sentence_finder = nltk.data.load('tokenizers/punkt/english.pickle')
    sentence_finder._params.abbrev_types.update(patent_abbreviations)
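    # Note: the class-level attributes above are shared by every instance; the
    # punkt model is loaded once and extended with the abbreviations read from
    # the ASSETS folder so those tokens are not treated as sentence boundaries.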

    def __init__(self, section, input_folder, file_extension, file_name):
        self.section = section
        self.input_folder = input_folder
        self.file_extension = file_extension
        self.file_name = file_name
        print("Extracting problem graph")
    # @staticmethod
    def discardLines(self, line, lexic):
        # Return the line unless it contains a word from the exclusion lexicon,
        # in which case None is returned (the line is discarded).
        with open(constants.ASSETS + lexic) as m:
            exclusion_list = m.read().splitlines()
        if not any(word in line for word in exclusion_list):
            return line

    def selectLines(self, line, lexic):
        # Return the line only if it contains a word from the inclusion lexicon.
        with open(constants.ASSETS + lexic) as n:
            inclusion_list = n.read().splitlines()
        if any(word in line for word in inclusion_list):
            return line
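    # Hypothetical illustration: if the exclusion lexicon contains "figure",
    # discardLines("as shown in figure 2, ...", "exclusionList") returns None,
    # whereas selectLines(...) only passes lines containing an inclusion word.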
    def last_cleansing(self, concept):
        # Normalise an extracted clause: lower-case it, drop a trailing period
        # and strip leading discourse cues left over from the sentence splits.
        concept = str(concept)
        concept = concept.lower()
        if concept.endswith("."):
            concept = concept.strip(".")
        concept = re.sub(r'^consequently ', '', concept)
        concept = re.sub(r'^such ', '', concept)
        concept = re.sub(r'^said ', '', concept)
        concept = re.sub(r'^\s+', '', concept)
        concept = re.sub(r'^it is worth noting that ', '', concept)
        concept = re.sub(r'^example of ', '', concept)
        concept = re.sub(r'^since ', '', concept)
        concept = re.sub(r'^\( |\)$', '', concept)
        return concept
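    # Illustration: last_cleansing("Consequently the pump overheats.")
    # returns "the pump overheats".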
    # def get_from_claims(self):
    #
    #     section = self.section
    #     content = []
    #     sentence_finder = InformationExtractor.sentence_finder
    #     sentences = sentence_finder.tokenize(section.strip())
    #     with open(constants.ASSETS + "getFromClaims") as concept:
    #         # next(concept)
    #         included_words = concept.read().splitlines()
    #     include_link_pattern = re.compile('|'.join(included_words))
    def get_from_description(self):
        previous_polarity = ''
        noise_trash = []
        content = []
        include_links = []
        output_content = []
        ex_output_content = []
        output_result = []
        output_linked_content = []
        output_inter_content = []
        uniq_output_linked_content = []
        ex_output_content_linked = []

        section = self.section
        input_folder = self.input_folder
        file_name = self.file_name
        file_extension = self.file_extension
        projectFolder = os.path.basename(os.path.normpath(input_folder))
        output_file_name = input_folder + "/" + file_name + file_extension.strip("*")

        # Node ids are prefixed with a random UUID; the md5 digest of the file
        # name was computed here originally but is superseded by the UUID.
        # graphItemIdValue = hashlib.md5(file_name.encode()).hexdigest()
        graphItemIdValue = str(uuid.uuid4())

        t_sline = ""
        t_sline_ex = []
        compt_Id = 30
        compt_Id_ex = 40
        root_img_url = 'https://worldwide.espacenet.com/espacenetImage.jpg?flavour=firstPageClipping&locale=en_EP&FT=D&'
        root_pdf_url = 'https://worldwide.espacenet.com/publicationDetails/originalDocument?'

        # Build the Espacenet image / PDF links from the publication file name.
        urlImg = root_img_url
        urlPDF = root_pdf_url
        if file_name is not None:
            match = re.search(r'(^[a-zA-Z]+)(([0-9]+)\s?([a-zA-Z0-9_]+$))', file_name)
            if match is not None:
                # CC for country code
                CC = match.group(1)
                # NR for Number
                NR = match.group(2)
                NR = re.sub(r'\s', '', NR)
                # KC for Kind code
                KC = match.group(4)
                urlImg = root_img_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC
                urlPDF = root_pdf_url + 'CC=' + CC + '&NR=' + NR + '&KC=' + KC + '&FT=D&ND=3&date=' + '&DB=&locale=en_EP#'
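        # Hypothetical example: a file_name of "EP1234567A1" yields CC="EP",
        # NR="1234567A1" and KC="A1", which are interpolated into urlImg / urlPDF.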
        sentence_finder = InformationExtractor.sentence_finder
        # section = self.dataCleaner(section)
        # print(section)
        sentences = sentence_finder.tokenize(section.strip())

        with open(constants.ASSETS + "includeLinks") as concept:
            # next(concept)
            included_words = concept.read().splitlines()
        include_link_pattern = re.compile('|'.join(included_words))

        # open examplification word file
        with open(constants.ASSETS + "examplificationclues") as examplif:
            # next(concept)
            exam_words = examplif.read().splitlines()
        examplif_word_pattern = re.compile('|'.join(exam_words))

        description_sentences_number = len(sentences)
        number_of_words = 0
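        # The description is then processed sentence by sentence: sentences that
        # match the exclusion lexicon are dropped, and the survivors are routed
        # by comma count and by the inclusion / exemplification patterns below.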
        for sentence in sentences:
            # with open(constants.DATA + 'sentences.txt', 'a', encoding='utf8') as file_handler:
            #     for item in sentences:
            #         file_handler.write("{}\n".format(item))
            number_of_word = len(nltk.word_tokenize(sentence))
            number_of_words += number_of_word
            sentenced = self.discardLines(sentence, "exclusionList")
            if sentenced is not None:
                content.append(sentenced)
                # print("origine=> " + sentence)

        total_sentences_number = len(sentences)
        # mean_sentence_length = int(round(number_of_words / total_sentences_number))
        # print(mean_sentence_length)
        # Iterate over a snapshot of content, since lines containing commas are
        # removed from the list inside the loop.
        for line in list(content):
            line = self.selectLines(line, "inclusionList")
            if line is not None:
                if re.match(include_link_pattern, line):
                    include_links.append(line)
                    # print(line)
                if line.count(',') == 0:
                    output_content.append(line)
                    # content.remove(line)
                if line.count(',') > 0:
                    output_inter_content.append(line)
                    content.remove(line)
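        # Sentences remaining in content are now treated as "exemplification"
        # candidates: clauses that still contain exactly one comma are collected
        # as linked pairs (ex_output_content_linked), the rest as standalone
        # concepts (ex_output_content).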
        for s in content:
            # print(s, file_name)
            sentence = self.discardLines(s, "FilterS")
            if sentence is not None:
                if s.count(',') <= 2 and re.match(examplif_word_pattern, s.lower()):
                    s = str(s)
                    cs = s.lower()
                    cs = re.sub(examplif_word_pattern, '', cs)
                    cs = re.sub('which', 'this/these', cs)
                    cs = re.sub(r'\.$', '', cs)
                    # print(s)
                    if cs.count(',') == 1 and cs.count('such as') == 0:
                        ex_output_content_linked.append(cs)
                    else:
                        ex_output_content.append(cs)
                elif s.count(',') == 1:
                    s = str(s)
                    s = s.lower()
                    s = self.selectLines(s, "OneCommaDiscriminator")
                    if s is not None:
                        # s = re.sub('which', 'this/these', s)
                        # print(s)
                        s = re.sub(r'^thus, ', '', s)
                        s = re.sub(r'^preferably, ', '', s)
                        s = re.sub(r'^conventional ', '', s)
                        s = re.sub(r'^in particular, ', '', s)
                        s = re.sub(r'^specifically, ', '', s)
                        s = re.sub(r'^as necessary, ', '', s)
                        s = re.sub(', which', ',this/these', s)
                        s = re.sub(r'\.$', '', s)
                        if s.count(',') == 1:
                            ex_output_content_linked.append(s)
                        else:
                            ex_output_content.append(s)
                else:
                    pass

        print(len(ex_output_content_linked))
        ex_output_content_linked = list(set(ex_output_content_linked))
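        # Each linked sentence is split on commas into clauses; the first clause
        # is emitted with an "enfants" (child) id pointing at the next node and
        # later clauses carry a "parents" id, so consecutive compt_Id_ex values
        # chain the clauses of one sentence together.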
        for line in ex_output_content_linked:
            line = line.lower()
            if 'figure' not in line:
                # if line.count(',') <= 1:
                t_sline_ex = line.strip().split(',')
                # print("outpib" + str(t_sline_ex))
                for concept in t_sline_ex:
                    # print("outpib" + str(concept))
                    parameters_list = []
                    compteur = 0
                    compt_Id_ex += 1
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    # print(tagged)
                    tags = [word for word, pos in tagged
                            if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
                    if len(tags) < 1:
                        continue
                    # classifyT = SentenceClassifier(concept)
                    # polarite = classifyT.classifySentence()
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)
                    # if polarite == 'neutre':
                    #     classify = SentenceClassifier(concept)
                    #     polarite = classify.classifySentence()
                    #     print(concept)
                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()
                    parameters_list.extend(parameters)
                    # parameters_list = ", ".join(parameters_list)
                    # print("Index is: ")
                    # print(t_sline_ex.index(concept))
                    # print(concept)
                    clean_concept = self.last_cleansing(concept)
                    # if polarite == 'neutre':
                    #     words = word_tokenize(clean_concept)
                    #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
                    #     noise_trash.append(hit)
                    validity = self.discardLines(concept, 'referencing_indices')
                    if t_sline_ex.index(concept) == 0 and validity is not None:
                        previous_polarity = polarite
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "enfants": graphItemIdValue + str(compt_Id_ex + 1),
                                "id": graphItemIdValue + str(compt_Id_ex),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    else:
                        print("Previous polarity is : " + str(previous_polarity))
                        if previous_polarity == 'partialSolution' or validity is None:
                            continue
                        else:
                            compteur += 1
                            values = OrderedDict({
                                "concept": {
                                    "type": polarite,
                                    "parents": graphItemIdValue + str(compt_Id_ex - 1),
                                    "id": graphItemIdValue + str(compt_Id_ex),
                                    "sentence": clean_concept,
                                    "source": output_file_name,
                                    "parameters": parameters_list,
                                    "image": urlImg,
                                    "pdf": urlPDF
                                }
                            })
                    json_string_linked = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                    output_result.append(json_string_linked)

        # for line in output_content:
        #     print("include=> " + line)

        # just examplification sentences
        # make a function of that
        ex_output_content = list(set(ex_output_content))
        for concept in ex_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged
                    if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id_ex += 1
            # classify = SentenceClassifier(sline)
            # polarite = classify.classifySentence()
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)
            # if polarite == 'neutre':
            #     classify = SentenceClassifier(concept)
            #     polarite = classify.classifySentence()
            #     print(sline)
            # if polarite == 'partialSolution':
            #     print(sline)
            # Insert a classifier here
            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()
            clean_concept = self.last_cleansing(concept)
            parameters_list.extend(parameters)
            # if polarite == 'neutre':
            #     words = word_tokenize(clean_concept)
            #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
            #     noise_trash.append(hit)
            # parameters_list = ", ".join(parameters_list)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id_ex),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)
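        # Sentences that matched the "includeLinks" lexicon are now normalised:
        # leading connectives (however / if / when / since) are stripped so the
        # remaining clause can later be emitted as a standalone statement.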
        for line in include_links:
            # print(line)
            # Put in lower case to improve matching
            line = line.lower()
            if re.match(r'however', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'however|,', '', line)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'however', line) and line.count(',') > 1:
                sline = re.sub(r'^however,?(\s\w+)\s*, that ', '', line)
                # sline = re.sub(r'however,.+, that ', '', sline)
                sline = re.sub(r'^however,?(\s\w+)+\s(above), ', '', sline)
                sline = re.sub(r'^however,?\s\w+ed(\s\w+)+,\s*', '', sline)
                sline = re.sub(r'^however,?\sif\s(desired|said)\s*,\s', '', sline)
                sline = re.sub(r'^however,?\s(it)\s(will be appreciated)\s*,\s(that)+\s*', '', sline)
                sline = re.sub(r'^however,?\s(as|if|because|when|since)\s*(?!is)', '', sline)
                sline = re.sub(r'^however,?\s*', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'if', line) and line.count(',') <= 1:
                line = str(line)
                sline = re.sub(r'^if\s?(and when|not|desired|necessary)\s?,?\s*', '', line)
                sline = re.sub(r'^if,?\s*', '', sline)
                sline = re.sub(r'^if ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
                    # print(sline)
            if re.match(r'when', line):
                line = str(line)
                line = line.lower()
                sline = re.sub(r'^when\s*', '', line)
                sline = re.sub(r'^when,?\s*', '', sline)
                sline = re.sub(r'^when ', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
            if re.match(r'(^since)|(^\w+\s?,\s?since\s?)', line):
                sline = re.sub(r'^since', '', line)
                sline = re.sub(r'^\w+\s?,\s?since\s?', '', sline)
                if sline not in output_linked_content:
                    output_linked_content.append(sline)
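        # Comma-free "if ..." sentences kept earlier get the same treatment; the
        # stripped variants are added back to output_content for the final pass.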
        # Iterate over a snapshot, because stripped variants are appended to
        # output_content while it is being traversed.
        for line in list(output_content):
            line = line.lower()
            if re.match(r'if', line):
                line = str(line)
                sline = re.sub(r'^if ', '', line)
                if sline not in output_linked_content:
                    output_content.append(sline)
                    # output_content.remove(line)
        uniq_output_linked_content = list(set(output_linked_content))
        for line in uniq_output_linked_content:
            # print("long sentences => " + line)
            # print(line)
            line = line.lower()
            # Skip sentences that refer to figures instead of removing them from
            # the list while it is being iterated.
            if 'figure' in line:
                continue
            sline = re.sub(r'^\s+', '', line)
            sline = re.sub(r'^\d+\.+$', '', sline)
            if sline.count(',') <= 1:
                t_sline = tuple(sline.strip().split(', '))
                # print("outpib" + str(t_sline))
                for concept in t_sline:
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged
                            if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
                    if len(tags) < 1:
                        continue
                    parameters_list = []
                    compteur = 0
                    compt_Id += 1
                    # classifyT = SentenceClassifier(concept)
                    # polarite = classifyT.classifySentence()
                    tagged = nltk.pos_tag(word_tokenize(concept))
                    tags = [word for word, pos in tagged if pos.startswith('V') or pos == 'JJR']
                    if len(tags) < 1:
                        continue
                    classifyT = ClassifyWithIncr_it()
                    polarite = classifyT.main(concept)
                    # if polarite == 'neutre':
                    #     classify = SentenceClassifier(concept)
                    #     polarite = classify.classifySentence()
                    #     print(concept)
                    get_parameters = ParameterExtractor(concept)
                    parameters = get_parameters.extract_parameters()
                    parameters_list.extend(parameters)
                    # parameters_list = ", ".join(parameters_list)
                    clean_concept = self.last_cleansing(concept)
                    validity = self.discardLines(concept, 'referencing_indices')
                    # if polarite == 'neutre':
                    #     words = word_tokenize(clean_concept)
                    #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
                    #     noise_trash.append(hit)
                    if t_sline.index(concept) == 0 and validity is not None:
                        previous_polarity = polarite
                        values = OrderedDict({
                            "concept": {
                                "type": polarite,
                                "enfants": graphItemIdValue + str(compt_Id + 1),
                                "id": graphItemIdValue + str(compt_Id),
                                "sentence": clean_concept,
                                "source": output_file_name,
                                "parameters": parameters_list,
                                "image": urlImg,
                                "pdf": urlPDF
                            }
                        })
                    else:
                        print("Previous polarity is : " + str(previous_polarity))
                        if previous_polarity == 'partialSolution' or validity is None:
                            continue
                        else:
                            compteur += 1
                            values = OrderedDict({
                                "concept": {
                                    "type": polarite,
                                    "parents": graphItemIdValue + str(compt_Id - 1),
                                    "id": graphItemIdValue + str(compt_Id),
                                    "sentence": clean_concept,
                                    "source": output_file_name,
                                    "parameters": parameters_list,
                                    "image": urlImg,
                                    "pdf": urlPDF
                                }
                            })
                    json_string_linked = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                    output_result.append(json_string_linked)
        uniq_output_content = list(set(output_content))
        # Drop near-duplicates: when two sentences have a Levenshtein similarity
        # above 0.7, keep only the longer one (iterate over snapshots so the
        # removals do not disturb the loops).
        for s in list(uniq_output_content):
            for y in list(uniq_output_content):
                if s != y and s in uniq_output_content and y in uniq_output_content:
                    result = Levenshtein.ratio(s, y)
                    if result > .7:
                        # print(s + " :IS SIMILAR TO: " + y)
                        if len(s) > len(y):
                            uniq_output_content.remove(y)
                        elif len(y) > len(s):
                            uniq_output_content.remove(s)
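        # Hypothetical example: Levenshtein.ratio("the pump overheats quickly",
        # "the pump overheats") is roughly 0.8, above the 0.7 threshold, so only
        # the longer sentence is kept.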
        for concept in uniq_output_content:
            tagged = nltk.pos_tag(word_tokenize(concept))
            tags = [word for word, pos in tagged
                    if pos in ('VBZ', 'VBP', 'VBG', 'MD', 'JJR')]
            if len(tags) < 1:
                continue
            parameters_list = []
            concept = concept.lower()
            compt_Id += 1
            sline = re.sub(r'^if ', '', concept)
            sline = re.sub(r'^(if|preferably) ', '', sline)
            sline = re.sub(r'^\s+?said ', '', sline)
            # classify = SentenceClassifier(sline)
            # polarite = classify.classifySentence()
            classifyT = ClassifyWithIncr_it()
            polarite = classifyT.main(concept)
            # if polarite == 'neutre':
            #     classify = SentenceClassifier(sline)
            #     polarite = classify.classifySentence()
            #     print(sline)
            # if polarite == 'partialSolution':
            #     print(sline)
            # Insert a classifier here
            get_parameters = ParameterExtractor(concept)
            parameters = get_parameters.extract_parameters()
            parameters_list.extend(parameters)
            # parameters_list = ", ".join(parameters_list)
            clean_concept = self.last_cleansing(sline)
            # if polarite == 'neutre':
            #     words = word_tokenize(clean_concept)
            #     hit = ' '.join([word + '/' + pos for word, pos in nltk.pos_tag(words)])
            #     noise_trash.append(hit)
            validity = self.discardLines(concept, 'referencing_indices')
            if polarite != 'partialSolution' and validity is not None:
                values = OrderedDict({
                    "concept": {
                        "type": polarite,
                        "id": graphItemIdValue + str(compt_Id),
                        "sentence": clean_concept,
                        "source": output_file_name,
                        "parameters": parameters_list,
                        "image": urlImg,
                        "pdf": urlPDF
                    }
                })
                json_string = json.dumps(values, sort_keys=True, indent=4, separators=(',', ': '))
                output_result.append(json_string)
        output_result = list(set(output_result))
        output_json = ",".join(output_result)
        return output_json, total_sentences_number
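
# Minimal usage sketch (the path and publication name below are placeholders;
# the App.bin assets and NLTK models must be available locally):
#
#   extractor = InformationExtractor(description_text, "/data/project", "*.txt", "EP1234567A1")
#   graph_json, n_sentences = extractor.get_from_description()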