Spaces:
Build error
Build error
| # -*- coding: utf-8 -*- | |
| import re | |
| import nltk | |
| import json | |
| from App.bin import constants | |
class FiguresCleaner(object):
    """Remove figure/reference numerals from a block of text.

    A token is dropped when NLTK tags it as a cardinal number (``CD``), it
    contains at least one digit, and neither of its neighbours matches one
    of the context words listed in the ``wordBeforeNumber`` and
    ``wordAfterNumber`` asset files.  Every other token is kept.
    """

    def __init__(self, sections):
        """Store the text to clean.

        Args:
            sections: Raw text; it is handed to ``nltk.word_tokenize`` as is.
        """
        self.sections = sections

    @staticmethod
    def _load_word_pattern(asset_name):
        """Compile the lines of an asset file into one alternation regex.

        Args:
            asset_name: File name appended to ``constants.ASSETS``.

        Returns:
            A compiled pattern with one alternative per non-blank line.  Each
            line is used as a regular expression, exactly as written.
        """
        with open(constants.ASSETS + asset_name, 'r') as handle:
            # A blank line would add an empty alternative, which matches
            # every token and would silently disable the cleaning.
            words = [line for line in handle.read().splitlines()
                     if line.strip()]
        if not words:
            # No context words configured: use a pattern that never matches.
            return re.compile(r'(?!)')
        return re.compile('|'.join(words))

    def clean_figures(self):
        """Return the tokens of the text with figure numbers removed.

        Returns:
            list: Each kept token followed by a single space, in order, with
            a newline string in place of the final token.  The list is empty
            when the text yields no tokens.
        """
        after_words_pattern = self._load_word_pattern("wordAfterNumber")
        before_words_pattern = self._load_word_pattern("wordBeforeNumber")

        tagged_words = nltk.pos_tag(nltk.word_tokenize(self.sections))
        last_index = len(tagged_words) - 1

        clean_content = []
        for i, (current_word, current_tag) in enumerate(tagged_words):
            if i == last_index:
                # NOTE(review): the final token is replaced by a newline
                # instead of being examined (it has no following word).
                # Kept as-is because callers may rely on the terminator;
                # confirm whether dropping the last token is intended.
                clean_content.append("\n")
                continue

            next_word = tagged_words[i + 1][0]
            is_figure_number = (
                current_tag == 'CD'
                and re.search(r'\d', current_word) is not None
                and after_words_pattern.match(next_word) is None
                # The first token has no predecessor; do not let index -1
                # wrap around to the last token of the text.
                and (i == 0
                     or before_words_pattern.match(
                         tagged_words[i - 1][0]) is None)
            )
            if not is_figure_number:
                clean_content.append(current_word + " ")

        return clean_content