Remove unused code

transforms_cased.py (+2 -28)
@@ -160,14 +160,12 @@ class FilterPOS(BaseTextTransform):
     Args:
         tags (list): List of POS tags to remove.
         engine (str): POS tagger to use. Must be one of "nltk" or "flair". Defaults to "nltk".
-        keep_compound_nouns (bool): Whether to keep composed words. Defaults to True.
     """
 
-    def __init__(self, tags: list, engine: str = "nltk", keep_compound_nouns: bool = True) -> None:
+    def __init__(self, tags: list, engine: str = "nltk") -> None:
         super().__init__()
         self.tags = tags
         self.engine = engine
-        self.keep_compound_nouns = keep_compound_nouns
 
         if engine == "nltk":
             nltk.download("averaged_perceptron_tagger", quiet=True)
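With this change, FilterPOS accepts only `tags` and `engine`; passing `keep_compound_nouns` now raises a TypeError. A minimal construction sketch under the new signature (the import path and the sample tag list are assumptions, not part of this diff):

```python
# Hypothetical import path; adjust to wherever transforms_cased.py lives.
from transforms_cased import FilterPOS

# Sample Penn Treebank noun tags, as produced by both the NLTK
# "averaged_perceptron_tagger" and the flair "pos" tagger.
pos_tags = ["NN", "NNS", "NNP", "NNPS"]

# New two-argument constructor; keep_compound_nouns is no longer accepted.
filter_pos = FilterPOS(tags=pos_tags, engine="nltk")
```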
@@ -189,30 +187,6 @@
         self.tagger(sentence)
         text = " ".join([token.text for token in sentence.tokens if token.tag in self.tags])
 
-        if self.keep_compound_nouns:
-            compound_nouns = []
-
-            if self.engine == "nltk":
-                for i in range(len(word_tags) - 1):
-                    if word_tags[i][1] == "NN" and word_tags[i + 1][1] == "NN":
-                        # if they are the same word, skip
-                        if word_tags[i][0] == word_tags[i + 1][0]:
-                            continue
-
-                        compound_noun = word_tags[i][0] + "_" + word_tags[i + 1][0]
-                        compound_nouns.append(compound_noun)
-            elif self.engine == "flair":
-                for i in range(len(sentence.tokens) - 1):
-                    if sentence.tokens[i].tag == "NN" and sentence.tokens[i + 1].tag == "NN":
-                        # if they are the same word, skip
-                        if sentence.tokens[i].text == sentence.tokens[i + 1].text:
-                            continue
-
-                        compound_noun = sentence.tokens[i].text + "_" + sentence.tokens[i + 1].text
-                        compound_nouns.append(compound_noun)
-
-            text = " ".join([text, " ".join(compound_nouns)])
-
         return text
 
     def __repr__(self) -> str:
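For reference, the deleted branch built "compound nouns" by joining adjacent, distinct NN-tagged tokens with an underscore and appending them to the filtered text. A standalone sketch of that pairing logic, kept only to document the removed behavior (the function name and sample data are illustrative, not part of the codebase):

```python
def join_adjacent_nouns(word_tags: list) -> list:
    """Mirror of the removed logic: join neighbouring, distinct NN tokens with '_'."""
    compound_nouns = []
    for i in range(len(word_tags) - 1):
        if word_tags[i][1] == "NN" and word_tags[i + 1][1] == "NN":
            # if they are the same word, skip (as in the deleted code)
            if word_tags[i][0] == word_tags[i + 1][0]:
                continue
            compound_nouns.append(word_tags[i][0] + "_" + word_tags[i + 1][0])
    return compound_nouns


# ("traffic", "NN") followed by ("light", "NN") yields "traffic_light".
print(join_adjacent_nouns([("traffic", "NN"), ("light", "NN"), ("red", "JJ")]))
```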
@@ -396,7 +370,7 @@ def default_vocabulary_transforms() -> TextCompose:
     transforms.append(ToSingular())
     transforms.append(DropWords(words=words_to_drop))
     transforms.append(FrequencyMinWordCount(min_count=2))
-    transforms.append(FilterPOS(tags=pos_tags, engine="flair"
+    transforms.append(FilterPOS(tags=pos_tags, engine="flair"))
     transforms.append(RemoveDuplicates())
 
     transforms = TextCompose(transforms)
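The call site in default_vocabulary_transforms() is updated to match the new signature. A sketch of how the rebuilt pipeline might be used, with the caveat that the import path and the way a TextCompose is applied are assumptions (neither is shown in this diff):

```python
# Hypothetical usage; import path and call convention are assumptions.
from transforms_cased import default_vocabulary_transforms

# Build the default vocabulary cleanup pipeline (now constructing
# FilterPOS(tags=pos_tags, engine="flair") internally).
vocab_transforms = default_vocabulary_transforms()

# Assuming the composed transforms are applied by calling the TextCompose
# object on a list of candidate vocabulary words.
vocabulary = vocab_transforms(["traffic lights", "traffic", "lights", "cars"])
print(vocabulary)
```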