Spaces:
Runtime error
Runtime error
File size: 1,205 Bytes
d1ef404 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 |
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
import pymorphy2
class DataPreprocessor:
    """Tokenize and lemmatize Russian texts.

    The pipeline lower-cases each text, splits it with NLTK's
    ``WordPunctTokenizer``, drops stop tokens (Russian stopwords and
    punctuation) and replaces every remaining token with the normal form
    reported by pymorphy2.

    Attributes set by ``__init__``:
        morph: ``pymorphy2.MorphAnalyzer`` used for lemmatization.
        tokenizer: ``WordPunctTokenizer`` used for splitting texts.
        punctuation: set of the single characters in ``string.punctuation``.
        stopwords_russian: NLTK's Russian stopword list.
        stop_tokens: tokens removed before lemmatization, i.e. the stopwords
            (minus 'и', 'или', 'не') together with ``punctuation``.
    """

    def __init__(self):
        """Load the stopword corpus and build the tokenizer and analyzer."""
        # Only hit the network when the corpus is really missing; an
        # unconditional download contacts the NLTK index on every
        # instantiation, even when the data is already installed.
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.morph = pymorphy2.MorphAnalyzer()
        self.tokenizer = WordPunctTokenizer()
        self.punctuation = set(string.punctuation)
        self.stopwords_russian = stopwords.words("russian")
        # 'и', 'или', 'не' are kept in the text on purpose -- presumably
        # because conjunctions/negation matter downstream; confirm with
        # the consumer of the tokens.
        self.stop_tokens = (set(self.stopwords_russian) - {'и', 'или', 'не'}).union(self.punctuation)

    def _is_punctuation(self, token):
        """Return True if *token* is non-empty and made only of punctuation.

        The tokenizer emits runs of punctuation as a single token (for
        example "..." or "?!"), which an exact lookup in ``stop_tokens``
        (single characters only) would miss.
        """
        return bool(token) and all(char in self.punctuation for char in token)

    def tokenize_data(self, texts):
        """Split every text into lower-cased tokens.

        Args:
            texts: iterable of texts; each item is converted with ``str()``
                before being lower-cased and tokenized.

        Returns:
            A list with one list of tokens per input text.
        """
        return [self.tokenizer.tokenize(str(text).lower()) for text in texts]

    def lemmatize_tokens_string(self, tokens_string):
        """Filter and lemmatize the tokens of a single text.

        Args:
            tokens_string: iterable of tokens belonging to one text.

        Returns:
            A new list holding the normal form of every token that is
            neither in ``stop_tokens`` nor a run of punctuation characters.
            The normal form is taken from the first analysis that
            ``morph.parse`` returns for the token.
        """
        return [
            self.morph.parse(token)[0].normal_form
            for token in tokens_string
            if token not in self.stop_tokens and not self._is_punctuation(token)
        ]

    def lemmatize_tokens(self, tokens):
        """Lemmatize every token list of *tokens* in place.

        Args:
            tokens: mutable sequence of token lists; each element is
                replaced by its filtered, lemmatized version.

        Returns:
            None. The result is written back into *tokens*.
        """
        for index, tokens_string in enumerate(tokens):
            tokens[index] = self.lemmatize_tokens_string(tokens_string)

    def preprocess_texts(self, texts):
        """Run the whole pipeline: tokenize, filter and lemmatize.

        Args:
            texts: iterable of texts (see ``tokenize_data``).

        Returns:
            A list with one list of lemmas per input text.
        """
        tokens = self.tokenize_data(texts)
        self.lemmatize_tokens(tokens)
        return tokens
|