"""Helpers for loading raw text corpora and turning them into padded token lists."""

import os
import re
import unicodedata


def strip_accents(text: str) -> str:
    """Return *text* with combining accent marks removed.

    The text is decomposed with Unicode NFD normalisation so that accented
    characters become a base character followed by combining marks; every
    character in the ``Mn`` (nonspacing mark) category is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')


def load_raw_text(corpus_directory: str, file_names=None) -> str:
    """Load every ``.txt`` file directly inside a directory into one string.

    Sub-directories and files whose name does not end in ``.txt`` are skipped.
    Files are read as UTF-8, in sorted name order so the result is
    deterministic, and each file's contents are followed by a newline.

    Args:
        corpus_directory: Directory to scan (not searched recursively).
        file_names: Currently unused; accepted so existing callers keep
            working.

    Returns:
        The concatenated contents, or ``""`` if no text file was found.
    """
    parts = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(corpus_directory)):
        file_path = os.path.join(corpus_directory, file_name)
        if os.path.isdir(file_path):
            continue
        # Make sure we only read text files. Test the suffix rather than a
        # substring so that e.g. "notes.txt.bak" is not picked up.
        if not file_name.endswith(".txt"):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            parts.append(file.read() + "\n")
    return "".join(parts)


def load_single_raw_text_file(file_name):
    """Load a single UTF-8 text file and return its contents plus a newline.

    Args:
        file_name: Path of the file to read.

    Returns:
        The file contents followed by ``"\\n"``.
    """
    with open(file_name, 'r', encoding='utf-8') as file:
        return file.read() + "\n"


# A word is a run of word characters and apostrophes. Inside a character class
# "|" is a literal pipe, not alternation, so it must not be listed here.
word_regex = r"[\w']+"

# A token is kept by preprocess() when it *starts* with a word/apostrophe run
# or with one of the punctuation marks . , ? !
_TOKEN_START_RE = re.compile(r"[\w']+|[.,?!]")

# REPLACE IF YOUR LANGUAGE USES DIFFERENT END PUNCTUATION
_END_PUNCTUATION = (".", "?", "!")


def tokenize(text):
    """Return all words (runs of word characters and apostrophes) in *text*."""
    return re.findall(word_regex, text)


def preprocess(text):
    """Tokenize and filter text whose words are already separated by spaces.

    Designed for English punctuation. Accents are stripped, the text is
    lower-cased and then split on single space characters only (other
    whitespace such as newlines stays inside the tokens).

    A token is kept when it begins with a word character, an apostrophe or
    one of ``. , ? !``; empty tokens and tokens that begin with any other
    character are dropped.

    NOTE: only the start of each token is checked, so a token such as
    ``"word#"`` is kept. That matches the historical behaviour and is
    preserved for backward compatibility.

    Args:
        text: The raw text to process.

    Returns:
        The list of surviving tokens, in their original order.
    """
    normalized = strip_accents(text).lower()
    return [token for token in normalized.split(" ")
            if _TOKEN_START_RE.match(token)]


def pad(text: list, num_padding: int):
    """Pad a token list with ``<s>`` markers around sentences.

    ``num_padding`` ``<s>`` tokens are placed at the very start, and another
    ``num_padding`` are inserted *after* every end-of-sentence punctuation
    token (``.``, ``?`` or ``!``).

    Args:
        text: The tokens to pad, as a list of strings.
        num_padding: How many ``<s>`` tokens to insert at each position.

    Returns:
        A new list; the input list is not modified.
    """
    padding = ["<s>"] * num_padding
    # Initial padding before the first sentence.
    padded_text = list(padding)
    for word in text:
        padded_text.append(word)
        # An end punctuation mark closes a sentence, so the padding for the
        # next sentence goes right after it.
        if word in _END_PUNCTUATION:
            padded_text.extend(padding)
    return padded_text