# Preprocessing pipeline for Turkish social-media text: HTML unescaping,
# Unicode normalization, e-mail/URL handling, hashtag/cashtag/mention
# handling, and emoji demojization.
import html
import re
import urllib.parse
from unicodedata import normalize

from urlextract import URLExtract

from .demojize import demojize


def hashtag_handler(text: str) -> str:
    # Strip the leading "#" but keep the tag text, padded with spaces.
    pattern = r"(#([^\s]+))"
    return re.sub(pattern, " \\2 ", text)


def cashtag_handler(text: str) -> str:
    # Strip the leading "$" but keep the ticker text, padded with spaces.
    pattern = r"(\$([^\s]+))"
    return re.sub(pattern, " \\2 ", text)


def mention_handler(text: str) -> str:
    # Replace every @mention with the anonymized token "@user".
    pattern = r"(@([^\s]+))"
    return re.sub(pattern, " @user ", text)


url_extractor = URLExtract()


def url_handler(text: str) -> str:
    # Replace each detected URL with just its domain (netloc).
    for url in url_extractor.gen_urls(text):
        # urlparse only fills netloc when a scheme is present, so add one if missing.
        normalized = url if url.startswith(("http://", "https://")) else f"https://{url}"
        domain = urllib.parse.urlparse(normalized).netloc
        text = text.replace(url, f" {domain} ")
    return text


def email_handler(text: str) -> str:
    # Remove e-mail addresses entirely.
    pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
    for match in re.findall(pattern, text):
        text = text.replace(match, " ").strip()
    return text


def emoji_handler(text: str) -> str:
    # Convert emojis to their Turkish textual names, padded with spaces.
    return demojize(text, language="tr", delimiters=(" ", " "))


def normalize_text(text: str) -> str:
    # Canonical Unicode composition (NFC).
    return normalize("NFC", text)


def preprocess(text: str) -> str:
    output = html.unescape(text)
    output = normalize_text(output)
    output = email_handler(output)
    output = url_handler(output)
    output = hashtag_handler(output)
    output = cashtag_handler(output)
    output = mention_handler(output)
    output = emoji_handler(output)
    output = re.sub(r"\s+", " ", output)  # collapse runs of whitespace
    output = output.lower()
    output = output.strip()
    return output


if __name__ == "__main__":
    sample_text = ""  # placeholder: supply your own sample text
    preprocessed_text = preprocess(sample_text)
    print(preprocessed_text)
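
# Usage sketch (the sample tweet below is hypothetical, not from this repo;
# it also assumes the local demojize leaves emoji-free text unchanged):
#
#   preprocess("Merhaba! #selam @ali $THYAO https://example.com/haber okuyun")
#
# drops the "#" and "$" markers, replaces the mention with "@user", reduces
# the URL to its domain, collapses whitespace, and lowercases, yielding
# roughly:
#
#   "merhaba! selam @user thyao example.com okuyun"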