Spaces:
Sleeping
Sleeping
import html
import re
import urllib
import urllib.parse
from unicodedata import normalize

from urlextract import URLExtract

from .demojize import demojize
def hashtag_handler(text: str):
    """Wrap every ``#tag`` in *text* in ``<hashtag>`` markers.

    A tag is the ``#`` sign plus the run of non-whitespace characters that
    follows it. The ``#`` is dropped and the remainder is emitted as
    `` <hashtag> tag </hashtag> `` (padded with one space on each side).
    Text without a hashtag is returned unchanged.
    """
    tag_regex = re.compile(r"(#([^\s]+))")
    # Group 2 is the tag body without the leading '#'.
    wrapped = " <hashtag> \\2 </hashtag> "
    return tag_regex.sub(wrapped, text)
def cashtag_handler(text: str):
    """Wrap every ``$symbol`` in *text* in ``<cashtag>`` markers.

    A cashtag is a ``$`` followed by at least one non-whitespace character.
    The ``$`` is dropped and the remainder is emitted as
    `` <cashtag> symbol </cashtag> `` (padded with one space on each side).
    A ``$`` followed directly by whitespace is left alone.
    """
    symbol_regex = re.compile(r"(\$([^\s]+))")
    # Group 2 is the symbol without the leading '$'.
    wrapped = " <cashtag> \\2 </cashtag> "
    return symbol_regex.sub(wrapped, text)
def mention_handler(text: str):
    """Anonymise every ``@mention`` in *text*.

    Each ``@`` followed by a run of non-whitespace characters is replaced,
    as a whole, by the fixed placeholder `` @user `` (padded with one space
    on each side). Text without a mention is returned unchanged.
    """
    mention_regex = re.compile(r"(@([^\s]+))")
    placeholder = " @user "
    return mention_regex.sub(placeholder, text)
# Shared extractor instance, created once at import time and reused by
# url_handler on every call.
url_extractor = URLExtract()

# Matches an explicit URL scheme prefix such as "http://" or "ftp://".
_URL_SCHEME_RE = re.compile(r"[A-Za-z][A-Za-z0-9+.-]*://")


def url_handler(text: str):
    """Replace every URL in *text* with `` <http> domain </http> ``.

    URLs are taken from ``url_extractor.gen_urls(text)``; each one is
    replaced by the network location (host, plus port if present) of that
    same URL, wrapped in ``<http>`` markers and padded with one space on
    each side.

    Args:
        text: Input string, possibly containing URLs.

    Returns:
        The text with each extracted URL replaced. If no URL is found the
        input is returned unchanged.
    """
    # De-duplicate up front; drop empty strings so the alternation built
    # below can never match the empty string.
    found = {url for url in url_extractor.gen_urls(text) if url}
    if not found:
        return text

    # Map each URL exactly as it appears in the text to its own domain, so
    # a URL can never be paired with another URL's domain.
    domain_by_url = {}
    for url in found:
        # urlparse only fills in netloc when a scheme is present, so give
        # scheme-less URLs ("example.com/x") a default one. A plain
        # '"http" in url' test misfires on hosts such as "httpbin.org".
        if _URL_SCHEME_RE.match(url):
            absolute = url
        else:
            absolute = f"https://{url}"
        domain_by_url[url] = urllib.parse.urlparse(absolute).netloc

    # Single pass with longest URLs first: a URL that is a prefix of another
    # cannot pre-empt the longer one, and text inserted by an earlier
    # replacement is never rescanned.
    alternation = "|".join(
        re.escape(url) for url in sorted(domain_by_url, key=len, reverse=True)
    )
    return re.sub(
        alternation,
        lambda hit: f" <http> {domain_by_url[hit.group(0)]} </http> ",
        text,
    )
def email_handler(text: str):
    """Replace every e-mail address in *text* with `` <email> ``.

    All addresses are substituted in a single regex pass, so an address
    that happens to be the tail of a longer one (``a@b.co`` inside
    ``xa@b.co``) cannot leave a fragment of the longer address behind.

    Args:
        text: Input string, possibly containing e-mail addresses.

    Returns:
        The text with each address replaced by `` <email> `` and with
        leading/trailing whitespace stripped. If no address is found the
        input is returned unchanged (and is not stripped).
    """
    pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
    replaced, hits = re.subn(pattern, " <email> ", text)
    # Stripping only when something was replaced keeps the no-match result
    # identical to the input.
    return replaced.strip() if hits else text
def emoji_handler(text: str):
    """Pass *text* through ``demojize`` with Turkish settings and emoji tags.

    Calls the project's ``demojize`` helper with ``language="tr"`` and the
    delimiter pair `` <emoji> `` / `` </emoji> ``, and returns its result.
    NOTE(review): presumably this swaps each emoji for its Turkish name
    wrapped in the delimiters -- confirm against ``demojize``.
    """
    emoji_delimiters = (" <emoji> ", " </emoji> ")
    return demojize(text, language="tr", delimiters=emoji_delimiters)
def normalize_text(text: str):
    """Return *text* converted to Unicode normalization form NFC."""
    form = "NFC"
    composed = normalize(form, text)
    return composed
def preprocess(text: str):
    """Run the full cleaning pipeline over *text* and return the result.

    The stages run in a fixed order: HTML entity unescaping, NFC
    normalization, then the e-mail, URL, hashtag, cashtag, mention and
    emoji handlers. Afterwards every whitespace run is collapsed to one
    space, the text is lower-cased, and surrounding whitespace is removed.
    """
    # Order matters: e.g. e-mail addresses are replaced before the mention
    # handler sees their '@'.
    stages = (
        html.unescape,
        normalize_text,
        email_handler,
        url_handler,
        hashtag_handler,
        cashtag_handler,
        mention_handler,
        emoji_handler,
    )
    cleaned = text
    for stage in stages:
        cleaned = stage(cleaned)
    return re.sub(r"\s+", " ", cleaned).lower().strip()
if __name__ == "__main__":
    # Smoke run: push a sample string (empty here) through the pipeline and
    # print the result.
    sample_text = ""
    print(preprocess(sample_text))