AliNajafi's picture
Add application files
d856fda
import urllib
import html
import re
from urlextract import URLExtract
from unicodedata import normalize
from .demojize import demojize
def hashtag_handler(text: str):
pattern = r"(#([^\s]+))"
return re.sub(pattern, " <hashtag> \\2 </hashtag> ", text)
def cashtag_handler(text: str):
pattern = r"(\$([^\s]+))"
return re.sub(pattern, " <cashtag> \\2 </cashtag> ", text)
def mention_handler(text: str):
pattern = r"(@([^\s]+))"
return re.sub(pattern, " @user ", text)
url_extractor = URLExtract()
def url_handler(text: str):
urls = list(url_extractor.gen_urls(text))
updated_urls = list(
set([url if "http" in url else f"https://{url}" for url in urls])
)
domains = [urllib.parse.urlparse(url_text).netloc for url_text in updated_urls]
for i in range(len(domains)):
text = text.replace(urls[i], f" <http> {domains[i]} </http> ")
return text
def email_handler(text: str):
pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
match = re.findall(pattern, text)
for m in match:
text = text.replace(m, " <email> ").strip()
return text
def emoji_handler(text: str):
return demojize(text, language="tr", delimiters=(" <emoji> ", " </emoji> "))
def normalize_text(text: str):
return normalize("NFC", text)
def preprocess(text: str):
output = html.unescape(text)
output = normalize_text(output)
output = email_handler(output)
output = url_handler(output)
output = hashtag_handler(output)
output = cashtag_handler(output)
output = mention_handler(output)
output = emoji_handler(output)
output = re.sub(r"\s+", " ", output)
output = output.lower()
output = output.strip()
return output
if __name__ == "__main__":
sample_text = ""
preprocessed_text = preprocess(sample_text)
print(preprocessed_text)