Spaces:
Sleeping
Sleeping
File size: 1,924 Bytes
d856fda |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 |
import urllib
import html
import re
from urlextract import URLExtract
from unicodedata import normalize
from .demojize import demojize
def hashtag_handler(text: str) -> str:
    """Wrap each hashtag in ``<hashtag>`` markers and drop its ``#`` sign.

    A hashtag is a ``#`` immediately followed by one or more non-whitespace
    characters. The replacement is padded with a space on both sides.
    """
    hashtag_re = re.compile(r"(#([^\s]+))")
    # Group 2 holds the tag body without the leading "#".
    return hashtag_re.sub(" <hashtag> \\2 </hashtag> ", text)
def cashtag_handler(text: str) -> str:
    """Wrap each cashtag in ``<cashtag>`` markers and drop its ``$`` sign.

    A cashtag is a ``$`` immediately followed by one or more non-whitespace
    characters. The replacement is padded with a space on both sides.
    """
    cashtag_re = re.compile(r"(\$([^\s]+))")
    # Group 2 holds the symbol without the leading "$".
    return cashtag_re.sub(" <cashtag> \\2 </cashtag> ", text)
def mention_handler(text: str) -> str:
    """Replace each ``@mention`` with the generic token ``@user``.

    A mention is an ``@`` immediately followed by one or more non-whitespace
    characters. The replacement is padded with a space on both sides.
    """
    mention_re = re.compile(r"(@([^\s]+))")
    return mention_re.sub(" @user ", text)
# Module-level URLExtract instance, created once at import time and reused by
# url_handler on every call.
url_extractor = URLExtract()
def _url_domain(url: str) -> str:
    """Return the network location (``host[:port]``) of *url*.

    URLs without an explicit ``scheme://`` prefix get ``https://`` prepended
    first, because ``urlparse`` only fills ``netloc`` when the authority part
    is introduced by ``//``.
    """
    # Imported locally so the ``urllib.parse`` submodule is guaranteed to be
    # loaded; a bare ``import urllib`` does not import it by itself.
    from urllib.parse import urlparse

    # Test for a real scheme prefix rather than the substring "http", which
    # misclassifies hosts such as "httpbin.org" and schemes such as "ftp://".
    if not re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", url):
        url = f"https://{url}"
    return urlparse(url).netloc


def url_handler(text: str) -> str:
    """Replace every URL in *text* with ``<http> domain </http>``.

    URLs are found with the module-level ``url_extractor``. Each distinct URL
    is mapped to its own domain, and all replacements happen in a single
    regex pass so that already-inserted domains are never rewritten again.

    Args:
        text: The text to scan.

    Returns:
        The text with each URL replaced by its domain wrapped in ``<http>``
        markers (padded with spaces). Text without URLs is returned unchanged.
    """
    # A dict keeps one entry per distinct URL and pairs it with its own
    # domain, so a URL can never be tagged with another URL's domain.
    domain_by_url = {
        url: _url_domain(url) for url in url_extractor.gen_urls(text) if url
    }
    if not domain_by_url:
        # An empty alternation would match at every position in the text.
        return text

    # Longest first: when one URL is a prefix of another, the alternation
    # must try the longer one before the shorter one.
    longest_first = sorted(domain_by_url, key=len, reverse=True)
    url_re = re.compile("|".join(re.escape(url) for url in longest_first))

    def _tag(match):
        return f" <http> {domain_by_url[match.group(0)]} </http> "

    return url_re.sub(_tag, text)
def email_handler(text: str) -> str:
    """Replace every e-mail address in *text* with an ``<email>`` placeholder.

    All addresses are substituted in one regex pass. Replacing the matches
    one at a time with ``str.replace`` would corrupt a longer address that
    happens to contain a shorter matched address (e.g. ``aa@b.co`` next to
    ``a@b.co``).

    Args:
        text: The text to scan.

    Returns:
        The text with each address replaced by `` <email> ``. Leading and
        trailing whitespace is stripped only when at least one address was
        replaced; text without addresses is returned unchanged.
    """
    pattern = r"[\w.+-]+@[\w-]+\.[\w.-]+"
    replaced, count = re.subn(pattern, " <email> ", text)
    # Keep the existing contract: input without addresses is not trimmed.
    return replaced.strip() if count else text
def emoji_handler(text: str) -> str:
    """Run *text* through the package's ``demojize`` helper.

    The helper is called with ``language="tr"`` and `` <emoji> `` /
    `` </emoji> `` as the opening and closing delimiters.
    NOTE(review): presumably this replaces each emoji with its Turkish name
    wrapped in those delimiters - confirm against ``.demojize``.
    """
    emoji_delimiters = (" <emoji> ", " </emoji> ")
    return demojize(text, language="tr", delimiters=emoji_delimiters)
def normalize_text(text: str) -> str:
    """Return *text* converted to Unicode normalization form NFC."""
    target_form = "NFC"
    return normalize(target_form, text)
def preprocess(text: str) -> str:
    """Run the full cleaning pipeline over *text* and return the result.

    Steps, in order: unescape HTML entities, NFC-normalize, then apply the
    e-mail, URL, hashtag, cashtag, mention and emoji handlers. Finally every
    run of whitespace is collapsed to a single space, the text is lowercased,
    and surrounding whitespace is stripped.
    """
    # Order matters: e-mails are handled before mentions (both contain "@"),
    # and URLs before hashtags (URL fragments contain "#").
    steps = (
        html.unescape,
        normalize_text,
        email_handler,
        url_handler,
        hashtag_handler,
        cashtag_handler,
        mention_handler,
        emoji_handler,
    )
    output = text
    for step in steps:
        output = step(output)
    collapsed = re.sub(r"\s+", " ", output)
    return collapsed.lower().strip()
if __name__ == "__main__":
    # Smoke run: push an empty sample string through the pipeline and print
    # whatever comes out.
    print(preprocess(""))
|