import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from cleantext import clean
import hazm
import re


def cleanhtml(raw_html):
    """Strip HTML tags from the input text."""
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext


def cleaning(text):
    text = text.strip()

    # regular cleaning
    text = clean(text,
                 clean_all=True,
                 punct=True,
                 stopwords=True,
                 stemming=True,
                 extra_spaces=True
                 )

    # cleaning HTML tags
    text = cleanhtml(text)

    # normalizing with hazm
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns (emoji, pictographs, directional marks, ...)
    weird_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               u"\U0001f926-\U0001f937"
                               u'\U00010000-\U0010ffff'
                               u"\u200d"
                               u"\u2640-\u2642"
                               u"\u2600-\u2B55"
                               u"\u23cf"
                               u"\u23e9"
                               u"\u231a"
                               u"\u3030"
                               u"\ufe0f"
                               u"\u2069"
                               u"\u2066"
                               # u"\u200c"  # ZWNJ left in place (used in Persian orthography)
                               u"\u2068"
                               u"\u2067"
                               "]+", flags=re.UNICODE)
    text = weird_pattern.sub(r'', text)

    # removing hashtags and extra spaces
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)

    return text
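

# Load the tokenizer and the fine-tuned offensive-language classifier from the Hugging Face Hub.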
tokenizer = AutoTokenizer.from_pretrained("HamidRezaei/Persian-Offensive-Language-Detection")
model = AutoModelForSequenceClassification.from_pretrained("HamidRezaei/Persian-Offensive-Language-Detection")
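
# Streamlit UI: a title, a message box, and a send button.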
st.title("Offensive or Not?")
prompt = st.text_area(label="Send a message")
button = st.button("send")
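
# Classify as soon as the text area contains a message.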
if prompt:
    # clean and normalize the raw message before tokenizing
    normalized_prompt = cleaning(prompt)
    encoding = tokenizer(normalized_prompt, return_tensors="pt")
    encoding = {k: v.to(model.device) for k, v in encoding.items()}

    # run the model without tracking gradients (inference only)
    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits

    # apply sigmoid + threshold: the score is the model's probability that the message is offensive
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    score = probs.item()

    st.markdown(f"Offensive: score {score:.3f}" if score > 0.5 else f"Not Offensive: score {score:.3f}")