|
import streamlit as st |
|
from transformers import AutoTokenizer, AutoModelForSequenceClassification |
|
import os |
|
import torch |
|
|
|
from cleantext import clean |
|
import hazm |
|
import re |
|
|
|
# Non-greedy match of one "<...>" span. DOTALL lets "." cross newlines so a
# tag whose attributes wrap onto another line is still removed.
_HTML_TAG_RE = re.compile(r"<.*?>", re.DOTALL)


def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` tag-like span removed.

    This is a plain regex strip, not an HTML parser: any text between a
    ``<`` and the next ``>`` is dropped, whether or not it is a real tag.
    Text outside such spans is returned unchanged.

    Args:
        raw_html: The string to strip.

    Returns:
        The input with all ``<...>`` spans deleted.
    """
    return _HTML_TAG_RE.sub("", raw_html)
|
|
|
|
|
# Characters deleted from messages: emoji and pictograph blocks, the
# zero-width joiner, variation selector-16, the directional isolate marks
# (U+2066-U+2069) and assorted symbols. Compiled once at import time rather
# than on every call.
# NOTE(review): the U+24C2-U+1F251 and U+10000-U+10FFFF ranges are far wider
# than emoji (the former spans CJK and the Arabic presentation-form blocks).
# They are kept exactly as written so preprocessing presumably stays in line
# with what the model was trained on - confirm before narrowing.
_SYMBOL_NOISE_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u200d"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"
    "\u2069"
    "\u2066"
    "\u2068"
    "\u2067"
    "]+",
    flags=re.UNICODE,
)

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def cleaning(text):
    """Normalize a raw message before it is tokenized.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. run ``cleantext.clean`` with all of its cleaning options enabled;
      3. delete ``<...>`` tag-like spans via ``cleanhtml``;
      4. apply ``hazm.Normalizer().normalize``;
      5. delete the symbol characters matched by ``_SYMBOL_NOISE_RE``;
      6. delete every ``#``;
      7. collapse each run of whitespace into a single space.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string. It is not stripped again after step 7, so a
        single leading or trailing space can remain.
    """
    text = text.strip()

    text = clean(
        text,
        clean_all=True,
        punct=True,
        stopwords=True,
        stemming=True,
        extra_spaces=True,
    )

    # NOTE(review): if ``clean`` has already removed punctuation such as "<"
    # and ">", this call has nothing left to match - confirm the intended
    # order of these two steps.
    text = cleanhtml(text)

    text = hazm.Normalizer().normalize(text)

    text = _SYMBOL_NOISE_RE.sub("", text)
    text = text.replace("#", "")
    return _WHITESPACE_RUN_RE.sub(" ", text)
|
|
|
MODEL_NAME = "HamidRezaei/Persian-Offensive-Language-Detection"


@st.cache_resource
def _load_model(name):
    """Load the tokenizer and classifier for ``name``, cached by Streamlit.

    Streamlit re-executes this script on every interaction; caching keeps
    the model from being reloaded on each rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    return (
        AutoTokenizer.from_pretrained(name),
        AutoModelForSequenceClassification.from_pretrained(name),
    )


tokenizer, model = _load_model(MODEL_NAME)

st.title("Offensive or Not?")
prompt = st.text_area(label="Send a message")
# The button's return value is not consulted below: scoring runs whenever
# the text area holds a non-empty value.
button = st.button("send")

if prompt:
    normalized_prompt = cleaning(prompt)

    # truncation=True keeps an over-long message within the tokenizer's
    # configured maximum length instead of overflowing the model's inputs.
    encoding = tokenizer(normalized_prompt, return_tensors="pt", truncation=True)
    encoding = {k: v.to(model.device) for k, v in encoding.items()}

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = model(**encoding).logits

    # ``.item()`` requires a single-element tensor, so this presumably
    # relies on the model emitting one logit per input - TODO confirm.
    probs = torch.sigmoid(logits.squeeze().cpu())
    score = probs.item()
    st.markdown(f"Offensive: score {score}" if score > 0.5 else f"Not Offensive: score {score}")
|
|
|
|