|
|
|
import streamlit as st |
|
import joblib |
|
import pandas as pd |
|
import numpy as np |
|
from transformers import AutoModelForSequenceClassification |
|
import skops.hub_utils as hub_utils |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
|
|
|
|
# Fetch the NLTK corpora this app needs (no-op if already present locally):
# 'stopwords' for Arabic stopword filtering, 'punkt' for word_tokenize.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# Module-level Arabic stopword set, shared by the text-cleaning helpers below.
arabic_stopwords = set(stopwords.words('arabic'))
|
|
|
# Top-level Library of Congress Classification labels, keyed by the integer
# class id (0..21) produced by the trained top-level classifier.
TOP_labels = dict(enumerate([
    'A GENERAL WORKS',
    'B PHILOSOPHY. PSYCHOLOGY. RELIGION',
    'C AUXILIARY SCIENCES OF HISTORY',
    'D WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
    'E HISTORY OF THE AMERICAS CONTENANT',
    'F HISTORY OF THE AMERICAS LOCAL',
    'G GEOGRAPHY. ANTHROPOLOGY. RECREATION',
    'H SOCIAL SCIENCES',
    'J POLITICAL SCIENCE',
    'K LAW',
    'L EDUCATION',
    'M MUSIC',
    'N FINE ARTS',
    'P LANGUAGE AND LITERATURE',
    'Q SCIENCE',
    'R MEDICINE',
    'S AGRICULTURE',
    'T TECHNOLOGY',
    'U MILITARY SCIENCE',
    'V NAVAL SCIENCE',
    'W MEDICINE AND RELATED SUBJECTS',
    'Z BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES',
]))
|
|
|
|
|
|
|
|
|
def _load_pickled(path, message):
    # Local helper: load one pickled pipeline component and log that it loaded.
    artifact = joblib.load(path)
    print(message)
    return artifact


# Fitted pieces of the top-level classification pipeline
# (count vectorizer -> tf-idf transformer -> classifier), persisted with joblib.
loaded_count_vect_top = _load_pickled('models/top_count_vectorizer_apr17.pkl', "_top count_vectorizer model loaded")

loaded_tf_transformer_top = _load_pickled('models/top_tfidf_transformer_apr17.pkl', "_top tfidf_transformer model loaded")

loaded_model_top = _load_pickled('models/top_trained_model_apr17.pkl', "_top trained_model model loaded")
|
|
|
|
|
def remove_tashkeel(text):
    """Strip Arabic diacritic (tashkeel) marks from *text*.

    Parameters
    ----------
    text : str
        Input string, possibly carrying tashkeel marks.

    Returns
    -------
    str
        *text* with every character of the ``tashkeel`` set removed.
    """
    # NOTE(review): this literal looks encoding-damaged — it should hold the
    # eight Arabic tashkeel marks (fathatan, dammatan, kasratan, fatha, damma,
    # kasra, shadda, sukun). Confirm the file encoding before relying on it.
    tashkeel = "ูููููููู"
    # One C-level pass with str.translate instead of eight chained .replace()
    # calls (each of which rebuilt the whole string).
    return text.translate(str.maketrans('', '', tashkeel))
|
|
|
|
|
def remove_arabic_stopwords(text):
    """Remove Arabic stopwords from *text*.

    Splits on whitespace, drops tokens present in the module-level
    ``arabic_stopwords`` set, and rejoins the survivors with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        *text* with stopword tokens removed.
    """
    # Fix: reuse the module-level set instead of rebuilding it via
    # stopwords.words('arabic') on every call — the original re-read the
    # NLTK corpus from disk each time this function ran.
    filtered_words = [word for word in text.split() if word not in arabic_stopwords]
    return ' '.join(filtered_words)
|
|
|
|
|
def check_TOP(to_predict):
    """Classify *to_predict* into a top-level LCC category.

    Parameters
    ----------
    to_predict : str
        Text to classify (tashkeel is stripped here before vectorizing).

    Returns
    -------
    tuple[str, list[str]]
        ``(summary, top_predictions)`` — ``summary`` is
        ``"<label> N#: <class>"`` for the winning class, and
        ``top_predictions`` lists the four highest-probability classes
        formatted as ``"% <percent> <label>"``.
    """
    # Vectorize through the fitted count vectorizer and tf-idf transformer.
    counts = loaded_count_vect_top.transform([remove_tashkeel(to_predict)])
    features = loaded_tf_transformer_top.transform(counts)

    # Winning class id and its human-readable label.
    predicted_class = loaded_model_top.predict(features)[0]
    summary = f"{TOP_labels[predicted_class]} N#: {predicted_class}"

    # Class probabilities as percentages, ranked highest-first.
    # NOTE(review): indexing TOP_labels by argsort positions assumes
    # loaded_model_top.classes_ is exactly [0..21] in order — confirm.
    percents = loaded_model_top.predict_proba(features)[0] * 100
    ranked = np.argsort(percents)[::-1][:4]
    top_predictions = ['% {} {}'.format(round(percents[i], 4), TOP_labels[i]) for i in ranked]

    return summary, top_predictions
|
|
|
|
|
def get_final_result(text):
    """Print the top-level classification of *text* to stdout.

    For category-A texts, also attempts a sub-category classification.
    Returns None; all output goes to stdout.
    """
    # Fix: clean the text once and reuse it — the original called
    # remove_arabic_stopwords(text) a second time for the sub-classifier.
    cleaned = remove_arabic_stopwords(text)
    top_result, top_predictions = check_TOP(cleaned)
    print("Text: ", text)
    print("Top:", top_result)

    # top_result begins with the LCC letter, e.g. "A GENERAL WORKS N#: 0".
    if top_result.split(" ")[0] == "A":
        try:
            # BUG(review): check_subCategory_A is not defined anywhere in this
            # module — guard so category-A inputs don't crash the whole run.
            sub_result, sub_top_predictions = check_subCategory_A(cleaned)
            print("Sub:", sub_result)
        except NameError:
            print("Sub: sub-category classifier unavailable (check_subCategory_A not defined)")

    print()
    print("------------")
    print("Top Predictions:")
    for prediction in top_predictions:
        print(prediction)
    print()
|
|
|
|
|
|
|
|
|
def process_text(text_input):
    """Run the Arabic segmentation model and return its raw output.

    Parameters
    ----------
    text_input : str
        Text entered by the user; falsy values short-circuit.

    Returns
    -------
    object | str
        The skops hub_utils model output, or a prompt string when the
        input is empty.
    """
    # Guard clause: nothing to process.
    if not text_input:
        return "Please enter some text."

    # NOTE(review): the feature list is always empty, so the model receives
    # an empty DataFrame — text_input is never actually fed in. Confirm this
    # is intended.
    empty_features = pd.DataFrame([])

    # NOTE(review): fetching the model on every call is expensive; consider
    # caching it (e.g. st.cache_resource) once behavior is confirmed.
    model_id = "Alshargi/arabic-msa-dialects-segmentation"
    segmentation_model = AutoModelForSequenceClassification.from_pretrained(model_id)

    return hub_utils.get_model_output(segmentation_model, empty_features)
|
|
|
|
|
def main():
    """Streamlit entry point: read text, run the model, display the result."""
    st.title("Arabic Segmentation Model Output with Streamlit")

    input_text = st.text_input("Enter your text:")

    if st.button("Process"):
        # Fix: the original computed `output` but then displayed the result of
        # an undefined prepare_text(), raising NameError on every click.
        output = process_text(input_text)
        st.write("Model Output:")
        st.write(output)
|
|
|
|
|
# Run the app when this file is executed directly (e.g. `streamlit run <file>`).
if __name__ == "__main__":

    main()
|
|