# sam / app.py — Hugging Face Space by Alshargi (commit 8586ba1, verified)
import streamlit as st
import joblib
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification
import skops.hub_utils as hub_utils
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download NLTK resources including the Arabic stopwords.
# Runs at import time; nltk.download is a no-op if the corpora are already cached.
nltk.download('stopwords')
nltk.download('punkt')
# Module-level Arabic stopword set, built once and reused by the helpers below.
arabic_stopwords = set(stopwords.words('arabic'))
# Library of Congress top-level classification labels, keyed by the integer
# class index emitted by the trained classifier (see check_TOP below).
TOP_labels = {
0: 'A GENERAL WORKS',
1: 'B PHILOSOPHY. PSYCHOLOGY. RELIGION',
2: 'C AUXILIARY SCIENCES OF HISTORY',
3: 'D WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
4: 'E HISTORY OF THE AMERICAS CONTENANT',
5: 'F HISTORY OF THE AMERICAS LOCAL',
6: 'G GEOGRAPHY. ANTHROPOLOGY. RECREATION',
7: 'H SOCIAL SCIENCES',
8: 'J POLITICAL SCIENCE',
9: 'K LAW',
10: 'L EDUCATION',
11: 'M MUSIC',
12: 'N FINE ARTS',
13: 'P LANGUAGE AND LITERATURE',
14: 'Q SCIENCE',
15: 'R MEDICINE',
16: 'S AGRICULTURE',
17: 'T TECHNOLOGY',
18: 'U MILITARY SCIENCE',
19: 'V NAVAL SCIENCE',
20: 'W MEDICINE AND RELATED SUBJECTS',
21: 'Z BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES'
}
# Load models at import time. Paths are relative to the app's working
# directory (the Space repo root); joblib.load will raise if missing.
# Load CountVectorizer (text -> token counts)
loaded_count_vect_top = joblib.load('models/top_count_vectorizer_apr17.pkl')
print("_top count_vectorizer model loaded")
# Load TfidfTransformer (counts -> tf-idf weights)
loaded_tf_transformer_top = joblib.load('models/top_tfidf_transformer_apr17.pkl')
print("_top tfidf_transformer model loaded")
# Load the saved classifier used by check_TOP (must expose predict/predict_proba)
loaded_model_top = joblib.load('models/top_trained_model_apr17.pkl')
print("_top trained_model model loaded")
def remove_tashkeel(text):
    """Strip Arabic tashkeel (diacritic) marks from *text*.

    Removes fathatan, dammatan, kasratan, fatha, damma, kasra, shadda
    and sukun (U+064B..U+0652) in a single pass.

    NOTE(review): the original literal was mojibake ("Ù‘ÙŽÙ‹…" — the
    UTF-8 bytes of these marks mis-decoded as cp1252), so it could never
    match real Arabic diacritics; the escapes below restore the intended
    character set.
    """
    tashkeel = "\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652"
    # str.translate performs one C-level pass instead of chained .replace() calls.
    return text.translate({ord(ch): None for ch in tashkeel})
def remove_arabic_stopwords(text):
    """Return *text* with Arabic stopwords removed (whitespace-tokenized).

    Uses the module-level ``arabic_stopwords`` set (built once at import
    time); the original redundantly rebuilt the NLTK stopword set on
    every call.
    """
    return ' '.join(word for word in text.split() if word not in arabic_stopwords)
def check_TOP(to_predict):
    """Classify *to_predict* into a top-level LC category.

    Vectorizes the tashkeel-stripped text with the loaded CountVectorizer
    and TfidfTransformer, predicts with the loaded classifier, and returns
    a tuple ``(best_label_string, top_4_probability_lines)``.
    """
    cleaned = remove_tashkeel(to_predict)
    counts = loaded_count_vect_top.transform([cleaned])
    tfidf = loaded_tf_transformer_top.transform(counts)

    # Best single prediction, formatted as "<label> N#: <class index>".
    predicted = loaded_model_top.predict(tfidf)[0]
    best = f"{TOP_labels[predicted]} N#: {predicted}"

    # Per-class probabilities as percentages, highest first; keep the top 4.
    probs = loaded_model_top.predict_proba(tfidf)[0] * 100
    ranked = np.argsort(probs)[::-1]
    top4 = ['% {} {}'.format(round(probs[idx], 4), TOP_labels[idx]) for idx in ranked[:4]]

    return best, top4
def get_final_result(text):
    """Print the top-level prediction (and sub-category, when available) for *text*.

    Side effects only: everything goes to stdout, nothing is returned.
    """
    cleaned = remove_arabic_stopwords(text)
    top_result, top_predictions = check_TOP(cleaned)
    print("Text: ", text)
    print("Top:", top_result)
    if top_result.split(" ")[0] == "A":
        # BUG in original: check_subCategory_A is not defined anywhere in this
        # file, so category-"A" inputs raised NameError. Guard the lookup so
        # they no longer crash; restore the direct call once the helper exists.
        sub_checker = globals().get("check_subCategory_A")
        if sub_checker is not None:
            sub_result, sub_top_predictions = sub_checker(cleaned)
            print("Sub:", sub_result)
    print()
    print("------------")
    print("Top Predictions:")
    for prediction in top_predictions:
        print(prediction)
    print()
@st.cache_resource
def _load_segmentation_model():
    """Load and cache the Hub segmentation model once per Streamlit session."""
    model_id = "Alshargi/arabic-msa-dialects-segmentation"
    return AutoModelForSequenceClassification.from_pretrained(model_id)


def process_text(text_input):
    """Run the segmentation model on *text_input* and return its output.

    Returns a prompt string instead when the input is empty/falsy.

    NOTE(review): ``features_list`` is an empty placeholder — the original
    comment says a ``features`` extraction function is "defined elsewhere",
    but it is not in this file, so the model always receives an empty
    DataFrame. Confirm and wire in the real feature extraction.
    """
    if not text_input:
        return "Please enter some text."

    features_list = []  # TODO: replace with real feature extraction
    data = pd.DataFrame(features_list)

    # Cached loader: the original re-downloaded/loaded the model on every call.
    model = _load_segmentation_model()

    # Get model output using hub_utils and return it.
    return hub_utils.get_model_output(model, data)
def main():
    """Streamlit entry point: collect text, run the model, show the result."""
    st.title("Arabic Segmentation Model Output with Streamlit")

    # Text input widget.
    input_text = st.text_input("Enter your text:")

    # Process the text when the button is clicked.
    if st.button("Process"):
        # BUG in original: it called undefined prepare_text() (NameError on
        # every button press) and discarded the actual process_text() output.
        result = process_text(input_text)
        st.write("Model Output:")
        st.write(result)
if __name__ == "__main__":
main()