"""Streamlit app that classifies Arabic text into Library of Congress
top-level categories with a saved CountVectorizer → TF-IDF → classifier
pipeline, and runs a Hugging Face segmentation model over user input."""

import streamlit as st
import joblib
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification
import skops.hub_utils as hub_utils
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources including the Arabic stopwords.
nltk.download('stopwords')
nltk.download('punkt')

# Built once at import time; reused by remove_arabic_stopwords() below
# (the original rebuilt this set from the corpus on every call).
arabic_stopwords = set(stopwords.words('arabic'))

# Library of Congress top-level classification labels, keyed by the
# classifier's integer class id.
TOP_labels = {
    0: 'A GENERAL WORKS',
    1: 'B PHILOSOPHY. PSYCHOLOGY. RELIGION',
    2: 'C AUXILIARY SCIENCES OF HISTORY',
    3: 'D WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
    4: 'E HISTORY OF THE AMERICAS CONTENANT',
    5: 'F HISTORY OF THE AMERICAS LOCAL',
    6: 'G GEOGRAPHY. ANTHROPOLOGY. RECREATION',
    7: 'H SOCIAL SCIENCES',
    8: 'J POLITICAL SCIENCE',
    9: 'K LAW',
    10: 'L EDUCATION',
    11: 'M MUSIC',
    12: 'N FINE ARTS',
    13: 'P LANGUAGE AND LITERATURE',
    14: 'Q SCIENCE',
    15: 'R MEDICINE',
    16: 'S AGRICULTURE',
    17: 'T TECHNOLOGY',
    18: 'U MILITARY SCIENCE',
    19: 'V NAVAL SCIENCE',
    20: 'W MEDICINE AND RELATED SUBJECTS',
    21: 'Z BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES',
}

# Load the three pickled stages of the top-level classification pipeline.
loaded_count_vect_top = joblib.load('models/top_count_vectorizer_apr17.pkl')
print("_top count_vectorizer model loaded")

loaded_tf_transformer_top = joblib.load('models/top_tfidf_transformer_apr17.pkl')
print("_top tfidf_transformer model loaded")

loaded_model_top = joblib.load('models/top_trained_model_apr17.pkl')
print("_top trained_model model loaded")

# Arabic diacritics (tashkeel); translation table built once so
# remove_tashkeel() is a single C-level pass instead of 8 replace() calls.
_TASHKEEL_TABLE = str.maketrans('', '', "ًٌٍَُِّْ")


def remove_tashkeel(text):
    """Return *text* with Arabic diacritical marks (tashkeel) stripped."""
    return text.translate(_TASHKEEL_TABLE)


def remove_arabic_stopwords(text):
    """Return *text* with Arabic stopwords removed.

    Tokenisation is plain whitespace splitting; words are rejoined with
    single spaces.
    """
    words = text.split()
    filtered_words = [word for word in words if word not in arabic_stopwords]
    return ' '.join(filtered_words)


def check_TOP(to_predict):
    """Classify *to_predict* into a top-level category.

    Returns a tuple ``(best, top_predictions)`` where *best* is a string of
    the form ``"<label> N#: <class id>"`` and *top_predictions* lists the
    four highest-probability labels formatted as ``"% <prob> <label>"``.
    """
    p_count = loaded_count_vect_top.transform([remove_tashkeel(to_predict)])
    p_tfidf = loaded_tf_transformer_top.transform(p_count)

    # Predict the top-level category.
    top_number = loaded_model_top.predict(p_tfidf)[0]
    top_name = TOP_labels[top_number]
    themaxresX = f"{top_name} N#: {top_number}"

    # Per-class probabilities as percentages, sorted descending.
    probabilities = loaded_model_top.predict_proba(p_tfidf)[0] * 100
    sorted_indices = np.argsort(probabilities)[::-1]
    top_predictions = [
        '% {} {}'.format(round(probabilities[i], 4), TOP_labels[i])
        for i in sorted_indices[:4]
    ]
    return themaxresX, top_predictions


def get_final_result(text):
    """Print the top-level classification (and sub-category for 'A') of *text*."""
    top_result, top_predictions = check_TOP(remove_arabic_stopwords(text))
    print("Text: ", text)
    print("Top:", top_result)

    if top_result.split(" ")[0] == "A":
        # BUG FIX: the original called check_subCategory_A unconditionally,
        # but that helper is not defined anywhere in this file, so every
        # 'A'-category prediction raised NameError. Guard the lookup so the
        # app degrades gracefully until the helper is provided.
        sub_checker = globals().get("check_subCategory_A")
        if sub_checker is not None:
            sub_result, sub_top_predictions = sub_checker(remove_arabic_stopwords(text))
            print("Sub:", sub_result)

    print()
    print("------------")
    print("Top Predictions:")
    for prediction in top_predictions:
        print(prediction)
    print()


def process_text(text_input):
    """Run the hub segmentation model over features extracted from *text_input*.

    Returns the model output, or a prompt string when the input is empty.

    NOTE(review): the feature-extraction step is a placeholder — features_list
    is always empty, so the model receives an empty DataFrame. Confirm the
    intended feature pipeline.
    """
    if not text_input:
        return "Please enter some text."

    features_list = []  # Assuming features function is defined elsewhere
    data = pd.DataFrame(features_list)

    # Load the model from the Hub and get its output via hub_utils.
    model_id = "Alshargi/arabic-msa-dialects-segmentation"
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    res = hub_utils.get_model_output(model, data)
    return res


def main():
    """Streamlit entry point: read text, process it, display the result."""
    st.title("Arabic Segmentation Model Output with Streamlit")

    # Text input.
    input_text = st.text_input("Enter your text:")

    # Process the text when the button is clicked.
    if st.button("Process"):
        # BUG FIX: the original called an undefined prepare_text() here
        # (NameError on every click) and discarded the process_text() result
        # it had just computed. Display the computed output instead.
        output = process_text(input_text)
        st.write("Model Output:")
        st.write(output)


if __name__ == "__main__":
    main()