|
|
|
import streamlit as st |
|
import joblib |
|
import pandas as pd |
|
import numpy as np |
|
from transformers import AutoModelForSequenceClassification |
|
import skops.hub_utils as hub_utils |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
|
|
|
|
# Fetch the NLTK corpora this app needs (no-op if already present locally):
# 'stopwords' for Arabic stopword filtering, 'punkt' for word_tokenize.
for _resource in ('stopwords', 'punkt'):
    nltk.download(_resource)

# Module-level Arabic stopword set, shared by the text-cleaning helpers below.
arabic_stopwords = set(stopwords.words('arabic'))
|
|
|
# Top-level Library of Congress Classification labels, keyed by the integer
# class id (0..21) produced by the trained top-level classifier.
TOP_labels = dict(enumerate([
    'A GENERAL WORKS',
    'B PHILOSOPHY. PSYCHOLOGY. RELIGION',
    'C AUXILIARY SCIENCES OF HISTORY',
    'D WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
    'E HISTORY OF THE AMERICAS CONTENANT',
    'F HISTORY OF THE AMERICAS LOCAL',
    'G GEOGRAPHY. ANTHROPOLOGY. RECREATION',
    'H SOCIAL SCIENCES',
    'J POLITICAL SCIENCE',
    'K LAW',
    'L EDUCATION',
    'M MUSIC',
    'N FINE ARTS',
    'P LANGUAGE AND LITERATURE',
    'Q SCIENCE',
    'R MEDICINE',
    'S AGRICULTURE',
    'T TECHNOLOGY',
    'U MILITARY SCIENCE',
    'V NAVAL SCIENCE',
    'W MEDICINE AND RELATED SUBJECTS',
    'Z BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES',
]))
|
|
|
|
|
|
|
|
|
def _load_pickled(path, message):
    # Local helper: load one pickled pipeline component and log that it loaded.
    artifact = joblib.load(path)
    print(message)
    return artifact


# Fitted pieces of the top-level classification pipeline
# (count vectorizer -> tf-idf transformer -> classifier), persisted with joblib.
loaded_count_vect_top = _load_pickled('models/top_count_vectorizer_apr17.pkl', "_top count_vectorizer model loaded")

loaded_tf_transformer_top = _load_pickled('models/top_tfidf_transformer_apr17.pkl', "_top tfidf_transformer model loaded")

loaded_model_top = _load_pickled('models/top_trained_model_apr17.pkl', "_top trained_model model loaded")
|
|
|
|
|
def remove_tashkeel(text):
    """Strip Arabic diacritic (tashkeel) marks from *text*.

    Parameters
    ----------
    text : str
        Input string, possibly carrying tashkeel marks.

    Returns
    -------
    str
        *text* with every character of the ``tashkeel`` set removed.
    """
    # NOTE(review): this literal looks encoding-damaged — it should hold the
    # eight Arabic tashkeel marks (fathatan, dammatan, kasratan, fatha, damma,
    # kasra, shadda, sukun). Confirm the file encoding before relying on it.
    tashkeel = "ูููููููู"
    # One C-level pass with str.translate instead of eight chained .replace()
    # calls (each of which rebuilt the whole string).
    return text.translate(str.maketrans('', '', tashkeel))
|
|
|
|
|
def remove_arabic_stopwords(text):
    """Remove Arabic stopwords from *text*.

    Splits on whitespace, drops tokens present in the module-level
    ``arabic_stopwords`` set, and rejoins the survivors with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        *text* with stopword tokens removed.
    """
    # Fix: reuse the module-level set instead of rebuilding it via
    # stopwords.words('arabic') on every call — the original re-read the
    # NLTK corpus from disk each time this function ran.
    filtered_words = [word for word in text.split() if word not in arabic_stopwords]
    return ' '.join(filtered_words)
|
|
|
|
|
def check_TOP(to_predict):
    """Classify *to_predict* into a top-level LCC category.

    Parameters
    ----------
    to_predict : str
        Text to classify (tashkeel is stripped here before vectorizing).

    Returns
    -------
    tuple[str, list[str]]
        ``(summary, top_predictions)`` — ``summary`` is
        ``"<label> N#: <class>"`` for the winning class, and
        ``top_predictions`` lists the four highest-probability classes
        formatted as ``"% <percent> <label>"``.
    """
    # Vectorize through the fitted count vectorizer and tf-idf transformer.
    counts = loaded_count_vect_top.transform([remove_tashkeel(to_predict)])
    features = loaded_tf_transformer_top.transform(counts)

    # Winning class id and its human-readable label.
    predicted_class = loaded_model_top.predict(features)[0]
    summary = f"{TOP_labels[predicted_class]} N#: {predicted_class}"

    # Class probabilities as percentages, ranked highest-first.
    # NOTE(review): indexing TOP_labels by argsort positions assumes
    # loaded_model_top.classes_ is exactly [0..21] in order — confirm.
    percents = loaded_model_top.predict_proba(features)[0] * 100
    ranked = np.argsort(percents)[::-1][:4]
    top_predictions = ['% {} {}'.format(round(percents[i], 4), TOP_labels[i]) for i in ranked]

    return summary, top_predictions
|
|
|
|
|
def get_final_result(text):
    """Print the top-level classification of *text* to stdout.

    For category-A texts, also attempts a sub-category classification.
    Returns None; all output goes to stdout.
    """
    # Fix: clean the text once and reuse it — the original called
    # remove_arabic_stopwords(text) a second time for the sub-classifier.
    cleaned = remove_arabic_stopwords(text)
    top_result, top_predictions = check_TOP(cleaned)
    print("Text: ", text)
    print("Top:", top_result)

    # top_result begins with the LCC letter, e.g. "A GENERAL WORKS N#: 0".
    if top_result.split(" ")[0] == "A":
        try:
            # BUG(review): check_subCategory_A is not defined anywhere in this
            # module — guard so category-A inputs don't crash the whole run.
            sub_result, sub_top_predictions = check_subCategory_A(cleaned)
            print("Sub:", sub_result)
        except NameError:
            print("Sub: sub-category classifier unavailable (check_subCategory_A not defined)")

    print()
    print("------------")
    print("Top Predictions:")
    for prediction in top_predictions:
        print(prediction)
    print()
|
|
|
|
|
|
|
|
|
def process_text(text_input):
    """Run the Arabic segmentation model and return its raw output.

    Parameters
    ----------
    text_input : str
        Text entered by the user; falsy values short-circuit.

    Returns
    -------
    object | str
        The skops hub_utils model output, or a prompt string when the
        input is empty.
    """
    # Guard clause: nothing to process.
    if not text_input:
        return "Please enter some text."

    # NOTE(review): the feature list is always empty, so the model receives
    # an empty DataFrame — text_input is never actually fed in. Confirm this
    # is intended.
    empty_features = pd.DataFrame([])

    # NOTE(review): fetching the model on every call is expensive; consider
    # caching it (e.g. st.cache_resource) once behavior is confirmed.
    model_id = "Alshargi/arabic-msa-dialects-segmentation"
    segmentation_model = AutoModelForSequenceClassification.from_pretrained(model_id)

    return hub_utils.get_model_output(segmentation_model, empty_features)
|
|
|
|
|
def main():
    """Streamlit entry point: read text, run the model, display the result."""
    st.title("Arabic Segmentation Model Output with Streamlit")

    input_text = st.text_input("Enter your text:")

    if st.button("Process"):
        # Fix: the original computed `output` but then displayed the result of
        # an undefined prepare_text(), raising NameError on every click.
        output = process_text(input_text)
        st.write("Model Output:")
        st.write(output)
|
|
|
|
|
# Run the app when this file is executed directly (e.g. `streamlit run <file>`).
if __name__ == "__main__":

    main()
|
|