"""Streamlit app that classifies Arabic text into Library of Congress
top-level categories with a saved CountVectorizer → TF-IDF → classifier
pipeline, and runs a Hugging Face segmentation model over user input."""

import streamlit as st
import joblib
import pandas as pd
import numpy as np
from transformers import AutoModelForSequenceClassification
import skops.hub_utils as hub_utils
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources including the Arabic stopwords.
nltk.download('stopwords')
nltk.download('punkt')

# Built once at import time; reused by remove_arabic_stopwords() below
# (the original rebuilt this set from the corpus on every call).
arabic_stopwords = set(stopwords.words('arabic'))

# Library of Congress top-level classification labels, keyed by the
# classifier's integer class id.
TOP_labels = {
    0: 'A GENERAL WORKS',
    1: 'B PHILOSOPHY. PSYCHOLOGY. RELIGION',
    2: 'C AUXILIARY SCIENCES OF HISTORY',
    3: 'D WORLD HISTORY AND HISTORY OF EUROPE, ASIA, AFRICA, AUSTRALIA, NEW ZEALAND, ETC.',
    4: 'E HISTORY OF THE AMERICAS CONTENANT',
    5: 'F HISTORY OF THE AMERICAS LOCAL',
    6: 'G GEOGRAPHY. ANTHROPOLOGY. RECREATION',
    7: 'H SOCIAL SCIENCES',
    8: 'J POLITICAL SCIENCE',
    9: 'K LAW',
    10: 'L EDUCATION',
    11: 'M MUSIC',
    12: 'N FINE ARTS',
    13: 'P LANGUAGE AND LITERATURE',
    14: 'Q SCIENCE',
    15: 'R MEDICINE',
    16: 'S AGRICULTURE',
    17: 'T TECHNOLOGY',
    18: 'U MILITARY SCIENCE',
    19: 'V NAVAL SCIENCE',
    20: 'W MEDICINE AND RELATED SUBJECTS',
    21: 'Z BIBLIOGRAPHY. LIBRARY SCIENCE. INFORMATION RESOURCES',
}

# Load the three pickled stages of the top-level classification pipeline.
loaded_count_vect_top = joblib.load('models/top_count_vectorizer_apr17.pkl')
print("_top count_vectorizer model loaded")

loaded_tf_transformer_top = joblib.load('models/top_tfidf_transformer_apr17.pkl')
print("_top tfidf_transformer model loaded")

loaded_model_top = joblib.load('models/top_trained_model_apr17.pkl')
print("_top trained_model model loaded")

# Arabic diacritics (tashkeel); translation table built once so
# remove_tashkeel() is a single C-level pass instead of 8 replace() calls.
_TASHKEEL_TABLE = str.maketrans('', '', "ًٌٍَُِّْ")


def remove_tashkeel(text):
    """Return *text* with Arabic diacritical marks (tashkeel) stripped."""
    return text.translate(_TASHKEEL_TABLE)


def remove_arabic_stopwords(text):
    """Return *text* with Arabic stopwords removed.

    Tokenisation is plain whitespace splitting; words are rejoined with
    single spaces.
    """
    words = text.split()
    filtered_words = [word for word in words if word not in arabic_stopwords]
    return ' '.join(filtered_words)


def check_TOP(to_predict):
    """Classify *to_predict* into a top-level category.

    Returns a tuple ``(best, top_predictions)`` where *best* is a string of
    the form ``"<label> N#: <class id>"`` and *top_predictions* lists the
    four highest-probability labels formatted as ``"% <prob> <label>"``.
    """
    p_count = loaded_count_vect_top.transform([remove_tashkeel(to_predict)])
    p_tfidf = loaded_tf_transformer_top.transform(p_count)

    # Predict the top-level category.
    top_number = loaded_model_top.predict(p_tfidf)[0]
    top_name = TOP_labels[top_number]
    themaxresX = f"{top_name} N#: {top_number}"

    # Per-class probabilities as percentages, sorted descending.
    probabilities = loaded_model_top.predict_proba(p_tfidf)[0] * 100
    sorted_indices = np.argsort(probabilities)[::-1]
    top_predictions = [
        '% {} {}'.format(round(probabilities[i], 4), TOP_labels[i])
        for i in sorted_indices[:4]
    ]
    return themaxresX, top_predictions


def get_final_result(text):
    """Print the top-level classification (and sub-category for 'A') of *text*."""
    top_result, top_predictions = check_TOP(remove_arabic_stopwords(text))
    print("Text: ", text)
    print("Top:", top_result)

    if top_result.split(" ")[0] == "A":
        # BUG FIX: the original called check_subCategory_A unconditionally,
        # but that helper is not defined anywhere in this file, so every
        # 'A'-category prediction raised NameError. Guard the lookup so the
        # app degrades gracefully until the helper is provided.
        sub_checker = globals().get("check_subCategory_A")
        if sub_checker is not None:
            sub_result, sub_top_predictions = sub_checker(remove_arabic_stopwords(text))
            print("Sub:", sub_result)

    print()
    print("------------")
    print("Top Predictions:")
    for prediction in top_predictions:
        print(prediction)
    print()


def process_text(text_input):
    """Run the hub segmentation model over features extracted from *text_input*.

    Returns the model output, or a prompt string when the input is empty.

    NOTE(review): the feature-extraction step is a placeholder — features_list
    is always empty, so the model receives an empty DataFrame. Confirm the
    intended feature pipeline.
    """
    if not text_input:
        return "Please enter some text."

    features_list = []  # Assuming features function is defined elsewhere
    data = pd.DataFrame(features_list)

    # Load the model from the Hub and get its output via hub_utils.
    model_id = "Alshargi/arabic-msa-dialects-segmentation"
    model = AutoModelForSequenceClassification.from_pretrained(model_id)
    res = hub_utils.get_model_output(model, data)
    return res


def main():
    """Streamlit entry point: read text, process it, display the result."""
    st.title("Arabic Segmentation Model Output with Streamlit")

    # Text input.
    input_text = st.text_input("Enter your text:")

    # Process the text when the button is clicked.
    if st.button("Process"):
        # BUG FIX: the original called an undefined prepare_text() here
        # (NameError on every click) and discarded the process_text() result
        # it had just computed. Display the computed output instead.
        output = process_text(input_text)
        st.write("Model Output:")
        st.write(output)


if __name__ == "__main__":
    main()