"""Streamlit sentiment-analysis app.

Tab 1 ("Application") lets the user pick one of three pre-trained
classifiers and predict the sentiment (Neutral / Positive / Negative)
of free text. Tab 2 ("Model Evaluation") displays static evaluation
artifacts (class distribution, confusion matrices, classification
reports) saved as images next to this script.
"""

import re

import joblib
import nltk
import streamlit as st
from nltk.corpus import stopwords

# One-time corpus fetch; no-op on later reruns once the corpus is cached.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# LOAD MODELS AND VECTORIZER
# Display name -> pickled estimator file (expected alongside this script).
models = {
    "Random Forest": "RFCsentimentAnalysis_model.pkl",
    "Logistic Regression": "LRsentimentAnalysis_model.pkl",
    "Multinomial Naïve Bayes": "MNBsentimentAnalysis_model.pkl",
}

# Shared vectorizer fitted together with the models; all three expect
# its feature space as input.
with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = joblib.load(vectorizer_file)

app, model_eval = st.tabs(["Application", "Model Evaluation"])

# STREAMLIT APP TAB 1
with app:
    # Sidebar for model selection
    st.sidebar.header("Select Model")
    model_choice = st.sidebar.selectbox("Choose a model:", list(models.keys()))

    # Load the estimator the user selected (re-read on every rerun).
    with open(models[model_choice], "rb") as model_file:
        model = joblib.load(model_file)

    # MAPPING RESULTS
    # Integer class labels emitted by the models -> human-readable sentiment.
    sentiment_mapping = {0: "Neutral", 1: "Positive", 2: "Negative"}

    # FUNCTION TO REDUCE TEXT TO ITS MOST BASIC FORM
    def clean_text(text):
        """Lower-case *text*, strip non-letter characters, and drop
        English stopwords, mirroring the preprocessing used at training
        time (assumed — confirm against the training pipeline)."""
        text = text.lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        text = ' '.join([word for word in text.split() if word not in stop_words])
        return text

    # STREAMLIT UI
    st.title("Sentiment Analysis App")
    st.write("Enter text below to analyze its sentiment.")
    user_input = st.text_area("Enter text:")

    if st.button("Analyze Sentiment"):
        if user_input:
            cleaned_input = clean_text(user_input)
            transformed_input = vectorizer.transform([cleaned_input])
            prediction = model.predict(transformed_input)[0]
            sentiment = sentiment_mapping[prediction]
            st.write(f"Predicted Sentiment: **{sentiment}**")
        else:
            st.write("Please enter some text to analyze.")

# STREAMLIT APP TAB 2 — static evaluation report
with model_eval:
    st.header("Model Evaluation")
    st.write(
        "The Sentiment Analysis model was trained in order to detect if a "
        "text is positive, negative, or neutral. The dataset was taken from "
        "kaggle."
    )
    # NOTE: this literal was severed across lines in the mangled original;
    # rejoined here into one valid string.
    st.write(
        "Dataset by Ismiel Hossen Abir. "
        "Link: https://www.kaggle.com/datasets/mdismielhossenabir/sentiment-analysis"
    )

    # SENTIMENT DISTRIBUTION
    st.header("Sentiment Distribution")
    st.write(
        "The model was trained using a dataset with the total amount of text "
        "equivalent to the following labels"
    )
    st.image("sentiment_distribution.png")

    # Confusion Matrix
    st.title("Confusion Matrix")
    st.write(
        "The confusion matrix displays actual vs. predicted labels. "
        "Consider the following when interpreting it:"
    )
    # FIX: the original bullets described "Spam"/"Not Spam" — leftover copy
    # from a spam classifier; reworded for this sentiment task.
    st.write("- **True Positives (TP):** Texts whose sentiment class was correctly predicted")
    st.write("- **True Negatives (TN):** Texts correctly predicted as not belonging to a given sentiment class")
    st.write("- **False Positives (FP):** Texts predicted as a sentiment class they do not belong to (Type I error)")
    st.write("- **False Negatives (FN):** Texts belonging to a sentiment class that were predicted otherwise (Type II error)")

    st.header("Naive Bayes Confusion Matrix")
    st.write("The image below represents the Confusion Matrix of the Naive Bayes model.")
    st.image("MNBConfusion Matrix.png")

    st.header("Logistic Regression Confusion Matrix")
    st.write("The image below represents the Confusion Matrix of the Logistic Regression model.")
    st.image("LRconfusion_matrix.png")

    st.header("Random Forest Confusion Matrix")
    st.write("The image below represents the Confusion Matrix of the Random Forest model.")
    st.image("RFCConfusion Matrix.png")

    # Evaluation Metrics
    st.title("Evaluation Metrics")
    st.write("Evaluation metrics help assess the performance of the sentiment analysis.")

    st.header("Naive Bayes Evaluation Metrics")
    st.write(
        "The image below represents the **Accuracy, F1 score, and "
        "classification report** of the Naive Bayes model."
    )
    st.image("MNBclassification_report.png")

    st.header("Logistic Regression Evaluation Metrics")
    st.write(
        "The image below represents the **Accuracy, F1 score, and "
        "classification report** of the Logistic Regression model."
    )
    st.image("LRclassification_report.png")

    st.header("Random Forest Evaluation Metrics")
    # NOTE: this literal was also severed across lines in the mangled
    # original; rejoined into one valid string.
    st.write(
        "The image below represents the **Accuracy, F1 score, and "
        "classification report** of the Random Forest Classifier model."
    )
    st.image("RFCclassification_report.png")

    # COMPARISON
    st.header("Comparison")
    st.write(
        "Based on the confusion matrix and evaluation metrics, we can assume "
        "that out of the three classification algorithms chosen, Logistic "
        "Regression and Random Forests performs better than the Naive Bayes."
    )