import streamlit as st
import joblib
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
# LOAD MODELS AND VECTORIZER
models = {
    "Random Forest": "RFCsentimentAnalysis_model.pkl",
    "Logistic Regression": "LRsentimentAnalysis_model.pkl",
    "Multinomial Naïve Bayes": "MNBsentimentAnalysis_model.pkl"
}

with open("vectorizer.pkl", "rb") as vectorizer_file:
    vectorizer = joblib.load(vectorizer_file)
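# NOTE: all three models are assumed to expect features produced by this exact
# vectorizer (the one fitted during training); loading a different vectorizer
# file would produce mismatched feature indices.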
app, model_eval = st.tabs(["Application", "Model Evaluation"])
# STREAMLIT APP TAB 1
with app:
    # Sidebar for model selection
    st.sidebar.header("Select Model")
    model_choice = st.sidebar.selectbox("Choose a model:", list(models.keys()))

    # Load selected model
    with open(models[model_choice], "rb") as model_file:
        model = joblib.load(model_file)
    # MAPPING RESULTS
    sentiment_mapping = {0: "Neutral", 1: "Positive", 2: "Negative"}
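    # NOTE: this mapping is assumed to match the label encoding used when the
    # models were trained (0 = Neutral, 1 = Positive, 2 = Negative).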
    # FUNCTION TO REDUCE TEXT TO ITS MOST BASIC FORM
    def clean_text(text):
        text = text.lower()  # lowercase
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # strip digits and punctuation
        text = ' '.join([word for word in text.split() if word not in stop_words])  # drop stopwords
        return text
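    # e.g. clean_text("I LOVE this product!!!") -> "love product"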
    # STREAMLIT UI
    st.title("Sentiment Analysis App")
    st.write("Enter text below to analyze its sentiment.")

    user_input = st.text_area("Enter text:")

    if st.button("Analyze Sentiment"):
        if user_input:
            cleaned_input = clean_text(user_input)
            transformed_input = vectorizer.transform([cleaned_input])
            prediction = model.predict(transformed_input)[0]
            sentiment = sentiment_mapping[prediction]
            st.write(f"Predicted Sentiment: **{sentiment}**")
        else:
            st.write("Please enter some text to analyze.")
with model_eval:
    st.header("Model Evaluation")
    st.write("The sentiment analysis models were trained to detect whether a text is positive, negative, or neutral. The dataset was taken from Kaggle.")
    st.write("Dataset by Ismiel Hossen Abir. Link: https://www.kaggle.com/datasets/mdismielhossenabir/sentiment-analysis")
    # SENTIMENT DISTRIBUTION
    st.header("Sentiment Distribution")
    st.write("The models were trained on a dataset with the following number of texts per sentiment label:")
    st.image("sentiment_distribution.png")
    # Confusion Matrix
    st.title("Confusion Matrix")
    st.write("The confusion matrix displays actual vs. predicted labels. For a given sentiment class, consider the following when interpreting it:")
    st.write("- **True Positives (TP):** texts correctly predicted as that sentiment class")
    st.write("- **True Negatives (TN):** texts correctly predicted as not belonging to that class")
    st.write("- **False Positives (FP):** texts predicted as that class but actually belonging to another (Type I error)")
    st.write("- **False Negatives (FN):** texts of that class predicted as another class (Type II error)")
st.header("Naive Bayes Confusion Matrix")
st.write("The image below represents the Confusion Matrix of the Naive Bayes model.")
st.image("MNBConfusion Matrix.png")
st.header("Logistic Regression Confusion Matrix")
st.write("The image below represents the Confusion Matrix of the Logistic Regression model.")
st.image("LRconfusion_matrix.png")
st.header("Random Forest Confusion Matrix")
st.write("The image below represents the Confusion Matrix of the Random Forest model.")
st.image("RFCConfusion Matrix.png")
    # Evaluation Metrics
    st.title("Evaluation Metrics")
    st.write("Evaluation metrics help assess the performance of the sentiment analysis models.")
st.header("Naive Bayes Evaluation Metrics")
st.write("The image below represents the **Accuracy, F1 score, and classification report** of the Naive Bayes model.")
st.image("MNBclassification_report.png")
st.header("Logistic Regression Evaluation Metrics")
st.write("The image below represents the **Accuracy, F1 score, and classification report** of the Logistic Regression model.")
st.image("LRclassification_report.png")
st.header("Random Forest Evaluation Metrics")
st.write("The image below represents the **Accuracy, F1 score, and classification report** of the Random Forest Classifier model.")
st.image("RFCclassification_report.png")
    # COMPARISON
    st.header("Comparison")
    st.write("Based on the confusion matrices and evaluation metrics, we can conclude that, of the three classification algorithms chosen, Logistic Regression and Random Forest perform better than Naive Bayes.")