# -*- coding: utf-8 -*-
"""Emotion Detection NLP Mental Health

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/emotion-detection-nlp-mental-health-07377912-eef1-476c-bca0-e3f3abe2bc31.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20250205/auto/storage/goog4_request%26X-Goog-Date%3D20250205T063040Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3379ac810304cc40b0fa5fa915ff09212c0da161bbdae3190bbb13f09d158e28ddbebaecc6f31f960598bf39852f632c8d65288530a38effc9d316c50e6ab1a71aedc9066b12ef4487648ede7d5646dbef0283c9eb7a5539c47ac342e640964e13ff9ea00f5ca777b4adc007f3a830e7d9cfccc590924dc8a5057440bfd82b0e97c9739112dba40371f7321d5231ddd5b476890fb7d4fced9ed0ba155fde73046cb775adeadd827f01dcc90a583f7dab149ca3a5c35f2b29df5106ca356258ee13267ac10671a604057af3e053d45fdabb4d1758c1b3f3da38ddbab02762b81b7f717321a649a1b63f8bc5773a8a27377de6214668dd1b1253012ff8017e2850
"""

'''# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import kagglehub
thedevastator_nlp_mental_health_conversations_path = kagglehub.dataset_download('thedevastator/nlp-mental-health-conversations')

print('Data source import complete.')'''

"""# Introduction

In recent years, mental health awareness has grown, leading to a greater emphasis on making support accessible to everyone. Artificial Intelligence (AI) is playing a pivotal role in bridging the gap between those who need mental health advice and the limited number of qualified professionals available. The dataset used in this project is a valuable resource for developing Natural Language Processing (NLP) models that can assist with mental health support.

The dataset consists of anonymized conversations between patients and experienced psychologists. We concentrate on detecting the emotional context of each dialogue: by understanding the emotions present in these exchanges, an NLP model can respond more appropriately and offer advice tailored to the patient's emotional state.

## Purpose

This notebook explores, preprocesses, and models the data with the goal of improving emotion detection in patient conversations. This lets us understand the emotional landscape of mental health discussions and build AI systems capable of providing emotionally aware responses.
"""
"""# Libraries"""

'''# Downloading and extracting data from Kaggle
import os
import zipfile'''

# Data preprocessing
import string
import re
from warnings import filterwarnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

# Label encoding
from sklearn.preprocessing import LabelEncoder

from textblob import Word, TextBlob
from wordcloud import WordCloud

# Feature extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Download the necessary NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')  # WordNet corpus for lemmatization

"""# Data

## Download and Extraction
"""

'''# Download the dataset using the Kaggle API
os.system("kaggle datasets download -d thedevastator/nlp-mental-health-conversations")

# Extract the downloaded zip file
dataset_zip = 'nlp-mental-health-conversations.zip'
extracted_folder = 'nlp_mental_health_conversations'

with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

print("Dataset downloaded and extracted successfully.")'''

"""## Explore Data"""

data = pd.read_csv("/content/train.csv")
data.head()

response = data.loc[0, "Response"]
print("Length before text preprocessing: ", len(response))

"""## Text Preprocessing

- Normalize
- Punctuation
- Numbers
- Stopwords
- Lemmatization
- Removing rare words
"""

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Convert to string
    text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back into a string
    return ' '.join(tokens)

# Apply the clean_text function to the 'Context' column
data['Context'] = data['Context'].apply(clean_text)

# Remove rare words: drop every word that appears only once in the corpus
temp_Context = pd.Series(' '.join(data['Context']).split()).value_counts()
drops = set(temp_Context[temp_Context <= 1].index)
data['Context'] = data['Context'].apply(lambda x: " ".join(word for word in x.split() if word not in drops))

"""## Text Visualization"""

# Term frequencies for the cleaned 'Context' column
tf_Context = data["Context"].apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis=0).reset_index()
tf_Context.columns = ["words", "tf"]
tf_Context = tf_Context.sort_values("tf", ascending=False)

# Bar plot of words that appear more than 300 times in Context
tf_Context[tf_Context["tf"] > 300].plot.bar(x="words", y="tf")
plt.show()

"""# Emotion Analysis"""

from transformers import pipeline

# Use the cleaned 'Context' column
contexts = data['Context']

# Load a pre-trained emotion detection model
emotion_model = pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')

# Analyze the emotion of each context; truncation keeps long texts within
# the model's maximum input length
emotions = contexts.apply(lambda x: emotion_model(x, truncation=True)[0]['label'])

# Add the detected emotions as a new column
data['Detected_Emotion'] = emotions
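"""Scoring one row per call is simple but slow on larger datasets. As a minimal alternative sketch, the same pipeline can score all contexts in a single batched call; `batch_size=32` is an assumed value to adjust for your hardware."""

# Batched variant of the per-row emotion analysis above
results = emotion_model(
    contexts.tolist(),  # all cleaned contexts at once
    truncation=True,    # clip long inputs to the model's maximum length
    batch_size=32,      # assumed batch size, not tuned
)
# Each entry looks like {'label': 'fear', 'score': 0.93}; the labels match
# the 'Detected_Emotion' column built above
batched_labels = [r['label'] for r in results]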
data.head()

data['Detected_Emotion'].value_counts()

"""# Feature Extraction"""

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the cleaned contexts
tfidf_matrix = vectorizer.fit_transform(contexts)

# Convert to a dense array (if needed)
tfidf_array = tfidf_matrix.toarray()

"""# Model

## Data Splitting
"""

from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(tfidf_array, data['Detected_Emotion'], test_size=0.3, random_state=42)

from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier()

"""## Fine-Tuning"""

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

# Perform the grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters
print(f'Best parameters: {grid_search.best_params_}')

"""# Train and Evaluation

## Train
"""

# Reuse the best hyperparameters found by the grid search
model = RandomForestClassifier(**grid_search.best_params_)

# Train the model
model.fit(X_train, y_train)

"""## Evaluation"""

import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate the confusion matrix, using one label order for rows, columns,
# and tick labels
labels = model.classes_
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
print("Confusion Matrix:")
print(conf_matrix)

# Plot the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

"""## Test on Unseen Data"""

# Example new text
new_text = ["let's leave i am scared"]

# Clean and transform the new text
new_text_cleaned = [clean_text(text) for text in new_text]
new_text_tfidf = vectorizer.transform(new_text_cleaned)

# Predict the emotion
predicted_emotion = model.predict(new_text_tfidf)
print(predicted_emotion)
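"""The cleaning, TF-IDF transform, and prediction steps above can be wrapped in a single helper for reuse. This is a minimal sketch; `predict_emotion` is a name chosen here for illustration, not part of the dataset or libraries used."""

def predict_emotion(texts):
    """Predict an emotion label for each raw input string."""
    cleaned = [clean_text(t) for t in texts]   # same preprocessing as training
    features = vectorizer.transform(cleaned)   # reuse the fitted TF-IDF vocabulary
    return model.predict(features)

# Usage example
print(predict_emotion(["i cant stop worrying about everything",
                       "today was a wonderful day"]))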