Create app.py
app.py
ADDED
@@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
"""Emotion Detection NLP Mental Health

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/emotion-detection-nlp-mental-health-07377912-eef1-476c-bca0-e3f3abe2bc31.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20250205/auto/storage/goog4_request%26X-Goog-Date%3D20250205T063040Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3379ac810304cc40b0fa5fa915ff09212c0da161bbdae3190bbb13f09d158e28ddbebaecc6f31f960598bf39852f632c8d65288530a38effc9d316c50e6ab1a71aedc9066b12ef4487648ede7d5646dbef0283c9eb7a5539c47ac342e640964e13ff9ea00f5ca777b4adc007f3a830e7d9cfccc590924dc8a5057440bfd82b0e97c9739112dba40371f7321d5231ddd5b476890fb7d4fced9ed0ba155fde73046cb775adeadd827f01dcc90a583f7dab149ca3a5c35f2b29df5106ca356258ee13267ac10671a604057af3e053d45fdabb4d1758c1b3f3da38ddbab02762b81b7f717321a649a1b63f8bc5773a8a27377de6214668dd1b1253012ff8017e2850
"""

'''# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
thedevastator_nlp_mental_health_conversations_path = kagglehub.dataset_download('thedevastator/nlp-mental-health-conversations')

print('Data source import complete.')'''

"""# Introduction

In recent years, mental health awareness has grown, leading to a greater emphasis on making support accessible to everyone. Artificial Intelligence (AI) plays a pivotal role in bridging the gap between people who need mental health advice and the limited number of qualified professionals available. The dataset used in this project is a valuable resource for developing Natural Language Processing (NLP) models that can assist with mental health support.

The dataset consists of anonymized conversations between patients and experienced psychologists; we concentrate on detecting the emotional context of each dialogue. By understanding the emotions present in these exchanges, an NLP model can respond more appropriately and offer advice tailored to the patient's emotional state.

## Purpose

This notebook explores, preprocesses, and models the data with the goal of improving emotion detection in patient conversations. This lets us map the emotional landscape of mental health discussions and build AI systems capable of providing emotionally aware responses.

# Libraries
"""

'''# Download and extract data from Kaggle
import os
import zipfile'''


# Data Preprocessing
import string
import re
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

# Label Encoding
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud

# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Download necessary NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')  # WordNet corpus for lemmatization

"""# Data

## Downloading and Extracting
"""

'''# Download the dataset using the Kaggle API
os.system("kaggle datasets download -d thedevastator/nlp-mental-health-conversations")

# Paths for the downloaded zip file and the extraction folder
dataset_zip = 'nlp-mental-health-conversations.zip'
extracted_folder = 'nlp_mental_health_conversations'

# Extract the dataset
with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

print("Dataset downloaded and extracted successfully.")'''

"""## Explore Data"""

data = pd.read_csv("/content/train.csv")

data.head()

response = data.loc[0, "Response"]
print("Length before text preprocessing:", len(response))

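# Quick sanity checks at this stage (a minimal sketch using standard pandas
# calls): dataset size and missing values per column.
print("Rows, columns:", data.shape)
print(data.isnull().sum())
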
"""## Text Preprocessing
- Normalize
- Punctuation
- Numbers
- Stop words
- Lemmatization
- Removing rare words
"""

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Convert to string
    text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    return ' '.join(tokens)

# Apply the clean_text function to the 'Context' column
data['Context'] = data['Context'].apply(clean_text)

# Remove rare words:

# Drop words that appear only once in the whole corpus
temp_Context = pd.Series(' '.join(data['Context']).split()).value_counts()
drops = temp_Context[temp_Context <= 1]
data['Context'] = data['Context'].apply(lambda x: " ".join(x for x in x.split() if x not in drops))

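# A minimal sketch of what clean_text does, on a made-up example sentence
# (the exact output depends on the NLTK stopword list and lemmatizer):
sample = "I haven't slept for 3 nights and I'm feeling anxious!"
print(clean_text(sample))  # roughly: "havent slept night im feeling anxious"
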
"""## Text Visualization"""

# Term frequencies across the Context column
tf_Context = data["Context"].apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis=0).reset_index()
tf_Context.columns = ["words", "tf"]
tf_Context.sort_values("tf", ascending=False)

# Bar plot for Context

tf_Context[tf_Context["tf"] > 300].plot.bar(x="words", y="tf")
plt.show()

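# WordCloud is imported above but otherwise unused; a minimal sketch of the
# usual word-cloud view of the cleaned Context texts:
wordcloud = WordCloud(max_words=100, background_color="white").generate(" ".join(data["Context"]))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
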
"""# Emotion Analysis"""

from transformers import pipeline

# Use the cleaned 'Context' column
contexts = data['Context']

# Load a pre-trained emotion detection model
emotion_model = pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')

# Analyze emotions in 'Context'
emotions = contexts.apply(lambda x: emotion_model(x)[0]['label'])

# Add detected emotions as a new column
data['Detected_Emotion'] = emotions

data.head()

data['Detected_Emotion'].value_counts()

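# Row-by-row calls to the pipeline are slow; transformers pipelines also accept
# a list of texts. A sketch of the batched equivalent (truncation=True guards
# against inputs beyond the model's token limit; batch_size=32 is an assumption):
# results = emotion_model(contexts.tolist(), truncation=True, batch_size=32)
# data['Detected_Emotion'] = [r['label'] for r in results]
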
"""# Feature Extraction"""

# Initialize TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
tfidf_matrix = vectorizer.fit_transform(contexts)

# Convert to a dense array (if needed)
tfidf_array = tfidf_matrix.toarray()

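# Sanity checks on the fitted vectorizer (standard scikit-learn API):
print("TF-IDF matrix shape:", tfidf_matrix.shape)  # (n_documents, vocabulary_size)
print(vectorizer.get_feature_names_out()[:10])     # first few vocabulary terms
# Note: .toarray() densifies the matrix; scikit-learn estimators also accept
# the sparse tfidf_matrix directly, which scales better for large corpora.
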
"""# Model

## Data Splitting
"""

from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(tfidf_array, data['Detected_Emotion'], test_size=0.3, random_state=42)

from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier()

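# Detected emotion classes are usually imbalanced, so a stratified split keeps
# class proportions consistent across train and test; a sketch of the same call
# with the extra stratify argument:
# X_train, X_test, y_train, y_test = train_test_split(
#     tfidf_array, data['Detected_Emotion'],
#     test_size=0.3, random_state=42, stratify=data['Detected_Emotion'])
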
"""## Fine Tuning"""

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters
print(f'Best parameters: {grid_search.best_params_}')

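# GridSearchCV refits the best configuration on the full training set by default
# (refit=True), so the tuned model and its cross-validated score are available directly:
print(f'Best cross-validated accuracy: {grid_search.best_score_:.3f}')
best_model = grid_search.best_estimator_
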
"""# Train and Evaluation

## Train
"""

# Train the model with the best hyperparameters found by the grid search
model = RandomForestClassifier(**grid_search.best_params_)
model.fit(X_train, y_train)

"""## Evaluation"""

import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix (one shared label order, so the plot ticks match the matrix)
labels = np.unique(np.concatenate([np.asarray(y_test), y_pred]))
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
print("Confusion Matrix:")
print(conf_matrix)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

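# To reuse the fitted pipeline outside this script, both the vectorizer and the
# classifier need to be saved; a sketch using joblib (filenames are placeholders):
# import joblib
# joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')
# joblib.dump(model, 'emotion_rf_model.joblib')
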
"""## Test Unseen Data"""

# Example new text
new_text = ["let's leave i am scared"]

# Clean and transform the new text
new_text_cleaned = [clean_text(text) for text in new_text]
new_text_tfidf = vectorizer.transform(new_text_cleaned)

# Predict emotion
predicted_emotion = model.predict(new_text_tfidf)
print(predicted_emotion)