krushna123 committed
Commit 276fedc · verified · 1 Parent(s): cf1e648

Create app.py

Files changed (1)
  1. app.py +269 -0
app.py ADDED
# -*- coding: utf-8 -*-
"""Emotion Detection NLP Mental Health

Automatically generated by Colab.

Original file is located at
https://colab.research.google.com/#fileId=https%3A//storage.googleapis.com/kaggle-colab-exported-notebooks/emotion-detection-nlp-mental-health-07377912-eef1-476c-bca0-e3f3abe2bc31.ipynb%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com/20250205/auto/storage/goog4_request%26X-Goog-Date%3D20250205T063040Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D3379ac810304cc40b0fa5fa915ff09212c0da161bbdae3190bbb13f09d158e28ddbebaecc6f31f960598bf39852f632c8d65288530a38effc9d316c50e6ab1a71aedc9066b12ef4487648ede7d5646dbef0283c9eb7a5539c47ac342e640964e13ff9ea00f5ca777b4adc007f3a830e7d9cfccc590924dc8a5057440bfd82b0e97c9739112dba40371f7321d5231ddd5b476890fb7d4fced9ed0ba155fde73046cb775adeadd827f01dcc90a583f7dab149ca3a5c35f2b29df5106ca356258ee13267ac10671a604057af3e053d45fdabb4d1758c1b3f3da38ddbab02762b81b7f717321a649a1b63f8bc5773a8a27377de6214668dd1b1253012ff8017e2850
"""

'''# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
thedevastator_nlp_mental_health_conversations_path = kagglehub.dataset_download('thedevastator/nlp-mental-health-conversations')

print('Data source import complete.')'''

"""# Introduction

In recent years, mental health awareness has grown, leading to a greater emphasis on making support accessible to everyone. Artificial Intelligence (AI) is playing a pivotal role in bridging the gap between those in need of mental health advice and the limited number of qualified professionals available. The dataset used in this project is a valuable resource for developing Natural Language Processing (NLP) models that can assist with mental health support.

The dataset consists of anonymized conversations between patients and experienced psychologists; here we concentrate on detecting the emotional context of the dialogue. By understanding the emotions present in these exchanges, an NLP model can respond more appropriately and offer advice tailored to the patient's emotional state.

## Purpose

This notebook explores, preprocesses, and models the data with the goal of improving emotion detection in patient conversations. This allows us to understand the emotional landscape of mental health discussions and build AI systems capable of providing emotionally aware responses.

# Libraries
"""

'''# Download and Extract Data from Kaggle
import os
import zipfile'''


# Data Preprocessing
import string
import re
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer

# Label Encoding
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud

# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 200)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Download necessary NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')  # wordnet corpus is needed for lemmatization

"""# Data

## Download and Extract
"""

'''# Download the dataset using the Kaggle API
os.system("kaggle datasets download -d thedevastator/nlp-mental-health-conversations")

# Extract the downloaded zip file
dataset_zip = 'nlp-mental-health-conversations.zip'
extracted_folder = 'nlp_mental_health_conversations'

# Extract the dataset
with zipfile.ZipFile(dataset_zip, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

print("Dataset downloaded and extracted successfully.")'''

"""## Explore Data"""

data = pd.read_csv("/content/train.csv")

data.head()

response = data.loc[0, "Response"]
print("Length before text preprocessing: ", len(response))

"""## Text Preprocessing
- Normalize
- Punctuation
- Numbers
- Stopwords
- Lemmatization
- Removing rare words
"""

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Convert to string
    text = str(text)
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    return ' '.join(tokens)

# Apply the clean_text function to the 'Context' column
data['Context'] = data['Context'].apply(clean_text)
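
# Sanity check (illustrative addition): run the sample response inspected
# earlier through the same cleaning pipeline and compare lengths.
print("Length after text preprocessing: ", len(clean_text(response)))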

# Remove rare words

# Drop words that appear no more than once across the whole corpus
temp_Context = pd.Series(' '.join(data['Context']).split()).value_counts()
drops = temp_Context[temp_Context <= 1]
# 'x not in drops' tests membership against the index of drops, i.e. the rare words
data['Context'] = data['Context'].apply(lambda x: " ".join(x for x in x.split() if x not in drops))

"""## Text Visualization"""

# Term frequency of each word in Context
tf_Context = data["Context"].apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis=0).reset_index()
tf_Context.columns = ["words", "tf"]
tf_Context = tf_Context.sort_values("tf", ascending=False)

# Barplot for Context

tf_Context[tf_Context["tf"] > 300].plot.bar(x="words", y="tf")
plt.show()

"""# Emotion Analysis"""

from transformers import pipeline

# The cleaned 'Context' column
contexts = data['Context']

# Load a pre-trained emotion detection model
emotion_model = pipeline('sentiment-analysis', model='j-hartmann/emotion-english-distilroberta-base')

# Analyze emotions in 'Context'; truncation=True keeps long contexts within
# the model's maximum input length
emotions = contexts.apply(lambda x: emotion_model(x, truncation=True)[0]['label'])

# Add detected emotions as a new column
data['Detected_Emotion'] = emotions

data.head()

data['Detected_Emotion'].value_counts()
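
# Illustrative check (not in the original notebook): inspect the full score
# distribution for one context rather than just the top label. On recent
# transformers versions, top_k=None returns scores for every class (older
# versions use return_all_scores=True instead).
print(emotion_model(contexts.iloc[0], top_k=None, truncation=True))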

"""# Feature Extraction"""

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the cleaned contexts
tfidf_matrix = vectorizer.fit_transform(contexts)

# Convert to a dense array (if needed)
tfidf_array = tfidf_matrix.toarray()
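
# Quick shape check (illustrative addition): rows are documents, columns are
# the vocabulary terms learned by the vectorizer.
print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Sample features:", vectorizer.get_feature_names_out()[:10])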

"""# Model

## Data Splitting
"""

from sklearn.model_selection import train_test_split

# Split the data
X_train, X_test, y_train, y_test = train_test_split(tfidf_array, data['Detected_Emotion'], test_size=0.3, random_state=42)
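
# Class-balance check (illustrative addition): pipeline-detected emotion
# labels are often skewed, which is worth knowing before reading the
# per-class metrics later.
print(y_train.value_counts(normalize=True))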

from sklearn.ensemble import RandomForestClassifier

# Initialize the model
model = RandomForestClassifier()

"""## Fine Tuning"""

from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30]
}

# Perform grid search
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters
print(f'Best parameters: {grid_search.best_params_}')

"""# Train and Evaluation

## Train
"""

# Train the model with the best parameters found by the grid search
model = RandomForestClassifier(**grid_search.best_params_)
model.fit(X_train, y_train)

"""## Evaluation"""

import seaborn as sns
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix with an explicit label order so the heatmap
# tick labels line up with the matrix rows and columns
labels = np.unique(np.concatenate([y_test, y_pred]))
conf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
print("Confusion Matrix:")
print(conf_matrix)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

"""## Test Unseen Data"""

# Example new text
new_text = ["let's leave i am scared"]

# Clean and transform the new text
new_text_cleaned = [clean_text(text) for text in new_text]
new_text_tfidf = vectorizer.transform(new_text_cleaned)

# Predict emotion
predicted_emotion = model.predict(new_text_tfidf)
print(predicted_emotion)
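
# A small convenience wrapper (illustrative addition, not in the original
# notebook) bundling the cleaning, TF-IDF transform, and prediction steps
# used above into a single call.
def predict_emotion(text):
    cleaned = clean_text(text)
    features = vectorizer.transform([cleaned])
    return model.predict(features)[0]

print(predict_emotion("let's leave i am scared"))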