rajkhanke commited on
Commit
084c216
·
verified ·
1 Parent(s): 30439a0

Upload 6 files

Browse files
Files changed (6) hide show
  1. DVCarFraudDetection.csv +0 -0
  2. Dockerfile +9 -0
  3. RFModel.pkl +3 -0
  4. X_train.csv +0 -0
  5. app.py +690 -0
  6. requirements.txt +16 -0
DVCarFraudDetection.csv ADDED
The diff for this file is too large to render. See raw diff
 
Dockerfile ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9

WORKDIR /app

# Install dependencies first so the pip layer is cached until
# requirements.txt itself changes (copying all sources first busts the
# cache on every code edit).
COPY requirements.txt .
RUN python3 -m pip install --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# COPY is preferred over ADD for plain local files.
COPY . /app

# Unbuffered stdout/stderr so logs appear immediately in `docker logs`.
ENV PYTHONUNBUFFERED=1

# Flask's default development port.
EXPOSE 5000

CMD ["python", "app.py"]
RFModel.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ca585f9b657db88ecff3b835aeb36c63e36615ef8e5989069450a3c65bde044
3
+ size 41657465
X_train.csv ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,690 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, request
2
+ import matplotlib.pyplot as plt
3
+ import pandas as pd
4
+ from joblib import load
5
+ import seaborn as sns
6
+ import io
7
+ from wordcloud import WordCloud
8
+ import base64
9
+ import string
10
+ import nltk
11
+ from nltk.corpus import stopwords
12
+ from nltk.tokenize import word_tokenize
13
+ from nltk.stem import WordNetLemmatizer
14
+ from google_play_scraper import app, Sort, reviews_all
15
+ from nltk.sentiment.vader import SentimentIntensityAnalyzer
16
+ from nltk.corpus import stopwords
17
+ from collections import Counter
18
+ from matplotlib.sankey import Sankey
19
+ import networkx as nx
20
+
21
+ app = Flask(__name__)
22
+
23
def preprocess_text(text):
    """Normalize raw review text for NLP.

    Lowercases, strips all punctuation, tokenizes, removes English
    stopwords and lemmatizes each remaining token.

    Returns the cleaned, space-joined text, or '' when *text* is None.
    """
    if text is None:
        return ''
    # Lowercase and strip punctuation in a single translation pass.
    cleaned = text.lower().translate(str.maketrans('', '', string.punctuation))
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Filter stopwords first, then lemmatize what remains.
    kept = [lemmatizer.lemmatize(token)
            for token in word_tokenize(cleaned)
            if token not in stop_words]
    return ' '.join(kept)
42
+
43
def preprocess_dataframe(df):
    """Reduce a raw google-play-scraper review frame to the columns the
    sentiment pipeline needs.

    Drops identity/metadata columns, derives reply month/year and an
    'IsReplied' Yes/No flag, and returns a frame with exactly the columns
    ['content', 'score', 'IsReplied']. Note: mutates *df* in place via the
    column drops before returning the trimmed selection.
    """
    # Identity/metadata columns are never used downstream.
    df.drop(['userName', 'reviewId', 'userImage', 'reviewCreatedVersion', 'at'],
            axis=1, inplace=True)

    # Derive reply month/year, then discard the raw timestamp.
    df['repliedAt'] = pd.to_datetime(df['repliedAt'])
    df['RepliedMonth'] = df['repliedAt'].dt.month
    df['RepliedYear'] = df['repliedAt'].dt.year
    df.drop('repliedAt', axis=1, inplace=True)

    # Binary flag: did the developer post a non-empty reply?
    # isinstance guard fixes a crash: a missing reply can surface as NaN
    # (a float), which is truthy, so the original `x and x.strip()` raised
    # AttributeError. None and NaN now both map to 'No'.
    df['IsReplied'] = df['replyContent'].apply(
        lambda x: 'Yes' if isinstance(x, str) and x.strip() != '' else 'No')
    df.drop('replyContent', axis=1, inplace=True)

    # Plain column assignment instead of chained-assignment inplace fillna,
    # which is deprecated/ineffective in pandas 2.x.
    df['appVersion'] = df['appVersion'].fillna('0')

    # Keep only the fields the model/visualizations consume.
    return df[['content', 'score', 'IsReplied']]
70
+
71
def analyze_sentiment(text, score):
    """Classify a review as positive/negative/neutral.

    Combines VADER's compound polarity with the star rating: a label is
    only assigned when both signals agree (compound >= 0.05 with >= 3
    stars, or compound <= -0.05 with < 3 stars); everything else is
    'neutral'.
    """
    compound = SentimentIntensityAnalyzer().polarity_scores(text)['compound']
    if compound >= 0.05 and score >= 3:
        return 'positive'
    if compound <= -0.05 and score < 3:
        return 'negative'
    return 'neutral'
83
+
84
@app.route('/predict/app', methods=['POST'])
def predict_appFraud():
    """Scrape all Play Store reviews for the submitted app, classify each
    review's sentiment and render a fraud verdict with summary statistics
    and six base64-encoded PNG charts.

    Form fields: 'app-id' (Play Store package id) and 'app-name'.
    """
    app_id = request.form['app-id']
    app_name = request.form['app-name']

    # Fix: the scraper expects an ISO-639-1 language code ('en'), not 'Eng'.
    reviews = reviews_all(app_id, sleep_milliseconds=0, lang="en", country="in", sort=Sort.NEWEST)
    df = pd.json_normalize(reviews)

    # Trim to content / score / IsReplied.
    df = preprocess_dataframe(df)

    # VADER sentiment combined with the star rating, per review.
    df['sentiment'] = df.apply(lambda row: analyze_sentiment(row['content'], row['score']), axis=1)

    positive_reviews = (df['sentiment'] == 'positive').sum()
    negative_reviews = (df['sentiment'] == 'negative').sum()
    neutral_reviews = (df['sentiment'] == 'neutral').sum()
    total_reviews = len(df)

    # Majority vote between positive and negative reviews decides the verdict.
    if positive_reviews > negative_reviews:
        result = "The App is Not Fraud"
    else:
        result = "The App is Fraud"

    # Guard against apps with zero scraped reviews: the original divided by
    # total_reviews unconditionally (ZeroDivisionError).
    if total_reviews:
        average_rating = round(df['score'].mean(), 2)
        positive_percentage = round((positive_reviews / total_reviews) * 100, 2)
        negative_percentage = round((negative_reviews / total_reviews) * 100, 2)
        neutral_percentage = round((neutral_reviews / total_reviews) * 100, 2)
        replied_percentage = round((df['IsReplied'] == 'Yes').mean() * 100, 2)
    else:
        average_rating = 0.0
        positive_percentage = negative_percentage = neutral_percentage = 0.0
        replied_percentage = 0.0

    # 1. Sentiment share pie chart.
    reviews_counts = df['sentiment'].value_counts()
    plt.figure(figsize=(6, 4))
    plt.pie(reviews_counts, labels=reviews_counts.index, colors=['red', 'green', 'blue'],
            autopct='%1.1f%%', startangle=140)
    plt.title('Percentage of Reviews in Fraud App')
    buffer_data1 = save_plot_to_buffer(plt.gcf())

    # 2. Count of each sentiment class.
    plt.figure(figsize=(6, 4))
    sns.countplot(x='sentiment', data=df,
                  palette={'positive': 'green', 'negative': 'red', 'neutral': 'blue'})
    plt.title('Count of Each Review Type in Fraud App')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    buffer_data2 = save_plot_to_buffer(plt.gcf())

    # 3. Rating histogram stacked by sentiment.
    plt.figure(figsize=(6, 4))
    sns.histplot(data=df, x='score', hue='sentiment', multiple='stack', bins=20)
    plt.title('Histogram of Rating for Each Review Type in Fraud App')
    plt.xlabel('Score')
    plt.ylabel('Count')
    buffer_data3 = save_plot_to_buffer(plt.gcf())

    # 4. Developer-reply share pie chart.
    replied_counts = df['IsReplied'].value_counts()
    plt.figure(figsize=(6, 4))
    plt.pie(replied_counts, labels=replied_counts.index, autopct='%1.1f%%',
            startangle=140, colors=['lightgreen', 'lightcoral'])
    plt.title('Percentage of Replies in Fraud App Reviews')
    buffer_data4 = save_plot_to_buffer(plt.gcf())

    # 5. Rating distribution per sentiment (violin).
    plt.figure(figsize=(6, 4))
    sns.violinplot(x='sentiment', y='score', data=df,
                   palette={'positive': 'green', 'negative': 'red', 'neutral': 'blue'})
    plt.title('Violin Plot of Review vs Rating in Fraud App')
    plt.xlabel('Sentiment')
    plt.ylabel('Score')
    buffer_data5 = save_plot_to_buffer(plt.gcf())

    # 6. Sentiment vs reply status. catplot creates its own figure, so the
    #    original's preceding plt.figure() leaked an empty figure per request.
    sns.catplot(x='sentiment', kind='count', hue='IsReplied', data=df,
                palette='Set1', height=4, aspect=1)
    plt.title('Sentiments vs Review Reply Status')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.tight_layout()
    buffer_data6 = save_plot_to_buffer(plt.gcf())

    return render_template('app_result.html', result=result, app_name=app_name,
                           total_reviews=total_reviews, positive_reviews=positive_reviews,
                           negative_reviews=negative_reviews, neutral_reviews=neutral_reviews,
                           average_rating=average_rating, positive_percentage=positive_percentage,
                           negative_percentage=negative_percentage, neutral_percentage=neutral_percentage,
                           replied_percentage=replied_percentage, plot1=buffer_data1, plot2=buffer_data2,
                           plot3=buffer_data3, plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6)
200
+
201
# Load the pre-trained model
# Random Forest classifier trained offline and serialized with joblib.
best_rf_classifier = load('RFModel.pkl')

# Load X_train
# Training-set feature matrix; used only for its column layout so that the
# one-hot-encoded user input can be aligned to the model's feature order.
X_train = pd.read_csv('X_train.csv')

# Load the dataset
# Full insurance-claims dataset backing the visualization routes.
df = pd.read_csv('DVCarFraudDetection.csv')
209
+
210
@app.route('/')
def index():
    """Render the application landing page."""
    return render_template('index.html')
213
+
214
@app.route('/vehicle_insurance')
def vehicle_insurance():
    """Render the vehicle-insurance fraud prediction form."""
    return render_template('vehicle.html')
217
+
218
@app.route('/predict/insurance')
def predict_insurance():
    """GET fallback for /predict/insurance: re-render the form.

    The POST handler registered for the same URL (``make_prediction``)
    performs the actual prediction.
    """
    return render_template('vehicle.html')
221
+
222
@app.route('/dataset')
def dataset_display():
    """Render the dataset page: the insurance-claims table plus ten
    exploratory charts, each passed to the template as a base64 PNG."""
    # 1. Claims per car company.
    fig1, ax1 = plt.subplots(figsize=(6, 4))
    sns.countplot(y='CarCompany', data=df, ax=ax1)
    buffer_data1 = save_plot_to_buffer(fig1)

    # 2. Base policy vs fraud outcome.
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='BasePolicy', hue='IsFraud', data=df, palette={0: 'green', 1: 'red'}, ax=ax2)
    buffer_data2 = save_plot_to_buffer(fig2)

    # 3. Past number of claims share.
    fig3, ax3 = plt.subplots(figsize=(6, 4))
    past_claims_counts = df['PastNumberOfClaims'].value_counts()
    ax3.pie(past_claims_counts, labels=past_claims_counts.index, autopct='%1.1f%%')
    ax3.set_title('Past Number of Claims Count')
    buffer_data3 = save_plot_to_buffer(fig3)

    # 4. Address change vs fraud.
    fig4, ax4 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='IsAddressChanged', hue='IsFraud', data=df, palette={0: 'green', 1: 'red'}, ax=ax4)
    ax4.set_title('Address Change and Fraud Distribution')
    ax4.set_xlabel('Is Address Changed?')
    ax4.set_ylabel('Count')
    ax4.legend(title='Is Fraud')
    buffer_data4 = save_plot_to_buffer(fig4)

    # 5. Car company vs owner gender heatmap.
    fig5, ax5 = plt.subplots(figsize=(6, 4))
    heatmap_data = df.groupby(['CarCompany', 'OwnerGender']).size().unstack()
    sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', fmt='.2f', ax=ax5)
    ax5.set_title('Car Company vs Owner Gender')
    ax5.set_xlabel('Owner Gender')
    ax5.set_ylabel('Car Company')
    plt.yticks(rotation=0)
    plt.tight_layout()
    buffer_data5 = save_plot_to_buffer(fig5)

    # 6. Number of supplements share (title typo fixed).
    fig6, ax6 = plt.subplots(figsize=(6, 4))
    num_supplements_counts = df['NumberOfSuppliments'].value_counts()
    ax6.pie(num_supplements_counts, labels=num_supplements_counts.index, autopct='%1.1f%%')
    ax6.set_title('Number of Supplements Count')
    buffer_data6 = save_plot_to_buffer(fig6)

    # 7. Police report filed vs fraud.
    fig7, ax7 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='PoliceReportFiled', hue='IsFraud', data=df, ax=ax7)
    buffer_data7 = save_plot_to_buffer(fig7)

    # 8. Owner age by gender (violin).
    fig8, ax8 = plt.subplots(figsize=(6, 4))
    sns.violinplot(x='OwnerGender', y='OwnerAge', data=df,
                   palette={'Male': 'blue', 'Female': 'pink'}, ax=ax8)
    buffer_data8 = save_plot_to_buffer(fig8)

    # 9. Owner age vs number of supplements (scatter).
    fig9, ax9 = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x='OwnerAge', y='NumberOfSuppliments', data=df, ax=ax9)
    ax9.set_title('Scatter Plot of OwnerAge vs NumberOfSuppliments')
    plt.tight_layout()
    buffer_data9 = save_plot_to_buffer(fig9)

    # 10. Car price by category (box).
    fig10, ax10 = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='CarCategory', y='CarPrice', data=df, ax=ax10)
    buffer_data10 = save_plot_to_buffer(fig10)

    # Fix: reuse the module-level DataFrame instead of re-reading the CSV
    # from a nonexistent 'env' subdirectory path (a leftover local path
    # that breaks inside the Docker container).
    return render_template('dataset.html', df=df, plot1=buffer_data1, plot2=buffer_data2,
                           plot3=buffer_data3, plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6,
                           plot7=buffer_data7, plot8=buffer_data8, plot9=buffer_data9, plot10=buffer_data10)
328
+
329
+
330
@app.route('/predict/insurance', methods=['POST'])
def make_prediction():
    """Predict insurance fraud from the submitted claim form, then render
    the verdict together with eight dataset charts for context."""
    # Collect and type-convert the form fields.
    CarCompany = request.form['CarCompany']
    AccidentArea = request.form['AccidentArea']
    OwnerGender = request.form['OwnerGender']
    OwnerAge = int(request.form['OwnerAge'])
    Fault = request.form['Fault']
    CarCategory = request.form['CarCategory']
    CarPrice = int(request.form['CarPrice'])
    PoliceReportFiled = request.form['PoliceReportFiled']
    WitnessPresent = request.form['WitnessPresent']
    AgentType = request.form['AgentType']
    NumberOfSuppliments = int(request.form['NumberOfSuppliments'])
    BasePolicy = request.form['BasePolicy']
    IsAddressChanged = request.form['IsAddressChanged']
    PastNumberOfClaims = int(request.form['PastNumberOfClaims'])

    # Scale the car price the same way the training data was scaled.
    car_price = CarPrice / 10
    user_input = {
        'CarCompany': [CarCompany],
        'AccidentArea': [AccidentArea],
        'OwnerGender': [OwnerGender],
        'OwnerAge': [OwnerAge],
        'Fault': [Fault],
        'CarCategory': [CarCategory],
        'CarPrice': [car_price],
        'PoliceReportFiled': [PoliceReportFiled],
        'WitnessPresent': [WitnessPresent],
        'AgentType': [AgentType],
        'NumberOfSuppliments': [NumberOfSuppliments],
        'BasePolicy': [BasePolicy],
        'IsAddressChanged': [IsAddressChanged],
        'PastNumberOfClaims': [PastNumberOfClaims]
    }
    user_df = pd.DataFrame(user_input)

    # One-hot encode and align to the training feature columns; categories
    # absent from this single row are filled with 0.
    processed_user_input = pd.get_dummies(user_df).reindex(columns=X_train.columns, fill_value=0)

    prediction = best_rf_classifier.predict(processed_user_input)
    result = "Fraud in Insurance" if prediction[0] == 1 else "No Fraud in Insurance"

    # Context charts rendered from the full dataset (base64 PNGs).
    # 1. Owner gender vs fraud.
    fig1, ax1 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='OwnerGender', hue='IsFraud', data=df, ax=ax1)
    buffer_data1 = save_plot_to_buffer(fig1)

    # 2. Car price by category (violin).
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    sns.violinplot(x='CarCategory', y='CarPrice', data=df, ax=ax2)
    buffer_data2 = save_plot_to_buffer(fig2)

    # 3. Agent type vs fraud.
    fig3, ax3 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='AgentType', hue='IsFraud', data=df, ax=ax3)
    buffer_data3 = save_plot_to_buffer(fig3)

    # 4. Base-policy share among fraudulent claims.
    fig4, ax4 = plt.subplots(figsize=(6, 4))
    policy_fraud_counts = df[df['IsFraud'] == 1]['BasePolicy'].value_counts()
    ax4.pie(policy_fraud_counts, labels=policy_fraud_counts.index, autopct='%1.1f%%')
    buffer_data4 = save_plot_to_buffer(fig4)

    # 5. Car price by fraud status. Fix: the original drew two overlapping
    #    single-category boxplots (fraud-only then non-fraud-only subsets,
    #    both landing at position 0); one call over the full frame draws the
    #    two categories side by side. The dead legend code (empty handles)
    #    is dropped.
    fig5, ax5 = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='IsFraud', y='CarPrice', data=df, ax=ax5)
    ax5.set_xlabel('Fraud Status')
    ax5.set_ylabel('Car Price')
    ax5.set_title('Box Plot of Car Price for Fraud and Non-Fraud Cases')
    buffer_data5 = save_plot_to_buffer(fig5)

    # 6. Histogram of past claims counts.
    fig6, ax6 = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x='PastNumberOfClaims',
                 bins=range(max(df['PastNumberOfClaims']) + 2), kde=False, ax=ax6)
    ax6.set_ylabel('Fraud cases count')
    buffer_data6 = save_plot_to_buffer(fig6)

    # 7. Car-category share among fraudulent claims.
    fig7, ax7 = plt.subplots(figsize=(6, 4))
    category_fraud_counts = df[df['IsFraud'] == 1]['CarCategory'].value_counts()
    ax7.pie(category_fraud_counts, labels=category_fraud_counts.index, autopct='%1.1f%%')
    buffer_data7 = save_plot_to_buffer(fig7)

    # 8. Past claims vs fraud.
    fig8, ax8 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='PastNumberOfClaims', hue='IsFraud', data=df, ax=ax8)
    buffer_data8 = save_plot_to_buffer(fig8)

    return render_template('prediction_result.html', result=result, plot1=buffer_data1, plot2=buffer_data2,
                           plot3=buffer_data3, plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6,
                           plot7=buffer_data7, plot8=buffer_data8)
460
+
461
@app.route("/predict/app")
def predict_app():
    """GET fallback for /predict/app: render the scrape form.

    The POST handler registered for the same URL (``predict_appFraud``)
    performs the scrape and classification.
    """
    return render_template('fraudapp.html')
464
+
465
+
466
@app.route("/mobile_app")
def mobile_app():
    """Render the mobile-app fraud detection form."""
    return render_template('fraudapp.html')
469
+
470
+
471
@app.route('/analysis/app')
def analysis_app():
    """Render the app-review analysis form (GET)."""
    return render_template('app_analysis.html')
474
+
475
@app.route('/analysis/app', methods=['POST'])
def analysisresult_app():
    """Scrape reviews for the submitted app and render six analysis charts:
    word cloud, top proper nouns, unanswered-review sentiment, two heatmaps
    and a stacked score histogram."""
    app_id = request.form['app-id']
    app_name = request.form['app-name']

    # Fix: the scraper expects an ISO-639-1 language code ('en'), not 'Eng'.
    reviews = reviews_all(app_id, sleep_milliseconds=0, lang="en", country="in", sort=Sort.NEWEST)
    df = pd.json_normalize(reviews)

    # Trim to content / score / IsReplied, then label sentiment per review.
    df = preprocess_dataframe(df)
    df['sentiment'] = df.apply(lambda row: analyze_sentiment(row['content'], row['score']), axis=1)

    # 1. Word cloud over all review text.
    text = ' '.join(df['content'].astype(str).tolist())
    wordcloud = WordCloud(width=600, height=400, background_color='white').generate(text)
    img_buffer1 = save_wordcloud_to_buffer(wordcloud)

    # NLTK stopwords extended with the standard English closed-class words so
    # capitalized sentence starters are not mistaken for proper nouns.
    stop_words = set(stopwords.words('english'))
    additional_stopwords = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'])
    stop_words.update(additional_stopwords)

    # 2. Ten most repeated capitalized words (proper-noun proxy).
    #    astype(str) fixes a crash: a None/NaN review body has no .split().
    proper_nouns = []
    for review in df['content'].astype(str):
        for word in review.split():
            if word.istitle() and word.isalpha() and word.lower() not in stop_words:
                proper_nouns.append(word)
    top_proper_nouns = Counter(proper_nouns).most_common(10)
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    sns.countplot(y=proper_nouns, order=[word[0] for word in top_proper_nouns], palette='viridis', ax=ax2)
    ax2.set_title('Count Plot of 10 Most Repeated Proper Nouns')
    ax2.set_xlabel('Count')
    buffer2 = save_plot_to_buffer(fig2)

    # 3. Sentiment mix among reviews the developer never answered.
    fig3, ax3 = plt.subplots(figsize=(6, 4))
    is_replied_no_df = df[df['IsReplied'] == 'No']
    sentiment_counts = is_replied_no_df['sentiment'].value_counts()
    ax3.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90, colors=['green', 'red', 'blue'])
    ax3.set_title('Pie Chart of Sentiment Distribution for IsReplied NO')
    buffer3 = save_plot_to_buffer(fig3)

    # 4. Mean score by (review length, sentiment) heatmap.
    df['review_length'] = df['content'].astype(str).apply(lambda x: len(x.split()))
    sentiment_distribution = df.pivot_table(index='review_length', columns='sentiment', values='score', aggfunc='mean')
    fig4, ax4 = plt.subplots(figsize=(6, 4))
    sns.heatmap(sentiment_distribution, cmap='YlGnBu', linewidths=0.5, ax=ax4)
    ax4.set_title('Sentiment Distribution Heatmap')
    ax4.set_xlabel('Sentiment')
    ax4.set_ylabel('Review Length')
    buffer4 = save_plot_to_buffer(fig4)

    # 5. Correlation between review length and rating (reuses review_length
    #    instead of recomputing the same word counts).
    word_freq = pd.DataFrame({'Word Length': df['review_length'], 'Rating': df['score']})
    fig5, ax5 = plt.subplots(figsize=(6, 4))
    sns.heatmap(word_freq.corr(), annot=True, cmap='coolwarm', ax=ax5)
    ax5.set_title('Heatmap of Word Length vs Rating')
    buffer5 = save_plot_to_buffer(fig5)

    # 6. Stacked score histogram by sentiment.
    fig6, ax6 = plt.subplots(figsize=(6, 4))
    sns.histplot(data=df, x='score', hue='sentiment', multiple='stack', palette='husl', ax=ax6)
    ax6.set_title('Joint Count Plot of Score for Positive, Negative, and Neutral')
    ax6.set_xlabel('Score')
    ax6.set_ylabel('Count')
    buffer6 = save_plot_to_buffer(fig6)

    return render_template('app_analysis_final.html', df=df, app_name=app_name,
                           buffer1=img_buffer1, buffer2=buffer2, buffer3=buffer3,
                           buffer4=buffer4, buffer5=buffer5, buffer6=buffer6)
558
+
559
+
560
# Function to save plot to buffer
def save_plot_to_buffer(fig):
    """Render *fig* to PNG and return it as a base64-encoded string,
    closing the figure afterwards to release its memory."""
    with io.BytesIO() as png:
        fig.savefig(png, format='png')
        encoded = base64.b64encode(png.getvalue()).decode()
    plt.close(fig)
    return encoded
568
+
569
# Function to save WordCloud image to buffer
def save_wordcloud_to_buffer(wordcloud):
    """Convert a WordCloud object to a PNG image and return it as a
    base64-encoded string."""
    image = wordcloud.to_image()
    with io.BytesIO() as png:
        image.save(png, format='PNG')
        encoded = base64.b64encode(png.getvalue()).decode()
    return encoded
578
+
579
+
580
@app.route('/analysis/insurance')
def analysis_insurance():
    """Render twelve exploratory charts over the module-level insurance
    dataset, each passed to the template as a base64-encoded PNG."""
    # Generate visualizations
    # Visualization 1: Distribution of Car Prices
    fig1, ax1 = plt.subplots(figsize=(6, 4))
    sns.histplot(df['CarPrice'], kde=True, color='skyblue', ax=ax1)
    ax1.set_title('Distribution of Car Prices')
    ax1.set_xlabel('Car Price')
    ax1.set_ylabel('Frequency')
    buffer1 = save_plot_to_buffer(fig1)

    # Visualization 2: Distribution of Owner Ages
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    sns.histplot(df['OwnerAge'], kde=True, color='salmon', ax=ax2)
    ax2.set_title('Distribution of Owner Ages')
    ax2.set_xlabel('Owner Age')
    ax2.set_ylabel('Frequency')
    buffer2 = save_plot_to_buffer(fig2)

    # Visualization 3: Count of Claims by Car Category (split by fraud flag)
    fig3, ax3 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='CarCategory', hue='IsFraud', data=df, palette='coolwarm', ax=ax3)
    ax3.set_title('Count of Claims by Car category')
    ax3.set_xlabel('Car category')
    ax3.set_ylabel('Count')
    buffer3 = save_plot_to_buffer(fig3)

    # Visualization 4: Distribution of Car Prices by Fraud Status
    fig4, ax4 = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='IsFraud', y='CarPrice', data=df, palette='Set2', ax=ax4)
    ax4.set_title('Distribution of Car Prices by Fraud Status')
    ax4.set_xlabel('Fraud Status')
    ax4.set_ylabel('Car Price')
    buffer4 = save_plot_to_buffer(fig4)

    # Visualization 5: Count of Claims by Accident Area
    fig5, ax5 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='AccidentArea', hue='IsFraud', data=df, palette='husl', ax=ax5)
    ax5.set_title('Count of Claims by Accident Area')
    ax5.set_xlabel('Accident Area')
    ax5.set_ylabel('Count')
    buffer5 = save_plot_to_buffer(fig5)

    # Visualization 6: Distribution of Number of Supplements
    fig6, ax6 = plt.subplots(figsize=(6, 4))
    sns.histplot(df['NumberOfSuppliments'], kde=True, color='orange', ax=ax6)
    ax6.set_title('Distribution of Number of Supplements')
    ax6.set_xlabel('Number of Supplements')
    ax6.set_ylabel('Frequency')
    buffer6 = save_plot_to_buffer(fig6)

    # Visualization 7: Count of Claims by Witness Presence
    fig7, ax7 = plt.subplots(figsize=(6, 4))
    sns.countplot(x='WitnessPresent', hue='IsFraud', data=df, palette='viridis', ax=ax7)
    ax7.set_title('Count of Claims by Witness Presence')
    ax7.set_xlabel('Witness Presence')
    ax7.set_ylabel('Count')
    buffer7 = save_plot_to_buffer(fig7)

    # Visualization 8: Distribution of Past Number of Claims
    fig8, ax8 = plt.subplots(figsize=(6, 4))
    sns.histplot(df['PastNumberOfClaims'], kde=True, color='purple', ax=ax8)
    ax8.set_title('Distribution of Past Number of Claims')
    ax8.set_xlabel('Past Number of Claims')
    ax8.set_ylabel('Frequency')
    buffer8 = save_plot_to_buffer(fig8)

    # Visualization 9: correlation heatmap over the numeric columns only
    # (string columns would break .corr()).
    numeric_columns = df.select_dtypes(include='number')

    # Compute the correlation matrix
    corr = numeric_columns.corr()

    # Create the heatmap
    fig9, ax9 = plt.subplots(figsize=(6.5, 4.5))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax9)
    ax9.set_title('Heatmap of Correlation Matrix')
    buffer9 = save_plot_to_buffer(fig9)

    # Visualization 10: Network Graph of Car Brands and Fraud Status
    # NOTE(review): edges connect each CarCompany node to the 0/1 IsFraud
    # node it co-occurs with in some row; duplicate pairs collapse into one
    # edge.
    fig10, ax10 = plt.subplots(figsize=(6, 4))
    G = nx.from_pandas_edgelist(df, 'CarCompany', 'IsFraud')
    nx.draw(G, with_labels=True, node_color='skyblue', node_size=2000, font_size=10, ax=ax10)
    ax10.set_title('Network Graph of Car Brands and Fraud Status')
    buffer10 = save_plot_to_buffer(fig10)

    # Visualization 11: Violin Plot of Accident Area and Car Price
    fig11, ax11 = plt.subplots(figsize=(6, 4))
    sns.violinplot(x='AccidentArea', y='CarPrice', data=df, hue='IsFraud', split=True, palette='husl', ax=ax11)
    ax11.set_title('Violin Plot of Accident Area and Car Price')
    buffer11 = save_plot_to_buffer(fig11)

    # Visualization 12: hexbin density of car price vs owner age.
    fig12, ax12 = plt.subplots(figsize=(6, 4))
    hb = ax12.hexbin(df['CarPrice'], df['OwnerAge'], gridsize=50, cmap='inferno')
    ax12.set_title('Hexbin Plot of Car Prices and Owner Ages')
    ax12.set_xlabel('Car Price')
    ax12.set_ylabel('Owner Age')
    cb = fig12.colorbar(hb, ax=ax12)
    cb.set_label('Frequency')
    buffer12 = save_plot_to_buffer(fig12)

    # Return render template with the additional plots
    return render_template('insurance_analysis.html', plot1=buffer1, plot2=buffer2,
                           plot3=buffer3, plot4=buffer4, plot5=buffer5, plot6=buffer6,
                           plot7=buffer7, plot8=buffer8, plot9=buffer9, plot10=buffer10,
                           plot11=buffer11, plot12=buffer12)
685
+
686
+
687
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside the
    # Docker container (the default 127.0.0.1 binding is unreachable from
    # the host). Debug mode is opt-in via FLASK_DEBUG: the Werkzeug
    # debugger must never be exposed in production (remote code execution).
    import os
    debug = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(host="0.0.0.0", port=int(os.environ.get("PORT", "5000")), debug=debug)
689
+
690
+
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Flask
2
+ matplotlib
3
+ gunicorn
4
+ pandas
5
+ joblib
6
+ seaborn
7
+ wordcloud
8
+ nltk
9
+ google-play-scraper
10
+ networkx
11
+ # (duplicate gunicorn entry removed; gunicorn is already listed above)
12
+ plotly
13
+ scikit-learn==1.2.2
14
+ numpy==1.25.2
15
+ beautifulsoup4
16
+ requests