|
from flask import Flask, render_template, request |
|
import matplotlib.pyplot as plt |
|
import pandas as pd |
|
from joblib import load |
|
import seaborn as sns |
|
import io |
|
from wordcloud import WordCloud |
|
import base64 |
|
import string |
|
import nltk |
|
from nltk.corpus import stopwords |
|
from nltk.tokenize import word_tokenize |
|
from nltk.stem import WordNetLemmatizer |
|
from google_play_scraper import Sort, reviews_all  # 'app' is not imported so it cannot shadow the Flask app below
|
from nltk.sentiment.vader import SentimentIntensityAnalyzer |
|
from collections import Counter |
|
import networkx as nx |
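
# Environment setup: a minimal sketch added as an assumption (not part of the original code).
# Matplotlib may default to a GUI backend that cannot run inside Flask worker threads, and
# the NLTK resources used below (tokenizer, stopwords, WordNet, VADER lexicon) must be present.
import matplotlib
matplotlib.use('Agg')  # render figures off-screen for the web server

for _resource in ('punkt', 'stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(_resource, quiet=True)  # no-op if the resource is already installed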
|
|
|
app = Flask(__name__) |
|
|
|
def preprocess_text(text): |
|
if text is not None: |
|
|
|
text = text.lower() |
|
|
|
text = text.translate(str.maketrans('', '', string.punctuation)) |
|
|
|
tokens = word_tokenize(text) |
|
|
|
stop_words = set(stopwords.words('english')) |
|
tokens = [word for word in tokens if word not in stop_words] |
|
|
|
lemmatizer = WordNetLemmatizer() |
|
tokens = [lemmatizer.lemmatize(word) for word in tokens] |
|
|
|
preprocessed_text = ' '.join(tokens) |
|
return preprocessed_text |
|
else: |
|
return '' |
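
# Note: preprocess_text() is not called by the request handlers below (VADER is applied to the
# raw review text); a rough usage sketch:
#   preprocess_text("Great app, loved it!")  # -> roughly "great app loved"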
|
|
|
def preprocess_dataframe(df): |
|
|
|
df.drop(['userName', 'reviewId', 'userImage', 'reviewCreatedVersion', 'at'], axis=1, inplace=True) |
|
|
|
|
|
df['repliedAt'] = pd.to_datetime(df['repliedAt']) |
|
|
|
|
|
df['RepliedMonth'] = df['repliedAt'].dt.month |
|
df['RepliedYear'] = df['repliedAt'].dt.year |
|
|
|
|
|
df.drop('repliedAt', axis=1, inplace=True) |
|
|
|
|
|
    # Reviews with no developer reply have NaN in replyContent, so guard with pd.notna().
    df['IsReplied'] = df['replyContent'].apply(lambda x: 'Yes' if pd.notna(x) and str(x).strip() != '' else 'No')
|
|
|
|
|
df.drop('replyContent', axis=1, inplace=True) |
|
|
|
|
|
    df['appVersion'] = df['appVersion'].fillna('0')  # avoid chained inplace fillna, which newer pandas deprecates
|
|
|
|
|
df = df[['content', 'score', 'IsReplied']] |
|
|
|
return df |
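
# After preprocess_dataframe() only the columns used downstream remain:
#   content (review text), score (1-5 star rating) and IsReplied ('Yes'/'No').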
|
|
|
def analyze_sentiment(text, score): |
|
|
|
analyzer = SentimentIntensityAnalyzer() |
|
|
|
    sentiment_score = analyzer.polarity_scores(str(text))['compound']  # str() guards against missing review text
|
|
|
if sentiment_score >= 0.05 and score >= 3: |
|
return 'positive' |
|
elif sentiment_score <= -0.05 and score < 3: |
|
return 'negative' |
|
else: |
|
return 'neutral' |
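
# The +/-0.05 cutoffs on the compound score are the thresholds commonly recommended for VADER;
# the star rating acts as a second signal, so a review only counts as positive or negative
# when both agree. Example (assuming the vader_lexicon resource is available):
#   analyze_sentiment("I love this app, it works great", 5)  # -> 'positive'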
|
|
|
@app.route('/predict/app', methods=['POST']) |
|
def predict_appFraud(): |
|
|
|
app_id = request.form['app-id'] |
|
app_name = request.form['app-name'] |
|
|
|
|
|
    reviews = reviews_all(app_id, sleep_milliseconds=0, lang="en", country="in", sort=Sort.NEWEST)  # lang expects an ISO code such as "en"
|
df = pd.json_normalize(reviews) |
|
|
|
|
|
df = preprocess_dataframe(df) |
|
|
|
|
|
df['sentiment'] = df.apply(lambda row: analyze_sentiment(row['content'], row['score']), axis=1) |
|
|
|
positive_count = (df['sentiment'] == 'positive').sum() |
|
negative_count = (df['sentiment'] == 'negative').sum() |
|
|
|
    if positive_count > negative_count:
        result = "The App is Not Fraudulent"
    else:
        result = "The App is Fraudulent"
|
|
|
total_reviews = len(df) |
|
positive_reviews = (df['sentiment'] == 'positive').sum() |
|
negative_reviews = (df['sentiment'] == 'negative').sum() |
|
neutral_reviews = (df['sentiment'] == 'neutral').sum() |
|
average_rating = round(df['score'].mean(), 2) |
|
positive_percentage = round((positive_reviews / total_reviews) * 100, 2) |
|
negative_percentage = round((negative_reviews / total_reviews) * 100, 2) |
|
neutral_percentage = round((neutral_reviews / total_reviews) * 100, 2) |
|
replied_percentage = round((df['IsReplied'] == 'Yes').mean() * 100, 2) |
|
|
|
|
|
|
|
reviews_counts = df['sentiment'].value_counts() |
|
labels = reviews_counts.index |
|
    color_map = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
    colors = [color_map[label] for label in labels]  # keep slice colors aligned with the sentiment labels
|
plt.figure(figsize=(6, 4)) |
|
plt.pie(reviews_counts, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140) |
|
plt.title('Percentage of Reviews in Fraud App') |
|
buffer1 = io.BytesIO() |
|
plt.savefig(buffer1, format='png') |
|
buffer1.seek(0) |
|
buffer_data1 = base64.b64encode(buffer1.getvalue()).decode() |
|
plt.close() |
|
|
|
|
|
plt.figure(figsize=(6, 4)) |
|
sns.countplot(x='sentiment', data=df, palette={'positive': 'green', 'negative': 'red', 'neutral': 'blue'}) |
|
plt.title('Count of Each Review Type in Fraud App') |
|
plt.xlabel('Sentiment') |
|
plt.ylabel('Count') |
|
buffer2 = io.BytesIO() |
|
plt.savefig(buffer2, format='png') |
|
buffer2.seek(0) |
|
buffer_data2 = base64.b64encode(buffer2.getvalue()).decode() |
|
plt.close() |
|
|
|
|
|
plt.figure(figsize=(6, 4)) |
|
sns.histplot(data=df, x='score', hue='sentiment', multiple='stack', bins=20) |
|
plt.title('Histogram of Rating for Each Review Type in Fraud App') |
|
plt.xlabel('Score') |
|
plt.ylabel('Count') |
|
buffer3 = io.BytesIO() |
|
plt.savefig(buffer3, format='png') |
|
buffer3.seek(0) |
|
buffer_data3 = base64.b64encode(buffer3.getvalue()).decode() |
|
plt.close() |
|
|
|
|
|
replied_counts = df['IsReplied'].value_counts() |
|
labels = replied_counts.index |
|
plt.figure(figsize=(6, 4)) |
|
    plt.pie(replied_counts, labels=labels, autopct='%1.1f%%', startangle=140,
            colors=['lightgreen' if label == 'Yes' else 'lightcoral' for label in labels])
|
plt.title('Percentage of Replies in Fraud App Reviews') |
|
buffer4 = io.BytesIO() |
|
plt.savefig(buffer4, format='png') |
|
buffer4.seek(0) |
|
buffer_data4 = base64.b64encode(buffer4.getvalue()).decode() |
|
plt.close() |
|
|
|
|
|
plt.figure(figsize=(6, 4)) |
|
sns.violinplot(x='sentiment', y='score', data=df, palette={'positive': 'green', 'negative': 'red', 'neutral': 'blue'}) |
|
plt.title('Violin Plot of Review vs Rating in Fraud App') |
|
plt.xlabel('Sentiment') |
|
plt.ylabel('Score') |
|
buffer5 = io.BytesIO() |
|
plt.savefig(buffer5, format='png') |
|
buffer5.seek(0) |
|
buffer_data5 = base64.b64encode(buffer5.getvalue()).decode() |
|
plt.close() |
|
|
|
|
|
    # catplot is a figure-level function and creates its own figure, so no plt.figure() call is needed here
    sns.catplot(x='sentiment', kind='count', hue='IsReplied', data=df, palette='Set1', height=4, aspect=1)
|
plt.title('Sentiments vs Review Reply Status') |
|
plt.xlabel('Sentiment') |
|
plt.ylabel('Count') |
|
plt.tight_layout() |
|
buffer6 = io.BytesIO() |
|
plt.savefig(buffer6, format='png') |
|
buffer6.seek(0) |
|
buffer_data6 = base64.b64encode(buffer6.getvalue()).decode() |
|
plt.close() |
|
|
|
|
|
return render_template('app_result.html', result=result, app_name=app_name, |
|
total_reviews=total_reviews, positive_reviews=positive_reviews, |
|
negative_reviews=negative_reviews, neutral_reviews=neutral_reviews, |
|
average_rating=average_rating, positive_percentage=positive_percentage, |
|
negative_percentage=negative_percentage, neutral_percentage=neutral_percentage, replied_percentage=replied_percentage, plot1=buffer_data1, plot2=buffer_data2, |
|
plot3=buffer_data3, plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6) |
|
|
|
|
|
# Artifacts loaded once at startup: the trained Random Forest model, the training
# feature matrix (used to align one-hot encoded columns), and the insurance dataset.
best_rf_classifier = load('RFModel.pkl')
|
|
|
|
|
X_train = pd.read_csv('X_train.csv') |
|
|
|
|
|
df = pd.read_csv('DVCarFraudDetection.csv') |
|
|
|
@app.route('/') |
|
def index(): |
|
return render_template('index.html') |
|
|
|
@app.route('/vehicle_insurance') |
|
def vehicle_insurance(): |
|
return render_template('vehicle.html') |
|
|
|
@app.route('/predict/insurance') |
|
def predict_insurance(): |
|
return render_template('vehicle.html') |
|
|
|
@app.route('/dataset') |
|
def dataset_display(): |
|
|
|
fig1, ax1 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(y='CarCompany', data=df) |
|
buffer1 = io.BytesIO() |
|
plt.savefig(buffer1, format='png') |
|
buffer1.seek(0) |
|
buffer_data1 = base64.b64encode(buffer1.getvalue()).decode() |
|
plt.close(fig1) |
|
|
|
fig2, ax2 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='BasePolicy', hue='IsFraud', data=df, palette={0: 'green', 1: 'red'}) |
|
buffer2 = io.BytesIO() |
|
plt.savefig(buffer2, format='png') |
|
buffer2.seek(0) |
|
buffer_data2 = base64.b64encode(buffer2.getvalue()).decode() |
|
plt.close(fig2) |
|
|
|
fig3, ax3 = plt.subplots(figsize=(6, 4)) |
|
past_claims_counts = df['PastNumberOfClaims'].value_counts() |
|
ax3.pie(past_claims_counts, labels=past_claims_counts.index, autopct='%1.1f%%') |
|
ax3.set_title('Past Number of Claims Count') |
|
buffer3 = io.BytesIO() |
|
plt.savefig(buffer3, format='png') |
|
buffer3.seek(0) |
|
buffer_data3 = base64.b64encode(buffer3.getvalue()).decode() |
|
plt.close(fig3) |
|
|
|
fig4, ax4 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='IsAddressChanged', hue='IsFraud', data=df, palette={0: 'green', 1: 'red'}) |
|
ax4.set_title('Address Change and Fraud Distribution') |
|
ax4.set_xlabel('Is Address Changed?') |
|
ax4.set_ylabel('Count') |
|
plt.legend(title='Is Fraud') |
|
buffer4 = io.BytesIO() |
|
plt.savefig(buffer4, format='png') |
|
buffer4.seek(0) |
|
buffer_data4 = base64.b64encode(buffer4.getvalue()).decode() |
|
plt.close(fig4) |
|
|
|
fig5, ax5 = plt.subplots(figsize=(6, 4)) |
|
heatmap_data = df.groupby(['CarCompany', 'OwnerGender']).size().unstack() |
|
sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', fmt='.2f', ax=ax5) |
|
ax5.set_title('Car Company vs Owner Gender') |
|
ax5.set_xlabel('Owner Gender') |
|
ax5.set_ylabel('Car Company') |
|
plt.yticks(rotation=0) |
|
plt.tight_layout() |
|
buffer5 = io.BytesIO() |
|
plt.savefig(buffer5, format='png') |
|
buffer5.seek(0) |
|
buffer_data5 = base64.b64encode(buffer5.getvalue()).decode() |
|
plt.close(fig5) |
|
|
|
fig6, ax6 = plt.subplots(figsize=(6, 4)) |
|
num_supplements_counts = df['NumberOfSuppliments'].value_counts() |
|
ax6.pie(num_supplements_counts, labels=num_supplements_counts.index, autopct='%1.1f%%') |
|
    ax6.set_title('Number of Supplements Count')
|
buffer6 = io.BytesIO() |
|
plt.savefig(buffer6, format='png') |
|
buffer6.seek(0) |
|
buffer_data6 = base64.b64encode(buffer6.getvalue()).decode() |
|
plt.close(fig6) |
|
|
|
|
|
fig7, ax7 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='PoliceReportFiled', hue='IsFraud', data=df) |
|
buffer7 = io.BytesIO() |
|
plt.savefig(buffer7, format='png') |
|
buffer7.seek(0) |
|
buffer_data7 = base64.b64encode(buffer7.getvalue()).decode() |
|
plt.close(fig7) |
|
|
|
fig8, ax8 = plt.subplots(figsize=(6, 4)) |
|
sns.violinplot(x='OwnerGender', y='OwnerAge', data=df, palette={'Male': 'blue', 'Female': 'pink'}, ax=ax8) |
|
buffer8 = io.BytesIO() |
|
plt.savefig(buffer8, format='png') |
|
buffer8.seek(0) |
|
buffer_data8 = base64.b64encode(buffer8.getvalue()).decode() |
|
plt.close(fig8) |
|
|
|
fig9, ax9 = plt.subplots(figsize=(6, 4)) |
|
sns.scatterplot(x='OwnerAge', y='NumberOfSuppliments', data=df, ax=ax9) |
|
plt.title('Scatter Plot of OwnerAge vs NumberOfSuppliments') |
|
plt.tight_layout() |
|
buffer9 = io.BytesIO() |
|
plt.savefig(buffer9, format='png') |
|
buffer9.seek(0) |
|
buffer_data9 = base64.b64encode(buffer9.getvalue()).decode() |
|
plt.close(fig9) |
|
|
|
|
|
fig10, ax10 = plt.subplots(figsize=(6, 4)) |
|
sns.boxplot(x='CarCategory', y='CarPrice', data=df, ax=ax10) |
|
buffer10 = io.BytesIO() |
|
plt.savefig(buffer10, format='png') |
|
buffer10.seek(0) |
|
buffer_data10 = base64.b64encode(buffer10.getvalue()).decode() |
|
plt.close(fig10) |
|
|
|
|
|
|
|
    return render_template('dataset.html', df=df, plot1=buffer_data1, plot2=buffer_data2,
|
plot3=buffer_data3, plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6, |
|
plot7=buffer_data7, plot8=buffer_data8, plot9=buffer_data9, plot10=buffer_data10) |
|
|
|
|
|
@app.route('/predict/insurance', methods=['POST']) |
|
def make_prediction(): |
|
|
|
CarCompany = request.form['CarCompany'] |
|
AccidentArea = request.form['AccidentArea'] |
|
OwnerGender = request.form['OwnerGender'] |
|
OwnerAge = int(request.form['OwnerAge']) |
|
Fault = request.form['Fault'] |
|
CarCategory = request.form['CarCategory'] |
|
CarPrice = int(request.form['CarPrice']) |
|
PoliceReportFiled = request.form['PoliceReportFiled'] |
|
WitnessPresent = request.form['WitnessPresent'] |
|
AgentType = request.form['AgentType'] |
|
NumberOfSuppliments = int(request.form['NumberOfSuppliments']) |
|
BasePolicy = request.form['BasePolicy'] |
|
IsAddressChanged = request.form['IsAddressChanged'] |
|
PastNumberOfClaims = int(request.form['PastNumberOfClaims']) |
|
|
|
|
|
car_price = CarPrice / 10 |
|
user_input = { |
|
'CarCompany': [CarCompany], |
|
'AccidentArea': [AccidentArea], |
|
'OwnerGender': [OwnerGender], |
|
'OwnerAge': [OwnerAge], |
|
'Fault': [Fault], |
|
'CarCategory': [CarCategory], |
|
'CarPrice': [car_price], |
|
'PoliceReportFiled': [PoliceReportFiled], |
|
'WitnessPresent': [WitnessPresent], |
|
'AgentType': [AgentType], |
|
'NumberOfSuppliments': [NumberOfSuppliments], |
|
'BasePolicy': [BasePolicy], |
|
'IsAddressChanged': [IsAddressChanged], |
|
'PastNumberOfClaims': [PastNumberOfClaims] |
|
} |
|
user_df = pd.DataFrame(user_input) |
|
processed_user_input = pd.get_dummies(user_df) |
|
|
|
    # Align the one-hot encoded input with the training columns; categories unseen at training time become all-zero columns.
    processed_user_input = processed_user_input.reindex(columns=X_train.columns, fill_value=0)
|
|
|
|
|
prediction = best_rf_classifier.predict(processed_user_input) |
|
|
|
|
|
if prediction[0] == 1: |
|
result = "Fraud in Insurance" |
|
else: |
|
result = "No Fraud in Insurance" |
|
|
|
|
|
fig1, ax1 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='OwnerGender', hue='IsFraud', data=df, ax=ax1) |
|
buffer1 = io.BytesIO() |
|
plt.savefig(buffer1, format='png') |
|
buffer1.seek(0) |
|
buffer_data1 = base64.b64encode(buffer1.getvalue()).decode() |
|
plt.close(fig1) |
|
|
|
fig2, ax2 = plt.subplots(figsize=(6, 4)) |
|
sns.violinplot(x='CarCategory', y='CarPrice', data=df, ax=ax2) |
|
buffer2 = io.BytesIO() |
|
plt.savefig(buffer2, format='png') |
|
buffer2.seek(0) |
|
buffer_data2 = base64.b64encode(buffer2.getvalue()).decode() |
|
plt.close(fig2) |
|
|
|
fig3, ax3 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='AgentType', hue='IsFraud', data=df, ax=ax3) |
|
buffer3 = io.BytesIO() |
|
plt.savefig(buffer3, format='png') |
|
buffer3.seek(0) |
|
buffer_data3 = base64.b64encode(buffer3.getvalue()).decode() |
|
plt.close(fig3) |
|
|
|
fig4, ax4 = plt.subplots(figsize=(6 , 4)) |
|
policy_fraud_counts = df[df['IsFraud'] == 1]['BasePolicy'].value_counts() |
|
ax4.pie(policy_fraud_counts, labels=policy_fraud_counts.index, autopct='%1.1f%%') |
|
buffer4 = io.BytesIO() |
|
plt.savefig(buffer4, format='png') |
|
buffer4.seek(0) |
|
buffer_data4 = base64.b64encode(buffer4.getvalue()).decode() |
|
plt.close(fig4) |
|
|
|
fig5, ax5 = plt.subplots(figsize=(6, 4)) |
|
    # A single boxplot over the full dataframe shows both classes side by side;
    # plotting the fraud and non-fraud subsets separately would overlap their boxes.
    sns.boxplot(x='IsFraud', y='CarPrice', data=df, ax=ax5)
    ax5.set_xlabel('Fraud Status')
    ax5.set_ylabel('Car Price')
    ax5.set_title('Box Plot of Car Price for Fraud and Non-Fraud Cases')
|
buffer5 = io.BytesIO() |
|
plt.savefig(buffer5, format='png') |
|
buffer5.seek(0) |
|
buffer_data5 = base64.b64encode(buffer5.getvalue()).decode() |
|
plt.close(fig5) |
|
|
|
fig6, ax6 = plt.subplots(figsize=(6, 4)) |
|
    sns.histplot(data=df, x='PastNumberOfClaims', bins=range(int(df['PastNumberOfClaims'].max()) + 2), kde=False, ax=ax6)
    ax6.set_ylabel('Count')  # the histogram covers all claims, not only fraudulent ones
|
buffer6 = io.BytesIO() |
|
plt.savefig(buffer6, format='png') |
|
buffer6.seek(0) |
|
buffer_data6 = base64.b64encode(buffer6.getvalue()).decode() |
|
plt.close(fig6) |
|
|
|
fig7, ax7 = plt.subplots(figsize=(6, 4)) |
|
policy_fraud_counts = df[df['IsFraud'] == 1]['CarCategory'].value_counts() |
|
ax7.pie(policy_fraud_counts, labels=policy_fraud_counts.index, autopct='%1.1f%%') |
|
buffer7 = io.BytesIO() |
|
plt.savefig(buffer7, format='png') |
|
buffer7.seek(0) |
|
buffer_data7 = base64.b64encode(buffer7.getvalue()).decode() |
|
plt.close(fig7) |
|
|
|
fig8, ax8 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='PastNumberOfClaims', hue='IsFraud', data=df, ax=ax8) |
|
buffer8 = io.BytesIO() |
|
plt.savefig(buffer8, format='png') |
|
buffer8.seek(0) |
|
buffer_data8 = base64.b64encode(buffer8.getvalue()).decode() |
|
plt.close(fig8) |
|
|
|
|
|
return render_template('prediction_result.html', result=result, plot1=buffer_data1, plot2=buffer_data2, |
|
plot3=buffer_data3, plot4=buffer_data4, plot5=buffer_data5, plot6=buffer_data6, |
|
plot7=buffer_data7, plot8=buffer_data8) |
|
|
|
@app.route("/predict/app") |
|
def predict_app(): |
|
return render_template('fraudapp.html') |
|
|
|
|
|
@app.route("/mobile_app") |
|
def mobile_app(): |
|
return render_template('fraudapp.html') |
|
|
|
|
|
@app.route('/analysis/app') |
|
def analysis_app(): |
|
return render_template('app_analysis.html') |
|
|
|
@app.route('/analysis/app', methods=['POST']) |
|
def analysisresult_app(): |
|
app_id = request.form['app-id'] |
|
app_name = request.form['app-name'] |
|
|
|
|
|
    reviews = reviews_all(app_id, sleep_milliseconds=0, lang="en", country="in", sort=Sort.NEWEST)
|
df = pd.json_normalize(reviews) |
|
|
|
|
|
df = preprocess_dataframe(df) |
|
|
|
|
|
df['sentiment'] = df.apply(lambda row: analyze_sentiment(row['content'], row['score']), axis=1) |
|
|
|
|
|
text = ' '.join(df['content'].astype(str).tolist()) |
|
wordcloud = WordCloud(width=600, height=400, background_color='white').generate(text) |
|
img_buffer1 = save_wordcloud_to_buffer(wordcloud) |
|
|
|
    # NLTK's English stopword list already contains the pronouns, auxiliaries and
    # contraction fragments that were previously added by hand, so one set is enough.
    stop_words = set(stopwords.words('english'))
|
|
|
|
|
proper_nouns = [] |
|
for review in df['content']: |
|
words = review.split() |
|
for word in words: |
|
if word.istitle() and word.isalpha() and word.lower() not in stop_words: |
|
proper_nouns.append(word) |
|
top_proper_nouns = Counter(proper_nouns).most_common(10) |
|
fig2, ax2 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(y=proper_nouns, order=[word[0] for word in top_proper_nouns], palette='viridis', ax=ax2) |
|
ax2.set_title('Count Plot of 10 Most Repeated Proper Nouns') |
|
ax2.set_xlabel('Count') |
|
buffer2 = save_plot_to_buffer(fig2) |
|
|
|
fig3, ax3 = plt.subplots(figsize=(6, 4)) |
|
is_replied_no_df = df[df['IsReplied'] == 'No'] |
|
sentiment_counts = is_replied_no_df['sentiment'].value_counts() |
|
    sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'blue'}
    ax3.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=90,
            colors=[sentiment_colors[s] for s in sentiment_counts.index])
|
ax3.set_title('Pie Chart of Sentiment Distribution for IsReplied NO') |
|
buffer3 = save_plot_to_buffer(fig3) |
|
|
|
|
|
    df['review_length'] = df['content'].astype(str).apply(lambda x: len(x.split()))  # astype(str) guards against missing text
|
|
|
|
|
sentiment_distribution = df.pivot_table(index='review_length', columns='sentiment', values='score', aggfunc='mean') |
|
|
|
|
|
fig4, ax4 = plt.subplots(figsize=(6, 4)) |
|
sns.heatmap(sentiment_distribution, cmap='YlGnBu', linewidths=0.5, ax=ax4) |
|
    ax4.set_title('Mean Score by Review Length and Sentiment')
|
ax4.set_xlabel('Sentiment') |
|
ax4.set_ylabel('Review Length') |
|
|
|
|
|
buffer4 = save_plot_to_buffer(fig4) |
|
|
|
|
|
    word_lengths = df['content'].astype(str).apply(lambda x: len(x.split()))
|
word_freq = pd.DataFrame({'Word Length': word_lengths, 'Rating': df['score']}) |
|
|
|
fig5, ax5 = plt.subplots(figsize=(6, 4)) |
|
sns.heatmap(word_freq.corr(), annot=True, cmap='coolwarm', ax=ax5) |
|
ax5.set_title('Heatmap of Word Length vs Rating') |
|
buffer5 = save_plot_to_buffer(fig5) |
|
|
|
|
|
fig6, ax6 = plt.subplots(figsize=(6, 4)) |
|
sns.histplot(data=df, x='score', hue='sentiment', multiple='stack', palette='husl', ax=ax6) |
|
ax6.set_title('Joint Count Plot of Score for Positive, Negative, and Neutral') |
|
ax6.set_xlabel('Score') |
|
ax6.set_ylabel('Count') |
|
buffer6 = save_plot_to_buffer(fig6) |
|
|
|
return render_template('app_analysis_final.html', df=df, app_name=app_name, |
|
buffer1=img_buffer1, buffer2=buffer2, buffer3=buffer3, |
|
buffer4=buffer4, buffer5=buffer5, buffer6=buffer6) |
|
|
|
|
|
|
|
def save_plot_to_buffer(fig): |
|
buffer = io.BytesIO() |
|
fig.savefig(buffer, format='png') |
|
buffer.seek(0) |
|
buffer_data = base64.b64encode(buffer.getvalue()).decode() |
|
plt.close(fig) |
|
return buffer_data |
|
|
|
|
|
def save_wordcloud_to_buffer(wordcloud): |
|
img = wordcloud.to_image() |
|
img_buffer = io.BytesIO() |
|
img.save(img_buffer, format='PNG') |
|
img_buffer.seek(0) |
|
buffer = base64.b64encode(img_buffer.getvalue()).decode() |
|
img_buffer.close() |
|
return buffer |
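
# The HTML templates are not shown here; presumably they embed the returned base64 strings
# inline, e.g. (assumed markup):
#   <img src="data:image/png;base64,{{ plot1 }}">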
|
|
|
|
|
@app.route('/analysis/insurance') |
|
def analysis_insurance(): |
|
|
|
|
|
fig1, ax1 = plt.subplots(figsize=(6, 4)) |
|
sns.histplot(df['CarPrice'], kde=True, color='skyblue', ax=ax1) |
|
ax1.set_title('Distribution of Car Prices') |
|
ax1.set_xlabel('Car Price') |
|
ax1.set_ylabel('Frequency') |
|
buffer1 = save_plot_to_buffer(fig1) |
|
|
|
|
|
fig2, ax2 = plt.subplots(figsize=(6, 4)) |
|
sns.histplot(df['OwnerAge'], kde=True, color='salmon', ax=ax2) |
|
ax2.set_title('Distribution of Owner Ages') |
|
ax2.set_xlabel('Owner Age') |
|
ax2.set_ylabel('Frequency') |
|
buffer2 = save_plot_to_buffer(fig2) |
|
|
|
|
|
fig3, ax3 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='CarCategory', hue='IsFraud', data=df, palette='coolwarm', ax=ax3) |
|
ax3.set_title('Count of Claims by Car category') |
|
ax3.set_xlabel('Car category') |
|
ax3.set_ylabel('Count') |
|
buffer3 = save_plot_to_buffer(fig3) |
|
|
|
|
|
fig4, ax4 = plt.subplots(figsize=(6, 4)) |
|
sns.boxplot(x='IsFraud', y='CarPrice', data=df, palette='Set2', ax=ax4) |
|
ax4.set_title('Distribution of Car Prices by Fraud Status') |
|
ax4.set_xlabel('Fraud Status') |
|
ax4.set_ylabel('Car Price') |
|
buffer4 = save_plot_to_buffer(fig4) |
|
|
|
|
|
fig5, ax5 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='AccidentArea', hue='IsFraud', data=df, palette='husl', ax=ax5) |
|
ax5.set_title('Count of Claims by Accident Area') |
|
ax5.set_xlabel('Accident Area') |
|
ax5.set_ylabel('Count') |
|
buffer5 = save_plot_to_buffer(fig5) |
|
|
|
|
|
fig6, ax6 = plt.subplots(figsize=(6, 4)) |
|
sns.histplot(df['NumberOfSuppliments'], kde=True, color='orange', ax=ax6) |
|
ax6.set_title('Distribution of Number of Supplements') |
|
ax6.set_xlabel('Number of Supplements') |
|
ax6.set_ylabel('Frequency') |
|
buffer6 = save_plot_to_buffer(fig6) |
|
|
|
|
|
fig7, ax7 = plt.subplots(figsize=(6, 4)) |
|
sns.countplot(x='WitnessPresent', hue='IsFraud', data=df, palette='viridis', ax=ax7) |
|
ax7.set_title('Count of Claims by Witness Presence') |
|
ax7.set_xlabel('Witness Presence') |
|
ax7.set_ylabel('Count') |
|
buffer7 = save_plot_to_buffer(fig7) |
|
|
|
|
|
fig8, ax8 = plt.subplots(figsize=(6, 4)) |
|
sns.histplot(df['PastNumberOfClaims'], kde=True, color='purple', ax=ax8) |
|
ax8.set_title('Distribution of Past Number of Claims') |
|
ax8.set_xlabel('Past Number of Claims') |
|
ax8.set_ylabel('Frequency') |
|
buffer8 = save_plot_to_buffer(fig8) |
|
|
|
numeric_columns = df.select_dtypes(include='number') |
|
|
|
|
|
corr = numeric_columns.corr() |
|
|
|
|
|
fig9, ax9 = plt.subplots(figsize=(6.5, 4.5)) |
|
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f", ax=ax9) |
|
ax9.set_title('Heatmap of Correlation Matrix') |
|
buffer9 = save_plot_to_buffer(fig9) |
|
|
|
|
|
fig10, ax10 = plt.subplots(figsize=(6, 4)) |
|
G = nx.from_pandas_edgelist(df, 'CarCompany', 'IsFraud') |
|
nx.draw(G, with_labels=True, node_color='skyblue', node_size=2000, font_size=10, ax=ax10) |
|
ax10.set_title('Network Graph of Car Brands and Fraud Status') |
|
buffer10 = save_plot_to_buffer(fig10) |
|
|
|
|
|
fig11, ax11 = plt.subplots(figsize=(6, 4)) |
|
sns.violinplot(x='AccidentArea', y='CarPrice', data=df, hue='IsFraud', split=True, palette='husl', ax=ax11) |
|
ax11.set_title('Violin Plot of Accident Area and Car Price') |
|
buffer11 = save_plot_to_buffer(fig11) |
|
|
|
fig12, ax12 = plt.subplots(figsize=(6, 4)) |
|
hb = ax12.hexbin(df['CarPrice'], df['OwnerAge'], gridsize=50, cmap='inferno') |
|
ax12.set_title('Hexbin Plot of Car Prices and Owner Ages') |
|
ax12.set_xlabel('Car Price') |
|
ax12.set_ylabel('Owner Age') |
|
cb = fig12.colorbar(hb, ax=ax12) |
|
cb.set_label('Frequency') |
|
buffer12 = save_plot_to_buffer(fig12) |
|
|
|
|
|
return render_template('insurance_analysis.html', plot1=buffer1, plot2=buffer2, |
|
plot3=buffer3, plot4=buffer4, plot5=buffer5, plot6=buffer6, |
|
plot7=buffer7, plot8=buffer8, plot9=buffer9, plot10=buffer10, |
|
plot11=buffer11, plot12=buffer12) |
|
|
|
|
|
if __name__ == "__main__": |
|
    app.run(host='0.0.0.0', port=7860, debug=True)  # debug=True is intended for development only
|
|
|
|
|
|