File size: 3,491 Bytes
4cd6925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e870fa
 
4cd6925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e870fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4cd6925
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# -*- coding: utf-8 -*-
"""dynamic pricing.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1pMuvzwELNm1DsTdL5dfBdA2HCjB6uwgh
"""

# Commented out IPython magic to ensure Python compatibility.
import datetime
import joblib
import numpy as np
import pandas as pd

# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

# Load the raw data. The last column is used as the regression target and
# every column before it as a candidate feature.
dataset = pd.read_csv("Pop_Data.csv")
# Outside a notebook a bare `dataset.head(5)` shows nothing, so print it.
print(dataset.head(5))

# Encode the categorical weekday as integer codes. The fitted encoder is kept
# at module level so the same mapping can be reused at prediction time.
label_encoder = LabelEncoder()
dataset['day_of_week'] = label_encoder.fit_transform(dataset['day_of_week'])

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.iloc[:, :-1],
    dataset.iloc[:, -1],
    test_size=0.3,
    random_state=42,
)

X_train.info()

# EDA

# Drop the first three columns and keep the rest as model features.
# `.copy()` makes the result an independent frame, so later column
# assignments do not write into a view of the split data.
X_train = X_train.iloc[:, 3:].copy()
X_test = X_test.iloc[:, 3:].copy()

# The original referenced `X_train.info` without calling it, which is a no-op.
X_train.info()

# plt.figure(figsize = (12, 8))
# plot = sns.countplot(x = 'day_of_week', data = X_train)
# plt.xticks(rotation = 90)
# for p in plot.patches:
#     plot.annotate(p.get_height(),
#                         (p.get_x() + p.get_width() / 2.0,
#                          p.get_height()),
#                         ha = 'center',
#                         va = 'center',
#                         xytext = (0, 5),
#                         textcoords = 'offset points')

# plt.title("Price changes based on day")
# plt.xlabel("Day")
# plt.ylabel("Price")

# Report the number of missing values per feature, train split first and
# then test split, in the same order as before.
for column in ("day_of_week", "hour_of_day", "popularity_percent_normal"):
    print(int(X_train[column].isnull().sum()))
    print(int(X_test[column].isnull().sum()))

# Impute missing popularity values with the mean of the TRAINING split.
# The same training mean is applied to the test split so that no test-set
# statistics leak into preprocessing and no NaN reaches the models.
popularity_mean = X_train["popularity_percent_normal"].astype("float64").mean()

# `assign` returns a new frame with the column replaced in its original
# position. The previous `X_train[col].fillna(..., inplace=True)` mutated a
# temporary column selection, which is deprecated and leaves the frame
# unchanged when pandas Copy-on-Write is active.
X_train = X_train.assign(
    popularity_percent_normal=X_train["popularity_percent_normal"].fillna(popularity_mean)
)
X_test = X_test.assign(
    popularity_percent_normal=X_test["popularity_percent_normal"].fillna(popularity_mean)
)

# X_train = pd.get_dummies(X_train,
#                          columns = ["day_of_week"],
#                          drop_first = True)

# X_test = pd.get_dummies(X_test,
#                          columns = ["day_of_week"],
#                          drop_first = True)

# missing_cols = set(X_train.columns) - set(X_test.columns)
# for col in missing_cols:
#     X_test[col] = 0
# X_test = X_test[X_train.columns]

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split. It stays at module level
# because prediction-time inputs must go through the same transformation.
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train = standardScaler.transform(X_train)
X_test = standardScaler.transform(X_test)

# Baseline: ordinary least squares.
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
y_pred = linearRegression.predict(X_test)
# The score was previously computed and discarded; print it so a script run
# actually reports it.
print("LinearRegression R^2:", r2_score(y_test, y_pred))

# Main model: random forest. The seed is fixed, matching the seeded
# train/test split, so repeated runs produce the same model and score.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("RandomForestRegressor R^2:", r2_score(y_test, y_pred))

def save_model(model, filename):
    """Serialize *model* to *filename* using ``joblib.dump``.

    Args:
        model: The object to persist (the caller in this file passes a
            fitted estimator).
        filename: Destination handed straight to ``joblib.dump``.
    """
    joblib.dump(value=model, filename=filename)

# Persist the fitted random forest (`rf`) to "random_forest_model.pkl".
# Only the forest is written by this call; `label_encoder` and
# `standardScaler` are not saved here.
save_model(rf, "random_forest_model.pkl")

def predict(data):
    """Predict the target for a single observation with the trained forest.

    The input is pushed through the preprocessing objects that were fitted
    at training time (``label_encoder`` and ``standardScaler``) and then
    passed to ``rf``.

    Args:
        data: Iterable whose first three items are used, in this order:
            the raw day-of-week label, the hour of day, and the popularity
            value. NOTE(review): this assumes the scaler and model were
            fitted on exactly these three features in this order; confirm
            against the training columns.

    Returns:
        The array returned by ``rf.predict`` for the one-row input.

    Raises:
        IndexError: If ``data`` yields fewer than three items.
        ValueError: If the day label was not seen when ``label_encoder`` was
            fitted, or if a value cannot be converted to ``float``.
    """
    values = list(data)

    # Use `transform`, not `fit_transform`: refitting the encoder on a single
    # label maps every day to 0 and overwrites the mapping learned in training.
    day_of_week_encoded = label_encoder.transform([values[0]])[0]

    features = np.asarray(
        [day_of_week_encoded, values[1], values[2]], dtype=float
    ).reshape(1, -1)

    # Same reasoning for the scaler: fitting on one row sets the mean to the
    # row itself, so every feature becomes 0 and the prediction is constant.
    # Reuse the training-time statistics instead.
    scaled = standardScaler.transform(features)
    return rf.predict(scaled)