# pricing / dynamic_pricing.py
# shollercoaster's picture
# Update dynamic_pricing.py
# 3e870fa
# -*- coding: utf-8 -*-
"""dynamic pricing.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1pMuvzwELNm1DsTdL5dfBdA2HCjB6uwgh
"""
# Commented out IPython magic to ensure Python compatibility.
import datetime
import joblib
import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
# Load the raw data; the last column is used as the regression target below.
dataset = pd.read_csv("Pop_Data.csv")
# A bare `dataset.head(5)` only displays in a notebook; print it so the
# preview is still visible when this file runs as a script.
print(dataset.head(5))

# Replace the day_of_week labels with integer codes. The fitted encoder stays
# at module level because predict() reuses it on incoming data.
label_encoder = LabelEncoder()
dataset['day_of_week'] = label_encoder.fit_transform(dataset['day_of_week'])

# 70/30 train/test split with a fixed seed; features are every column except
# the last, the target is the last column.
X_train, X_test, y_train, y_test = train_test_split(
    dataset.iloc[:, :-1],
    dataset.iloc[:, -1],
    test_size=0.3,
    random_state=42,
)
X_train.info()

# EDA

# Drop the first three columns of both splits; only the remaining columns are
# used as model features.
X_train = X_train.iloc[:, 3:]
X_test = X_test.iloc[:, 3:]
# The original `X_train.info` lacked the call parentheses and did nothing.
X_train.info()
# plt.figure(figsize = (12, 8))
# plot = sns.countplot(x = 'day_of_week', data = X_train)
# plt.xticks(rotation = 90)
# for p in plot.patches:
# plot.annotate(p.get_height(),
# (p.get_x() + p.get_width() / 2.0,
# p.get_height()),
# ha = 'center',
# va = 'center',
# xytext = (0, 5),
# textcoords = 'offset points')
# plt.title("Price changes based on day")
# plt.xlabel("Day")
# plt.ylabel("Price")
# Report the number of missing values per feature column, train split first
# and test split second for each column.
for column in ("day_of_week", "hour_of_day", "popularity_percent_normal"):
    print(X_train[column].isnull().sum())
    print(X_test[column].isnull().sum())

# Impute missing popularity values with the mean of the training split.
# Rebinding the frames (instead of calling fillna(inplace=True) on a column
# selection) guarantees the fill is actually applied, and the same training
# mean is applied to the test split so no NaN reaches the estimators and no
# test-set statistic leaks into preprocessing.
popularity_mean = X_train["popularity_percent_normal"].astype("float64").mean()
X_train = X_train.fillna({"popularity_percent_normal": popularity_mean})
X_test = X_test.fillna({"popularity_percent_normal": popularity_mean})
# X_train = pd.get_dummies(X_train,
# columns = ["day_of_week"],
# drop_first = True)
# X_test = pd.get_dummies(X_test,
# columns = ["day_of_week"],
# drop_first = True)
# missing_cols = set(X_train.columns) - set(X_test.columns)
# for col in missing_cols:
# X_test[col] = 0
# X_test = X_test[X_train.columns]
# Standardise the features using statistics learned from the training split
# only. The fitted scaler stays at module level because predict() reuses it.
standardScaler = StandardScaler()
standardScaler.fit(X_train)
X_train = standardScaler.transform(X_train)
X_test = standardScaler.transform(X_test)

# Linear regression fitted on the scaled features and scored on the test
# split. The score is printed because a bare r2_score(...) expression is
# silently discarded outside a notebook.
linearRegression = LinearRegression()
linearRegression.fit(X_train, y_train)
y_pred = linearRegression.predict(X_test)
print("LinearRegression R^2:", r2_score(y_test, y_pred))

# Random forest with 100 trees; this is the model that is saved and used by
# predict().
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print("RandomForestRegressor R^2:", r2_score(y_test, y_pred))
def save_model(model, filename):
    """Write ``model`` to ``filename`` using ``joblib.dump``.

    Args:
        model: The object to serialize.
        filename: Destination passed straight through to ``joblib.dump``.

    Returns:
        None.
    """
    joblib.dump(model, filename)
# Persist the fitted random forest regressor to disk (serialized with joblib
# by save_model).
save_model(rf, "random_forest_model.pkl")
def predict(data):
    """Predict the target for a single observation with the trained forest.

    The module-level ``label_encoder`` and ``standardScaler`` are applied with
    ``transform`` only. Refitting them on the single incoming row (as
    ``fit_transform`` would) maps every day to code 0 and every scaled
    feature to 0, which makes the model return the same value for any input.

    Args:
        data: Iterable whose first three items are the day-of-week label, the
            second feature and the third feature, in that order. Presumably
            these are day_of_week, hour_of_day and popularity_percent_normal,
            matching the column order the scaler and model were fitted on --
            TODO confirm against the training columns.

    Returns:
        The array returned by ``rf.predict`` for the one-row input.

    Raises:
        IndexError: If ``data`` yields fewer than three items.
        ValueError: If the day label was not seen when ``label_encoder`` was
            fitted, or a feature value cannot be converted to a float.
    """
    values = list(data)
    # Encode with the mapping learned at training time.
    day_code = label_encoder.transform([values[0]])[0]
    # One row, numeric dtype, shaped (1, n_features) as the scaler expects.
    features = np.asarray(
        [day_code, values[1], values[2]], dtype=float
    ).reshape(1, -1)
    # Scale with the training statistics rather than refitting on this row.
    scaled = standardScaler.transform(features)
    return rf.predict(scaled)