|
import os
|
|
import pandas as pd
|
|
import numpy as np
|
|
import torch
|
|
import constants as cst
|
|
from torch.utils import data
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
class LOBSTERDataBuilder:
    """Load per-day message/orderbook CSV files, split them by day and save statistics.

    For every stock the builder reads the files found in
    ``{data_dir}/{stock}/{stock}_{date_trading_days[0]}_{date_trading_days[1]}``,
    groups the trading days into train/validation/test splits, rescales the
    price columns, plots the mid-price of the training split and writes a
    one-row CSV with summary statistics.

    All of this work is triggered from ``__init__``.
    """

    def __init__(
        self,
        stocks,
        data_dir,
        date_trading_days,
        split_rates,
    ):
        """Store the configuration and immediately process every stock.

        Args:
            stocks: Iterable of stock identifiers; each one names a sub-folder
                of ``data_dir``.
            data_dir: Root folder holding one sub-folder per stock.
            date_trading_days: Sequence whose first two items are interpolated
                into the name of the folder that contains the daily files.
            split_rates: Three fractions (train, validation, test) of the
                number of trading days.
        """
        self.n_lob_levels = cst.N_LOB_LEVELS
        self.data_dir = data_dir
        self.date_trading_days = date_trading_days
        self.stocks = stocks
        self.split_rates = split_rates
        self.prepare_save_datasets()

    def prepare_save_datasets(self):
        """Build the splits, plot the mid-price and save statistics for each stock."""
        for stock in self.stocks:
            path = "{}/{}/{}_{}_{}".format(
                self.data_dir,
                stock,
                stock,
                self.date_trading_days[0],
                self.date_trading_days[1],
            )
            # Reset per stock: _create_dataframes_splitted appends to this list.
            self.dataframes = []
            self._prepare_dataframes(path, stock)

            path_where_to_save = "{}/{}".format(self.data_dir, stock)

            # Only the first (training) split feeds the plot and the
            # spread/liquidity statistics.
            train_messages, train_orderbooks = self.dataframes[0]
            self._plot_mid_price(train_orderbooks, stock)
            self._compute_and_save_statistics(
                train_orderbooks, train_messages, path_where_to_save, stock
            )

    def _plot_mid_price(self, orderbook_df, stock):
        """Plot the mid-price series and save it as a PDF in the working directory.

        Args:
            orderbook_df: DataFrame with at least the ``buy1`` and ``sell1``
                columns.
            stock: Stock identifier used in the legend, title and file name.
        """
        best_bid = orderbook_df["buy1"]
        best_ask = orderbook_df["sell1"]
        mid_price = (best_bid + best_ask) / 2

        # NOTE(review): the time axis is hard-coded to January 2015 and is not
        # derived from self.date_trading_days -- confirm this is intended.
        date_range = pd.date_range(
            start="01/02/2015", end="01/30/2015", periods=len(mid_price)
        )

        plt.figure(figsize=(10, 6))
        plt.plot(date_range, mid_price, label=f'{stock} Mid-Price')
        plt.xlabel('Time')
        plt.ylabel('Mid-Price')
        plt.title(f'{stock} Mid-Price')
        plt.legend()

        # Show only the first and last dates on the x axis.
        plt.xticks(rotation=45)
        plt.gca().set_xticks([date_range[0], date_range[-1]])
        plt.gca().set_xticklabels(['01/02/2015', '01/30/2015'])

        plot_filename = os.path.join(os.getcwd(), f'{stock}_mid_price_plot.pdf')
        plt.savefig(plot_filename)
        plt.close()

    def _compute_and_save_statistics(self, orderbook_df, message_df, save_path, stock):
        """Compute summary statistics and write them to ``{stock}_statistics.csv``.

        Spread and liquidity are computed from ``orderbook_df``; daily return
        and volume statistics come from ``self.open_mid_prices`` and
        ``self.daily_volumes``, which ``_create_dataframes_splitted`` fills
        with one entry per trading day. As a side effect
        ``self.open_mid_prices`` is converted to a NumPy array and
        ``self.daily_returns`` is set.

        Args:
            orderbook_df: DataFrame with ``buy1``/``sell1`` columns and the
                volume columns at the odd column positions.
            message_df: Unused; kept for interface compatibility.
            save_path: Existing directory where the CSV is written.
            stock: Stock identifier used as file-name prefix.
        """
        spread = orderbook_df["sell1"] - orderbook_df["buy1"]

        # Per-row liquidity: sum of every odd-positioned (volume) column.
        # It is kept as a Series so that both its mean and its standard
        # deviation are meaningful; reducing it to a scalar first would make
        # the reported standard deviation always zero.
        liquidity = orderbook_df.iloc[:, 1::2].sum(axis=1)

        self.open_mid_prices = np.array(self.open_mid_prices)
        previous_open = self.open_mid_prices[:-1]
        self.daily_returns = (self.open_mid_prices[1:] - previous_open) / previous_open

        stats = {
            'daily_return_std': np.std(self.daily_returns),
            'daily_volume_std': np.std(self.daily_volumes),
            'daily_return_mean': np.mean(self.daily_returns),
            'daily_volume_mean': np.mean(self.daily_volumes),
            'average_spread': spread.mean(),
            # The misspelled key is kept on purpose: it is the column name of
            # the CSV written below and renaming it would change the output.
            'avgerage_spread_std': spread.std(),
            'average_liquidity': liquidity.mean(),
            'average_liquidity_std': liquidity.std(),
        }
        stats_df = pd.DataFrame([stats])
        stats_filename = os.path.join(save_path, f'{stock}_statistics.csv')
        stats_df.to_csv(stats_filename, index=False)

    def _prepare_dataframes(self, path, stock):
        """Read every daily file under ``path`` into ``self.dataframes`` and rescale prices.

        After this call ``self.dataframes`` holds three ``[messages, orderbooks]``
        pairs (train, validation, test) whose price columns have been divided
        by 10000.

        Args:
            path: Folder containing two files per trading day.
            stock: Unused; kept for interface compatibility.
        """
        COLUMNS_NAMES = {
            # sell1, vsell1, buy1, vbuy1, sell2, ... up to level 10.
            "orderbook": [
                f"{field}{level}"
                for level in range(1, 11)
                for field in ("sell", "vsell", "buy", "vbuy")
            ],
            "message": ["time", "event_type", "order_id", "size", "price", "direction"],
        }
        self.num_trading_days = len(os.listdir(path)) // 2
        # Two files per day, so day boundaries become file-index boundaries.
        split_days = [2 * day for day in self._split_days()]
        self._create_dataframes_splitted(path, split_days, COLUMNS_NAMES)

        for messages, orderbooks in self.dataframes:
            messages["price"] = messages["price"] / 10000
            # Price columns sit at the even positions. Whole columns are
            # reassigned (instead of an in-place ``.loc`` division) so that
            # integer columns can become float without an in-place upcast.
            price_columns = orderbooks.columns[::2]
            orderbooks[price_columns] = orderbooks[price_columns] / 10000

    def _create_dataframes_splitted(self, path, split_days, COLUMNS_NAMES):
        """Read the daily files in sorted order and group them into three splits.

        Files at even positions of the sorted listing are read as message
        files and files at odd positions as the orderbook file of the same
        day. For every message file the sum of its ``size`` column is appended
        to ``self.daily_volumes``; for every orderbook file the mid-price of
        its first row, ``(sell1 + buy1) / 20000``, is appended to
        ``self.open_mid_prices``. Finally one ``[messages, orderbooks]`` pair
        per split is appended to ``self.dataframes`` in train, validation,
        test order. Files positioned at or after ``split_days[1]`` all go to
        the test split.

        Args:
            path: Folder containing the daily files.
            split_days: File-index boundaries ``[train_end, val_end, test_end]``;
                expected to be even so that a day never straddles two splits.
            COLUMNS_NAMES: Mapping with the ``"message"`` and ``"orderbook"``
                column-name lists passed to ``pd.read_csv``.

        Raises:
            ValueError: If a directory entry is not a regular file, if an
                orderbook file and its message file differ in length, or if a
                split ends up without any complete trading day.
        """
        self.open_mid_prices = []
        self.daily_volumes = []

        split_names = ("train", "val", "test")
        messages_by_split = {name: [] for name in split_names}
        orderbooks_by_split = {name: [] for name in split_names}
        pending_message = None

        for i, filename in enumerate(sorted(os.listdir(path))):
            f = os.path.join(path, filename)
            print(f)
            if not os.path.isfile(f):
                raise ValueError("File {} is not a file".format(f))

            if i < split_days[0]:
                split = "train"
            elif i < split_days[1]:
                split = "val"
            else:
                split = "test"

            if i % 2 == 0:
                # Message file: hold it until the matching orderbook arrives.
                pending_message = pd.read_csv(f, names=COLUMNS_NAMES["message"])
                self.daily_volumes.append(pending_message["size"].sum())
                continue

            orderbook = pd.read_csv(f, names=COLUMNS_NAMES["orderbook"])
            # Parentheses matter: average the two quotes first, then apply the
            # 1/10000 price scaling (hence the division by 20000).
            self.open_mid_prices.append(
                (orderbook["sell1"].iloc[0] + orderbook["buy1"].iloc[0]) / 20000
            )
            if len(orderbook) != len(pending_message):
                raise ValueError(
                    "{0}_orderbook length is different than {0}_messages".format(split)
                )
            messages_by_split[split].append(pending_message)
            orderbooks_by_split[split].append(orderbook)

        for split in split_names:
            if not orderbooks_by_split[split]:
                raise ValueError(
                    "No complete trading day found for the {} split".format(split)
                )
            # Concatenate once per split instead of once per day.
            self.dataframes.append([
                pd.concat(messages_by_split[split], axis=0),
                pd.concat(orderbooks_by_split[split], axis=0),
            ])

    def _split_days(self):
        """Return the cumulative day boundaries ``[train_end, val_end, test_end]``.

        Each split size is ``int(num_trading_days * rate)``, so fractional
        days are truncated and the last boundary may be smaller than
        ``self.num_trading_days``.
        """
        train = int(self.num_trading_days * self.split_rates[0])
        val = int(self.num_trading_days * self.split_rates[1]) + train
        test = int(self.num_trading_days * self.split_rates[2]) + val
        print(f"There are {train} days for training, {val - train} days for validation and {test - val} days for testing")
        return [train, val, test]
|
|
|
|
|
|
# Run the whole pipeline for TSLA at import time; the remaining settings
# (data folder, trading-day range, split fractions) come from `constants`.
data_builder = LOBSTERDataBuilder(
    ["TSLA"],
    cst.DATA_DIR,
    cst.DATE_TRADING_DAYS,
    cst.SPLIT_RATES,
)