import os

import numpy as np
import pandas as pd
import torch

import constants as cst

def z_score_orderbook(data, mean_size=None, mean_prices=None, std_size=None, std_prices=None):
    """Z-score normalize order book prices and sizes in place.

    Statistics are computed from `data` when not supplied; pass the
    training set's mean/std to normalize the validation/test sets.
    """
    # LOBSTER layout: prices sit on even columns, sizes on odd columns
    # (ask_price, ask_size, bid_price, bid_size, ... per level).
    if (mean_size is None) or (std_size is None):
        mean_size = data.iloc[:, 1::2].stack().mean()
        std_size = data.iloc[:, 1::2].stack().std()

    if (mean_prices is None) or (std_prices is None):
        mean_prices = data.iloc[:, 0::2].stack().mean()
        std_prices = data.iloc[:, 0::2].stack().std()

    price_cols = data.columns[0::2]
    size_cols = data.columns[1::2]

    for col in size_cols:
        data[col] = (data[col].astype("float64") - mean_size) / std_size

    for col in price_cols:
        data[col] = (data[col].astype("float64") - mean_prices) / std_prices

    if data.isnull().values.any():
        raise ValueError("data contains null value")

    return data, mean_size, mean_prices, std_size, std_prices
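
# Usage sketch (hypothetical `train_book` / `test_book` DataFrames): fit the
# statistics on the training split, then reuse them on the test split.
#   train_book, m_s, m_p, s_s, s_p = z_score_orderbook(train_book)
#   test_book, *_ = z_score_orderbook(test_book, m_s, m_p, s_s, s_p)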
def normalize_messages(data, mean_size=None, mean_prices=None, std_size=None, std_prices=None, mean_time=None, std_time=None, mean_depth=None, std_depth=None):
    """Z-score normalize message features in place; pass the training set's
    statistics to normalize the validation/test sets consistently."""
    if (mean_size is None) or (std_size is None):
        mean_size = data["size"].mean()
        std_size = data["size"].std()

    if (mean_prices is None) or (std_prices is None):
        mean_prices = data["price"].mean()
        std_prices = data["price"].std()

    if (mean_time is None) or (std_time is None):
        mean_time = data["time"].mean()
        std_time = data["time"].std()

    if (mean_depth is None) or (std_depth is None):
        mean_depth = data["depth"].mean()
        std_depth = data["depth"].std()

    data["time"] = (data["time"] - mean_time) / std_time
    data["size"] = (data["size"] - mean_size) / std_size
    data["price"] = (data["price"] - mean_prices) / std_prices
    data["depth"] = (data["depth"] - mean_depth) / std_depth

    if data.isnull().values.any():
        raise ValueError("data contains null value")

    # Remap the remaining LOBSTER event types {1, 3, 4} to contiguous class
    # labels {0, 1, 2} (types 2, 5, 6 and 7 are dropped in preprocess_data).
    data["event_type"] = data["event_type"] - 1.0
    data["event_type"] = data["event_type"].replace(2, 1)
    data["event_type"] = data["event_type"].replace(3, 2)

    return data, mean_size, mean_prices, std_size, std_prices, mean_time, std_time, mean_depth, std_depth
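
# After the remap above the classes are 0 = submission, 1 = deletion,
# 2 = execution (LOBSTER event types 1, 3 and 4 respectively).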
def reset_indexes(dataframes):
    # Re-index the messages and order book frames together so they stay
    # aligned row for row.
    dataframes[0] = dataframes[0].reset_index(drop=True)
    dataframes[1] = dataframes[1].reset_index(drop=True)
    return dataframes
def sampling_quantity(dataframes, quantity=1000):
    messages_df, orderbook_df = dataframes[0], dataframes[1]

    # Keep a row every time the cumulative size crosses a multiple of
    # `quantity`: the remainder drops below the row's own size exactly when
    # that row straddles a boundary.
    cumsum = messages_df['size'].cumsum()
    sample_mask = (cumsum % quantity < messages_df['size'])

    sampled_indices = messages_df.index[sample_mask].tolist()

    messages_df = messages_df.loc[sampled_indices].reset_index(drop=True)
    orderbook_df = orderbook_df.loc[sampled_indices].reset_index(drop=True)

    return [messages_df, orderbook_df]
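
# Worked example (hypothetical sizes): with quantity=1000 and sizes
# [600, 600, 600], cumsum is [600, 1200, 1800] and cumsum % 1000 is
# [600, 200, 800]; only the second row satisfies 200 < 600, so the row that
# crossed the 1000 boundary is the one kept.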
def sampling_time(dataframes, time):
    # Convert the raw seconds-after-midnight timestamps to datetimes so
    # pandas resampling can be used.
    dataframes[0]['time'] = pd.to_datetime(dataframes[0]['time'], unit='s')

    # Keep the first message of every `time` bucket and drop empty buckets.
    resampled_messages = dataframes[0].set_index('time').resample(time).first().dropna().reset_index()

    # Resample the order book on the messages' timestamps so both frames
    # stay aligned row for row.
    resampled_orderbook = dataframes[1].set_index(dataframes[0]['time']).resample(time).first().dropna().reset_index(drop=True)

    dataframes[0] = resampled_messages

    # Convert the datetimes back to seconds after midnight.
    dataframes[0]['time'] = dataframes[0]['time'].dt.second + dataframes[0]['time'].dt.minute * 60 + dataframes[0]['time'].dt.hour * 3600 + dataframes[0]['time'].dt.microsecond / 1e6

    dataframes[1] = resampled_orderbook

    return dataframes
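
# Usage sketch: `time` is a pandas offset alias, e.g.
#   dataframes = sampling_time(dataframes, "1s")  # first event per second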
def preprocess_data(dataframes, n_lob_levels, sampling_type, time=None, quantity=None):
    dataframes = reset_indexes(dataframes)

    # Keep only the first n_lob_levels price levels of the order book.
    dataframes[1] = dataframes[1].iloc[:, :n_lob_levels * cst.LEN_LEVEL]

    # Drop LOBSTER event types 2 (partial cancellation), 5 (hidden
    # execution), 6 (cross trade) and 7 (trading halt), keeping only
    # submissions (1), deletions (3) and visible executions (4).
    indexes_to_drop = dataframes[0][dataframes[0]["event_type"].isin([2, 5, 6, 7])].index
    dataframes[0] = dataframes[0].drop(indexes_to_drop)
    dataframes[1] = dataframes[1].drop(indexes_to_drop)

    dataframes = reset_indexes(dataframes)

    # Subsample the event stream either on a fixed time grid or every time
    # a fixed quantity has been traded.
    if sampling_type == "time":
        dataframes = sampling_time(dataframes, time)
    elif sampling_type == "quantity":
        dataframes = sampling_quantity(dataframes, quantity)

    dataframes = reset_indexes(dataframes)

    dataframes[0] = dataframes[0].drop(columns=["order_id"])

    # Turn absolute timestamps into inter-arrival times; the first event is
    # measured from the market open (34200 s = 9:30 a.m.).
    first_time = dataframes[0]["time"].values[0]
    dataframes[0]["time"] = dataframes[0]["time"].diff()
    dataframes[0].iat[0, dataframes[0].columns.get_loc("time")] = first_time - 34200

    dataframes[0]["depth"] = 0

    # Compute each order's depth: its distance from the best quote on its
    # own side of the book, in ticks (prices are in units of 1e-4 USD, so a
    # one-cent tick is 100 units). Bid prices sit on columns 2, 6, 10, ...
    # of the order book; ask prices on columns 0, 4, 8, ...
    prices = dataframes[0]["price"].values
    directions = dataframes[0]["direction"].values
    event_types = dataframes[0]["event_type"].values
    bid_sides = dataframes[1].iloc[:, 2::4].values
    ask_sides = dataframes[1].iloc[:, 0::4].values

    depths = np.zeros(dataframes[0].shape[0], dtype=int)

    for j in range(1, len(prices)):
        order_price = prices[j]
        direction = directions[j]
        event_type = event_types[j]

        # A new order (type 1) is already reflected in snapshot j; for
        # deletions and executions use the snapshot before the event.
        index = j if event_type == 1 else j - 1

        if direction == 1:
            bid_price = bid_sides[index, 0]
            depth = (bid_price - order_price) // 100
        else:
            ask_price = ask_sides[index, 0]
            depth = (order_price - ask_price) // 100

        depths[j] = max(depth, 0)

    dataframes[0]["depth"] = depths

    # The first row has no previous snapshot to compare against, so drop it.
    dataframes[0] = dataframes[0].iloc[1:, :]
    dataframes[1] = dataframes[1].iloc[1:, :]
    dataframes = reset_indexes(dataframes)

    # Flip the sign of direction for executions (type 4), so the sign
    # reflects the side of the trade initiator rather than the resting order.
    dataframes[0]["direction"] = dataframes[0]["direction"] * dataframes[0]["event_type"].apply(
        lambda x: -1 if x == 4 else 1)

    return dataframes[1], dataframes[0]
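
# Note the return order: (orderbook_df, messages_df), the reverse of the
# [messages, orderbook] layout of the `dataframes` argument.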
def unnormalize(x, mean, std):
    # Invert a z-score normalization.
    return x * std + mean
def one_hot_encoding_type(data):
    # Replace the event-type column (index 1) with a 3-class one-hot block,
    # growing each row by two features.
    encoded_data = torch.zeros(data.shape[0], data.shape[1] + 2, dtype=torch.float32)
    encoded_data[:, 0] = data[:, 0]

    one_hot_order_type = torch.nn.functional.one_hot(data[:, 1].to(torch.int64), num_classes=3).to(torch.float32)
    encoded_data[:, 1:4] = one_hot_order_type
    encoded_data[:, 4:] = data[:, 2:]
    return encoded_data
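
# Example (assuming event_type sits in column 1 of the feature tensor): an
# (n, 5) input becomes (n, 7), with columns 1..3 holding the one-hot class.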
def tanh_encoding_type(data):
    # Swap labels 1 and 2, then shift everything down by one, mapping the
    # event-type classes {0, 1, 2} to the signed encoding {-1, 1, 0}.
    data[:, 1] = torch.where(data[:, 1] == 1.0, 2.0, torch.where(data[:, 1] == 2.0, 1.0, data[:, 1]))
    data[:, 1] = data[:, 1] - 1
    return data
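
# Resulting mapping: 0 -> -1, 1 -> 1, 2 -> 0, a zero-centred target suited
# to a tanh-style output, in contrast to one_hot_encoding_type above.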
def to_sparse_representation(lob, n_levels):
    """Map a dense LOB snapshot onto a fixed grid of n_levels one-tick price
    levels per side, indexed by tick distance from the best quote."""
    if not isinstance(lob, np.ndarray):
        lob = np.array(lob)
    sparse_lob = np.zeros(n_levels * 2)
    # Levels alternate ask/bid: lob[0] is the best ask price, lob[2] the
    # best bid price, and each price is followed by its size. Prices are in
    # units of 1e-4 USD, so a one-cent tick is 100 units.
    for j in range(lob.shape[0] // 2):
        if j % 2 == 0:
            # Ask side: tick distance above the best ask.
            ask_price = lob[0]
            current_ask_price = lob[j*2]
            depth = (current_ask_price - ask_price) // 100
            # The lower bound guards against malformed snapshots, where a
            # negative depth would silently index from the end of the array.
            if 0 <= depth < n_levels and int(lob[j*2]) != 0:
                sparse_lob[2*int(depth)] = lob[j*2+1]
        else:
            # Bid side: tick distance below the best bid.
            bid_price = lob[2]
            current_bid_price = lob[j*2]
            depth = (bid_price - current_bid_price) // 100
            if 0 <= depth < n_levels and int(lob[j*2]) != 0:
                sparse_lob[2*int(depth)+1] = lob[j*2+1]
    return sparse_lob
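
# Worked example (hypothetical prices): with n_levels=3, best ask 1000000
# and best bid 999900, an ask level at 1000200 has depth 2 and lands at
# index 4, while a bid level at 999800 has depth 1 and lands at index 3.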