LeonardoBerti committed on
Commit 69524d0 · verified · 1 Parent(s): f34e52f

Upload 51 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +4 -0
  2. .gitignore +12 -0
  3. .vscode/launch.json +21 -0
  4. Architecture.png +3 -0
  5. LICENSE +21 -0
  6. README.md +106 -0
  7. config/__pycache__/config.cpython-310.pyc +0 -0
  8. config/__pycache__/config.cpython-311.pyc +0 -0
  9. config/config.py +81 -0
  10. constants.py +64 -0
  11. data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_10_TLOB_seed_42.ckpt +3 -0
  12. data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_1_TLOB_seed_42.ckpt +3 -0
  13. data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_2_TLOB_seed_42.ckpt +3 -0
  14. data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_5_TLOB_seed_42.ckpt +3 -0
  15. fI-2010.png +3 -0
  16. main.py +75 -0
  17. models/__pycache__/bin.cpython-310.pyc +0 -0
  18. models/__pycache__/bin.cpython-311.pyc +0 -0
  19. models/__pycache__/binctabl.cpython-311.pyc +0 -0
  20. models/__pycache__/deeplob.cpython-311.pyc +0 -0
  21. models/__pycache__/engine.cpython-310.pyc +0 -0
  22. models/__pycache__/engine.cpython-311.pyc +0 -0
  23. models/__pycache__/mlp.cpython-310.pyc +0 -0
  24. models/__pycache__/mlp.cpython-311.pyc +0 -0
  25. models/__pycache__/transformer.cpython-311.pyc +0 -0
  26. models/bin.py +87 -0
  27. models/binctabl.py +129 -0
  28. models/deeplob.py +102 -0
  29. models/engine.py +294 -0
  30. models/mlplob.py +83 -0
  31. models/tlob.py +177 -0
  32. preprocessing/__pycache__/dataset.cpython-310.pyc +0 -0
  33. preprocessing/__pycache__/dataset.cpython-311.pyc +0 -0
  34. preprocessing/__pycache__/fi_2010.cpython-310.pyc +0 -0
  35. preprocessing/__pycache__/fi_2010.cpython-311.pyc +0 -0
  36. preprocessing/__pycache__/lobster.cpython-310.pyc +0 -0
  37. preprocessing/__pycache__/lobster.cpython-311.pyc +0 -0
  38. preprocessing/dataset.py +87 -0
  39. preprocessing/fi_2010.py +53 -0
  40. preprocessing/lobster.py +324 -0
  41. requirements.txt +19 -0
  42. run.py +434 -0
  43. tslaintc.png +0 -0
  44. utils/__pycache__/utils_data.cpython-311.pyc +0 -0
  45. utils/__pycache__/utils_model.cpython-310.pyc +0 -0
  46. utils/__pycache__/utils_model.cpython-311.pyc +0 -0
  47. utils/utils_data.py +238 -0
  48. utils/utils_model.py +18 -0
  49. visualizations/__pycache__/attentions.cpython-311.pyc +0 -0
  50. visualizations/attentions.py +30 -0
.gitattributes ADDED
@@ -0,0 +1,4 @@
1
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
2
+ data/checkpoints/TLOB/HuggingFace/*.ckpt filter=lfs diff=lfs merge=lfs -text
3
+ Architecture.png filter=lfs diff=lfs merge=lfs -text
4
+ fI-2010.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
1
+ wandb
2
+ outputs
3
+ .hydra
4
+ __pycache__
5
+ lightning_logs
6
+ data/d*
7
+ data/I*
8
+ data/T*
9
+ data/F*
10
+ preprocessing
11
+ env
12
+ data/checkpoints/TLOB/v*
.vscode/launch.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ // Use IntelliSense to learn about possible attributes.
3
+ // Hover to view descriptions of existing attributes.
4
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5
+ "version": "0.2.0",
6
+ "configurations": [
7
+
8
+ {
9
+ "name": "Python Debugger: Current File with Arguments",
10
+ "type": "debugpy",
11
+ "request": "launch",
12
+ "program": "${file}",
13
+ "console": "integratedTerminal",
14
+ "args": [
15
+ "+model=mlplob",
16
+ "hydra.job.chdir=False",
17
+ "hydra.run.dir=.",
18
+ ]
19
+ }
20
+ ]
21
+ }
Architecture.png ADDED

Git LFS Details

  • SHA256: 53c273d130efbb242654fde206303dfe7a6f5e62f6548639d40178191967de56
  • Pointer size: 131 Bytes
  • Size of remote file: 755 kB
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Leonardo Berti
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,106 @@
1
+ # TLOB: A Novel Transformer Model with Dual Attention for Stock Price Trend Prediction with Limit Order Book Data
2
+ This is the official repository for the paper TLOB: A Novel Transformer Model with Dual Attention for Stock Price Trend Prediction with Limit Order Book Data.
3
+ ![TLOB Architecture](https://github.com/LeonardoBerti00/TLOB/blob/main/Architecture.png)
4
+
5
+ ## Abstract
6
+ Stock Price Trend Prediction (SPTP) based on Limit Order Book
7
+ (LOB) data is a fundamental challenge in financial markets. Despite advances in deep learning, existing models fail to generalize across different market conditions and struggle to predict short-term trends reliably. Surprisingly, by adapting a simple MLP-based architecture to LOB, we show that we surpass SoTA performance, thus challenging the necessity of complex architectures. Unlike past work that shows robustness issues, we propose TLOB, a transformer-based model that uses a dual attention mechanism to capture spatial and temporal dependencies in LOB data. This allows it to adaptively focus on the market microstructure, making it particularly effective for longer-horizon predictions and volatile market conditions. We also introduce a new labeling method that improves on previous ones, removing the horizon bias. We evaluate TLOB’s effectiveness using the established FI-2010 benchmark, on which it exceeds the state-of-the-art by an average of 3.7 F1-score(%). Additionally, TLOB shows improvements on Tesla and Intel with a 1.3 and 7.7 increase in F1-score(%), respectively. Moreover, we empirically show how stock price predictability has declined over time (-6.68 absolute points in F1-score(%)), highlighting growing market efficiency. Predictability must be considered in relation to transaction costs, so we experimented with defining trends using an average spread, reflecting the primary transaction cost. The resulting performance deterioration underscores the complexity of translating trend classification into profitable trading strategies. We argue that our work provides new insights into the evolving landscape of stock price trend prediction and sets a strong foundation for future advancements in financial AI.
8
+
9
+ # Getting Started
10
+ These instructions will get you a copy of the project up and running on your local machine for development and reproducibility purposes.
11
+
12
+ ## Prerequisites
13
+ This project requires Python and pip. If you don't have them installed, please do so first. You can also use conda, but in that case you are on your own.
14
+
15
+ ## Installing
16
+ To set up the environment for this project, follow these steps:
17
+
18
+ 1. Clone the repository:
19
+ ```sh
20
+ git clone https://github.com/LeonardoBerti00/TLOB.git
21
+ ```
22
+ 2. Navigate to the project directory
23
+ 3. Create a virtual environment:
24
+ ```sh
25
+ python -m venv env
26
+ ```
27
+ 4. Activate the virtual environment (on Linux/macOS use `source env/bin/activate`):
28
+ ```sh
29
+ env\Scripts\activate
30
+ ```
31
+ 5. Install the required packages:
32
+ ```sh
33
+ pip install -r requirements.txt
34
+ ```
35
+
36
+ # Reproduce the results
37
+ To reproduce the results, follow these steps:
38
+
39
+ 1. Download the dataset from the [official website](https://etsin.fairdata.fi/dataset/73eb48d7-4dbc-4a10-a52a-da745b47a649/data).
40
+ 2. Unzip the data
41
+ 3. Create a folder FI-2010 inside your repository
42
+ 4. Copy these four files into the folder: "Test_Dst_NoAuction_ZScore_CF_7.txt", "Test_Dst_NoAuction_ZScore_CF_8.txt", "Test_Dst_NoAuction_ZScore_CF_9.txt", and "Train_Dst_NoAuction_ZScore_CF_7.txt"; you can delete the other files.
43
+ 5. In data/checkpoints/TLOB/HuggingFace/ you can find the four checkpoints for FI-2010; the checkpoints for TSLA and INTC did not fit within the free GitHub repository size limit. If you also need those checkpoints, you can contact me.
44
+ 6. Finally, inside the config file, set the dataset to FI-2010, set the horizon to 1, 2, 5, or 10, set the checkpoint_reference variable to the path of the checkpoint with the same horizon, and set the type to EVALUATION (a minimal sketch follows the note below).
45
+ 7. Now run:
46
+ ```sh
47
+ python main.py +model=tlob hydra.job.chdir=False
48
+ ```
49
+ Note that the horizons in the paper are an order of magnitude higher, because there the values represent the horizons before the sampling process of the dataset; the dataset is sampled every 10 events.
50
+
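+ As an illustration of step 6, here is a minimal sketch of the Experiment fields to edit in config/config.py, assuming you evaluate the horizon-5 FI-2010 checkpoint shipped with the repository (the remaining fields keep their defaults):
+ ```python
+ from dataclasses import dataclass, field
+ from constants import Dataset
+
+ # Sketch of the Experiment fields (config/config.py) to set for evaluation.
+ # The checkpoint path below is the horizon-5 FI-2010 checkpoint in this repository;
+ # pick the file matching your chosen horizon.
+ @dataclass
+ class Experiment:
+     type: list = field(default_factory=lambda: ["EVALUATION"])
+     dataset_type: Dataset = Dataset.FI_2010
+     horizon: int = 5
+     checkpoint_reference: str = "data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_5_TLOB_seed_42.ckpt"
+ ```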
51
+ # Training
52
+ If your objective is to train a TLOB or MLPLOB model, or to implement your own model, follow these steps.
53
+
54
+ ## Data
55
+ If you have some LOBSTER data, follow these steps:
56
+ 1. The data should follow the LOBSTER format: f"{year}-{month}-{day}_34200000_57600000_{type}", and it should be saved in f"data/{stock_name}/{stock_name}_{year}-{start_month}-{start_day}_{year}-{end_month}-{end_day}". Type can be either message or orderbook.
57
+ 2. Inside the config file, set the name of the training stock and the testing stocks, and set the dataset to LOBSTER. Currently you can add only one stock for training, but several for testing.
58
+ 3. Start the pre-processing step: set is_data_preprocessed to False in the config and run python main.py (a sketch of the expected file layout follows below).
59
+
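+ As an illustration of step 1, here is a small sketch that builds the expected paths from the format strings above; the stock name and date range are hypothetical example values, not files shipped with the repository:
+ ```python
+ # Sketch of the expected LOBSTER file layout (format strings from step 1).
+ # "TSLA" and the dates are hypothetical examples.
+ stock_name = "TSLA"
+ year = "2015"
+ start_month, start_day = "01", "02"   # first trading day of the range
+ end_month, end_day = "01", "30"       # last trading day of the range
+
+ folder = f"data/{stock_name}/{stock_name}_{year}-{start_month}-{start_day}_{year}-{end_month}-{end_day}"
+ # one message file and one orderbook file per trading day inside that folder
+ month, day = "01", "02"               # an example trading day
+ for file_type in ("message", "orderbook"):
+     print(f"{folder}/{year}-{month}-{day}_34200000_57600000_{file_type}")
+ ```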
60
+ Otherwise, if you want to train and test the model with the benchmark dataset FI-2010, follow these steps:
61
+ 1. Download the dataset from the [official website](https://etsin.fairdata.fi/dataset/73eb48d7-4dbc-4a10-a52a-da745b47a649/data).
62
+ 2. Unzip the data
63
+ 3. Create a folder FI-2010 inside your repository
64
+ 4. Copy these four files into the folder: "Test_Dst_NoAuction_ZScore_CF_7.txt", "Test_Dst_NoAuction_ZScore_CF_8.txt", "Test_Dst_NoAuction_ZScore_CF_9.txt", and "Train_Dst_NoAuction_ZScore_CF_7.txt"; you can delete the other files.
65
+ 5. Finally, inside the config file, set the dataset to FI-2010 and set the horizon to 1, 2, 5, or 10.
66
+ Note that the horizons in the paper are an order of magnitude higher, because there the values represent the horizons before the sampling process of the dataset; the dataset is sampled every 10 events.
67
+
68
+ ## Training a TLOB, MLPLOB, DeepLOB or BiNCTABL Model
69
+ To train a TLOB, MLPLOB, DeepLOB or BiNCTABL Model, you need to set the type variable in the config file to TRAINING, then run this command:
70
+ ```sh
71
+ python main.py +model={model_name} hydra.job.chdir=False
72
+ ```
73
+ You can see all the model names in the config file.
74
+
75
+ ## Implementing and Training a new model
76
+ To implement a new model, follow these steps:
77
+ 1. Implement your model class in the models/ directory. Your model takes as input a tensor of dimension [batch_size, seq_len, num_features] and should output a tensor of dimension [batch_size, 3] (see the sketch after this list).
78
+ 2. Add your model to pick_model in utils/utils_model.py.
79
+ 3. Update the config file to include your model and its hyperparameters. For the FI-2010 dataset, it is suggested to set hidden_dim to 40 and all_features to False to use only the LOB as input, or hidden_dim to 144 and all_features to True to use the LOB plus market features. For LOBSTER data, it is suggested to set hidden_dim to 46 and all_features to True to use the LOB and orders, or all_features to False to use only the LOB.
80
+ 4. Register your model with cs.store, as is done for the other models.
81
+ 5. Run the training script:
82
+ ```sh
83
+ python main.py +model={your_model_name} hydra.job.chdir=False
84
+ ```
85
+ 6. You can override any configuration option using Hydra's command-line syntax.
86
+ 7. A checkpoint will be saved in data/checkpoints/
87
+
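+ As a minimal sketch for steps 1 and 2, here is a hypothetical custom model; "MyModel" and its layers are placeholders, not part of this repository:
+ ```python
+ import torch
+ from torch import nn
+
+ # Hypothetical custom model: input [batch_size, seq_len, num_features] -> output [batch_size, 3].
+ class MyModel(nn.Module):
+     def __init__(self, hidden_dim: int, seq_size: int, num_features: int) -> None:
+         super().__init__()
+         self.proj = nn.Linear(num_features, hidden_dim)   # embed each LOB snapshot
+         self.head = nn.Linear(hidden_dim * seq_size, 3)   # 3 trend classes
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.proj(x)                  # [batch_size, seq_len, hidden_dim]
+         return self.head(x.flatten(1))    # [batch_size, 3]
+ ```
+ You would then add a branch for it in pick_model (utils/utils_model.py) and register a config dataclass for it with cs.store, mirroring the existing MLPLOB and TLOB entries in config/config.py.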
88
+ Optionally, you can also log the run with wandb or run a sweep by changing the experiment options in the config.
89
+
90
+ # Results
91
+ MLPLOB and TLOB outperform all the other SoTA deep learning models for Stock Price Trend Prediction with LOB data on both datasets, the FI-2010 benchmark and TSLA-INTC.
92
+ ![FI-2010 results](https://github.com/LeonardoBerti00/TLOB/blob/main/fI-2010.png)
93
+ ![TSLA and INTC results](https://github.com/LeonardoBerti00/TLOB/blob/main/tslaintc.png)
94
+
95
+ # Citation
96
+ ```sh
97
+ @misc{berti2025tlobnoveltransformermodel,
98
+ title={TLOB: A Novel Transformer Model with Dual Attention for Stock Price Trend Prediction with Limit Order Book Data},
99
+ author={Leonardo Berti and Gjergji Kasneci},
100
+ year={2025},
101
+ eprint={2502.15757},
102
+ archivePrefix={arXiv},
103
+ primaryClass={q-fin.ST},
104
+ url={https://arxiv.org/abs/2502.15757},
105
+ }
106
+ ```
config/__pycache__/config.cpython-310.pyc ADDED
Binary file (4.55 kB). View file
 
config/__pycache__/config.cpython-311.pyc ADDED
Binary file (7.64 kB). View file
 
config/config.py ADDED
@@ -0,0 +1,81 @@
1
+ from typing import List
2
+ from hydra.core.config_store import ConfigStore
3
+ from dataclasses import dataclass, field
4
+ from constants import Dataset, ModelType
5
+ from omegaconf import MISSING, OmegaConf
6
+
7
+
8
+ @dataclass
9
+ class Model:
10
+ hyperparameters_fixed: dict = MISSING
11
+ hyperparameters_sweep: dict = MISSING
12
+ type: ModelType = MISSING
13
+
14
+
15
+ @dataclass
16
+ class MLPLOB(Model):
17
+ hyperparameters_fixed: dict = field(default_factory=lambda: {"num_layers": 3, "hidden_dim": 144, "lr": 0.0003, "seq_size": 384, "all_features": True})
18
+ hyperparameters_sweep: dict = field(default_factory=lambda: {"num_layers": [3, 6], "hidden_dim": [128], "lr": [0.0003], "seq_size": [384]})
19
+ type: ModelType = ModelType.MLPLOB
20
+
21
+
22
+ @dataclass
23
+ class TLOB(Model):
24
+ hyperparameters_fixed: dict = field(default_factory=lambda: {"num_layers": 4, "hidden_dim": 144, "num_heads": 1, "is_sin_emb": True, "lr": 0.0001, "seq_size": 128, "all_features": True})
25
+ hyperparameters_sweep: dict = field(default_factory=lambda: {"num_layers": [4, 6], "hidden_dim": [128, 256], "num_heads": [1], "is_sin_emb": [True], "lr": [0.0001], "seq_size": [128]})
26
+ type: ModelType = ModelType.TLOB
27
+
28
+ @dataclass
29
+ class BiNCTABL(Model):
30
+ hyperparameters_fixed: dict = field(default_factory=lambda: {"lr": 0.001, "seq_size": 10, "all_features": False})
31
+ hyperparameters_sweep: dict = field(default_factory=lambda: {"lr": [0.001], "seq_size": [10]})
32
+ type: ModelType = ModelType.BINCTABL
33
+
34
+ @dataclass
35
+ class DeepLOB(Model):
36
+ hyperparameters_fixed: dict = field(default_factory=lambda: {"lr": 0.01, "seq_size": 100, "all_features": False})
37
+ hyperparameters_sweep: dict = field(default_factory=lambda: {"lr": [0.01], "seq_size": [100]})
38
+ type: ModelType = ModelType.DEEPLOB
39
+
40
+ @dataclass
41
+ class Experiment:
42
+ is_data_preprocessed: bool = True
43
+ is_wandb: bool = False
44
+ is_sweep: bool = False
45
+ type: list = field(default_factory=lambda: ["EVALUATION"])
46
+ is_debug: bool = False
47
+ checkpoint_reference: str = "data/checkpoints/TLOB/val_loss=0.188_epoch=4_FI-2010_seq_size_128_horizon_10_nu_4_hi_144_nu_1_is_True_lr_0.0001_se_128_al_True_ty_TLOB_seed_42.ckpt"
48
+ dataset_type: Dataset = Dataset.FI_2010
49
+ sampling_type: str = "quantity" #time or quantity
50
+ sampling_time: str = "" #seconds
51
+ sampling_quantity: int = 500
52
+ training_stocks: list = field(default_factory=lambda: ["INTC"])
53
+ testing_stocks: list = field(default_factory=lambda: ["INTC"])
54
+ seed: int = 22
55
+ horizon: int = 5
56
+ max_epochs: int = 10
57
+ if dataset_type == Dataset.FI_2010:
58
+ batch_size: int = 32
59
+ else:
60
+ batch_size: int = 128
61
+ filename_ckpt: str = "model.ckpt"
62
+ optimizer: str = "Adam"
63
+
64
+ defaults = [Model, Experiment]
65
+
66
+ @dataclass
67
+ class Config:
68
+ model: Model
69
+ experiment: Experiment = field(default_factory=Experiment)
70
+ defaults: List = field(default_factory=lambda: [
71
+ {"hydra/job_logging": "disabled"},
72
+ {"hydra/hydra_logging": "disabled"},
73
+ "_self_"
74
+ ])
75
+
76
+ cs = ConfigStore.instance()
77
+ cs.store(name="config", node=Config)
78
+ cs.store(group="model", name="mlplob", node=MLPLOB)
79
+ cs.store(group="model", name="tlob", node=TLOB)
80
+ cs.store(group="model", name="binctabl", node=BiNCTABL)
81
+ cs.store(group="model", name="deeplob", node=DeepLOB)
constants.py ADDED
@@ -0,0 +1,64 @@
1
+ import torch
2
+ from enum import Enum
3
+
4
+ class Dataset(Enum):
5
+ LOBSTER = "LOBSTER"
6
+ FI_2010 = "FI-2010"
7
+
8
+
9
+ class ModelType(Enum):
10
+ MLPLOB = "MLPLOB"
11
+ TLOB = "TLOB"
12
+ BINCTABL = "BINCTABL"
13
+ DEEPLOB = "DEEPLOB"
14
+
15
+
16
+
17
+ # for 15 days of TSLA
18
+ TSLA_LOB_MEAN_SIZE_10 = 165.44670902537212
19
+ TSLA_LOB_STD_SIZE_10 = 481.7127061897184
20
+ TSLA_LOB_MEAN_PRICE_10 = 20180.439318660694
21
+ TSLA_LOB_STD_PRICE_10 = 814.8782058033195
22
+
23
+ TSLA_EVENT_MEAN_SIZE = 88.09459295373463
24
+ TSLA_EVENT_STD_SIZE = 86.55913199110894
25
+ TSLA_EVENT_MEAN_PRICE = 20178.610720500274
26
+ TSLA_EVENT_STD_PRICE = 813.8188032145645
27
+ TSLA_EVENT_MEAN_TIME = 0.08644932804905886
28
+ TSLA_EVENT_STD_TIME = 0.3512181506722207
29
+ TSLA_EVENT_MEAN_DEPTH = 7.365325300819055
30
+ TSLA_EVENT_STD_DEPTH = 8.59342838063813
31
+
32
+ # for 15 days of INTC
33
+ INTC_LOB_MEAN_SIZE_10 = 6222.424274871972
34
+ INTC_LOB_STD_SIZE_10 = 7538.341086370264
35
+ INTC_LOB_MEAN_PRICE_10 = 3635.766219937785
36
+ INTC_LOB_STD_PRICE_10 = 44.15649995373795
37
+
38
+ INTC_EVENT_MEAN_SIZE = 324.6800802006092
39
+ INTC_EVENT_STD_SIZE = 574.5781447696605
40
+ INTC_EVENT_MEAN_PRICE = 3635.78165265669
41
+ INTC_EVENT_STD_PRICE = 43.872407609651184
42
+ INTC_EVENT_MEAN_TIME = 0.025201754040915927
43
+ INTC_EVENT_STD_TIME = 0.11013627432323592
44
+ INTC_EVENT_MEAN_DEPTH = 1.3685517399834501
45
+ INTC_EVENT_STD_DEPTH = 2.333747222206966
46
+
47
+
48
+
49
+
50
+ LOBSTER_HORIZONS = [10, 20, 50, 100]
51
+ PRECISION = 32
52
+ N_LOB_LEVELS = 10
53
+ LEN_LEVEL = 4
54
+ LEN_ORDER = 6
55
+ LEN_SMOOTH = 10
56
+
57
+ DATE_TRADING_DAYS = ["2015-01-02", "2015-01-30"]
58
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
59
+ DIR_EXPERIMENTS = "data/experiments"
60
+ DIR_SAVED_MODEL = "data/checkpoints"
61
+ DATA_DIR = "data"
62
+ RECON_DIR = "data/reconstructions"
63
+ PROJECT_NAME = ""
64
+ SPLIT_RATES = [0.8, 0.1, 0.1]
data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_10_TLOB_seed_42.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bcf32a73fe974b57778d0436132f18a76c3e71f38af2fdc1068f711bc28bcd96
3
+ size 32098955
data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_1_TLOB_seed_42.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f90706b3df6cfebb855e6a2ff3b8b32ce7c1e3301067ad8ef26c36214214e8ca
3
+ size 32098955
data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_2_TLOB_seed_42.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:641014f18977c93c6502a9c21f579d20ce9edc7e3d64b63c90716b5682273c57
3
+ size 32098955
data/checkpoints/TLOB/HuggingFace/FI-2010_horizon_5_TLOB_seed_42.ckpt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d606d22ac0f1fad2238787a30d111af7dc975d0ea23c48a11c4c797db74bf5d
3
+ size 32098955
fI-2010.png ADDED

Git LFS Details

  • SHA256: 051ada3c1e457825875ae998006742309535424186025e8c61a7c703fff9fa00
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
main.py ADDED
@@ -0,0 +1,75 @@
1
+ import random
2
+ import warnings
3
+ warnings.filterwarnings("ignore")
4
+ import numpy as np
5
+ import torchvision
6
+ import wandb
7
+ import torch
8
+ torchvision.disable_beta_transforms_warning()
9
+ import constants as cst
10
+ import hydra
11
+ from config.config import Config
12
+ from run import run_wandb, run, sweep_init
13
+ from preprocessing.lobster import LOBSTERDataBuilder
14
+ from constants import Dataset
15
+ from config.config import MLPLOB, TLOB
16
+
17
+ @hydra.main(config_path="config", config_name="config")
18
+ def hydra_app(config: Config):
19
+ set_reproducibility(config.experiment.seed)
20
+ if (cst.DEVICE == "cpu"):
21
+ accelerator = "cpu"
22
+ else:
23
+ accelerator = "gpu"
24
+ if config.experiment.dataset_type == Dataset.FI_2010:
25
+ config.experiment.batch_size = 32
26
+ if config.model.type.value == "MLPLOB" or config.model.type.value == "TLOB":
27
+ config.model.hyperparameters_fixed["hidden_dim"] = 144
28
+ else:
29
+ config.experiment.batch_size = 128
30
+ if config.model.type.value == "MLPLOB" or config.model.type.value == "TLOB":
31
+ config.model.hyperparameters_fixed["hidden_dim"] = 46
32
+
33
+ if config.experiment.dataset_type.value == "LOBSTER" and not config.experiment.is_data_preprocessed:
34
+ # prepare the datasets, this will save train.npy, val.npy and test.npy in the data directory
35
+ data_builder = LOBSTERDataBuilder(
36
+ stocks=config.experiment.training_stocks,
37
+ data_dir=cst.DATA_DIR,
38
+ date_trading_days=cst.DATE_TRADING_DAYS,
39
+ split_rates=cst.SPLIT_RATES,
40
+ sampling_type=config.experiment.sampling_type,
41
+ sampling_time=config.experiment.sampling_time,
42
+ sampling_quantity=config.experiment.sampling_quantity,
43
+ )
44
+ data_builder.prepare_save_datasets()
45
+
46
+ if config.experiment.is_wandb:
47
+ if config.experiment.is_sweep:
48
+ sweep_config = sweep_init(config)
49
+ sweep_id = wandb.sweep(sweep_config, project=cst.PROJECT_NAME, entity="")
50
+ wandb.agent(sweep_id, run_wandb(config, accelerator), count=sweep_config["run_cap"])
51
+ else:
52
+ start_wandb = run_wandb(config, accelerator)
53
+ start_wandb()
54
+
55
+ # training without using wandb
56
+ else:
57
+ run(config, accelerator)
58
+
59
+
60
+ def set_reproducibility(seed):
61
+ torch.manual_seed(seed)
62
+ np.random.seed(seed)
63
+ random.seed(seed)
64
+
65
+ def set_torch():
66
+ torch.set_default_dtype(torch.float32)
67
+ torch.backends.cuda.matmul.allow_tf32 = True
68
+ torch.backends.cudnn.allow_tf32 = True
69
+ torch.autograd.set_detect_anomaly(False)
70
+ torch.set_float32_matmul_precision('high')
71
+
72
+ if __name__ == "__main__":
73
+ set_torch()
74
+ hydra_app()
75
+
models/__pycache__/bin.cpython-310.pyc ADDED
Binary file (1.95 kB). View file
 
models/__pycache__/bin.cpython-311.pyc ADDED
Binary file (4.98 kB). View file
 
models/__pycache__/binctabl.cpython-311.pyc ADDED
Binary file (8.74 kB). View file
 
models/__pycache__/deeplob.cpython-311.pyc ADDED
Binary file (5.55 kB). View file
 
models/__pycache__/engine.cpython-310.pyc ADDED
Binary file (9.72 kB). View file
 
models/__pycache__/engine.cpython-311.pyc ADDED
Binary file (21.4 kB). View file
 
models/__pycache__/mlp.cpython-310.pyc ADDED
Binary file (2.47 kB). View file
 
models/__pycache__/mlp.cpython-311.pyc ADDED
Binary file (5.73 kB). View file
 
models/__pycache__/transformer.cpython-311.pyc ADDED
Binary file (13.2 kB). View file
 
models/bin.py ADDED
@@ -0,0 +1,87 @@
1
+ import torch
2
+ from torch import nn
3
+ import constants as cst
4
+
5
+ class BiN(nn.Module):
6
+ def __init__(self, d1, t1):
7
+ super().__init__()
8
+ self.t1 = t1
9
+ self.d1 = d1
10
+
11
+ bias1 = torch.Tensor(t1, 1)
12
+ self.B1 = nn.Parameter(bias1)
13
+ nn.init.constant_(self.B1, 0)
14
+
15
+ l1 = torch.Tensor(t1, 1)
16
+ self.l1 = nn.Parameter(l1)
17
+ nn.init.xavier_normal_(self.l1)
18
+
19
+ bias2 = torch.Tensor(d1, 1)
20
+ self.B2 = nn.Parameter(bias2)
21
+ nn.init.constant_(self.B2, 0)
22
+
23
+ l2 = torch.Tensor(d1, 1)
24
+ self.l2 = nn.Parameter(l2)
25
+ nn.init.xavier_normal_(self.l2)
26
+
27
+ y1 = torch.Tensor(1, )
28
+ self.y1 = nn.Parameter(y1)
29
+ nn.init.constant_(self.y1, 0.5)
30
+
31
+ y2 = torch.Tensor(1, )
32
+ self.y2 = nn.Parameter(y2)
33
+ nn.init.constant_(self.y2, 0.5)
34
+
35
+ def forward(self, x):
36
+
37
+ # if either of the two scalars becomes negative, we reset it to a small positive value
38
+ if (self.y1[0] < 0):
39
+ y1 = torch.cuda.FloatTensor(1, )
40
+ self.y1 = nn.Parameter(y1)
41
+ nn.init.constant_(self.y1, 0.01)
42
+
43
+ if (self.y2[0] < 0):
44
+ y2 = torch.cuda.FloatTensor(1, )
45
+ self.y2 = nn.Parameter(y2)
46
+ nn.init.constant_(self.y2, 0.01)
47
+
48
+ # normalization along the temporal dimension
49
+ T2 = torch.ones([self.t1, 1], device=cst.DEVICE)
50
+ x2 = torch.mean(x, dim=2)
51
+ x2 = torch.reshape(x2, (x2.shape[0], x2.shape[1], 1))
52
+
53
+ std = torch.std(x, dim=2)
54
+ std = torch.reshape(std, (std.shape[0], std.shape[1], 1))
55
+ # the std of some temporal slices can be 0, which would produce inf values, so we set those entries to one
56
+ std[std < 1e-4] = 1
57
+
58
+ diff = x - (x2 @ (T2.T))
59
+ Z2 = diff / (std @ (T2.T))
60
+
61
+ X2 = self.l2 @ T2.T
62
+ X2 = X2 * Z2
63
+ X2 = X2 + (self.B2 @ T2.T)
64
+
65
+ # normalization along the feature dimension
66
+ T1 = torch.ones([self.d1, 1], device=cst.DEVICE)
67
+ x1 = torch.mean(x, dim=1)
68
+ x1 = torch.reshape(x1, (x1.shape[0], x1.shape[1], 1))
69
+
70
+ std = torch.std(x, dim=1)
71
+ std = torch.reshape(std, (std.shape[0], std.shape[1], 1))
72
+
73
+ op1 = x1 @ T1.T
74
+ op1 = torch.permute(op1, (0, 2, 1))
75
+
76
+ op2 = std @ T1.T
77
+ op2 = torch.permute(op2, (0, 2, 1))
78
+
79
+ z1 = (x - op1) / (op2)
80
+ X1 = (T1 @ self.l1.T)
81
+ X1 = X1 * z1
82
+ X1 = X1 + (T1 @ self.B1.T)
83
+
84
+ # weighting the importance of temporal and feature normalization
85
+ x = self.y1 * X1 + self.y2 * X2
86
+
87
+ return x
models/binctabl.py ADDED
@@ -0,0 +1,129 @@
1
+ from torch import nn
2
+ from models.bin import BiN
3
+ import torch
4
+ import constants as cst
5
+
6
+ class TABL_layer(nn.Module):
7
+ def __init__(self, d2, d1, t1, t2):
8
+ super().__init__()
9
+ self.t1 = t1
10
+
11
+ weight = torch.Tensor(d2, d1)
12
+ self.W1 = nn.Parameter(weight)
13
+ nn.init.kaiming_uniform_(self.W1, nonlinearity='relu')
14
+
15
+ weight2 = torch.Tensor(t1, t1)
16
+ self.W = nn.Parameter(weight2)
17
+ nn.init.constant_(self.W, 1/t1)
18
+
19
+ weight3 = torch.Tensor(t1, t2)
20
+ self.W2 = nn.Parameter(weight3)
21
+ nn.init.kaiming_uniform_(self.W2, nonlinearity='relu')
22
+
23
+ bias1 = torch.Tensor(d2, t2)
24
+ self.B = nn.Parameter(bias1)
25
+ nn.init.constant_(self.B, 0)
26
+
27
+ l = torch.Tensor(1,)
28
+ self.l = nn.Parameter(l)
29
+ nn.init.constant_(self.l, 0.5)
30
+
31
+ self.activation = nn.ReLU()
32
+
33
+ def forward(self, X):
34
+
35
+ #maintaining the weight parameter between 0 and 1.
36
+ if (self.l[0] < 0):
37
+ l = torch.Tensor(1,).to(cst.DEVICE)
38
+ self.l = nn.Parameter(l)
39
+ nn.init.constant_(self.l, 0.0)
40
+
41
+ if (self.l[0] > 1):
42
+ l = torch.Tensor(1,).to(cst.DEVICE)
43
+ self.l = nn.Parameter(l)
44
+ nn.init.constant_(self.l, 1.0)
45
+
46
+ #modelling the dependence along the first mode of X while keeping the temporal order intact (7)
47
+ X = self.W1 @ X
48
+
49
+ #enforcing constant (1) on the diagonal
50
+ W = self.W -self.W *torch.eye(self.t1,dtype=torch.float32).to(cst.DEVICE)+torch.eye(self.t1,dtype=torch.float32).to(cst.DEVICE)/self.t1
51
+
52
+ #attention, the aim of the second step is to learn how important the temporal instances are to each other (8)
53
+ E = X @ W
54
+
55
+ #computing the attention mask (9)
56
+ A = torch.softmax(E, dim=-1)
57
+
58
+ #applying a soft attention mechanism (10)
59
+ #the attention mask A obtained from the third step is used to zero out the effect of unimportant elements
60
+ X = self.l[0] * (X) + (1.0 - self.l[0])*X*A
61
+
62
+ #the final step of the proposed layer estimates the temporal mapping W2, after the bias shift (11)
63
+ y = X @ self.W2 + self.B
64
+ return y
65
+
66
+ class BL_layer(nn.Module):
67
+ def __init__(self, d2, d1, t1, t2):
68
+ super().__init__()
69
+ weight1 = torch.Tensor(d2, d1)
70
+ self.W1 = nn.Parameter(weight1)
71
+ nn.init.kaiming_uniform_(self.W1, nonlinearity='relu')
72
+
73
+ weight2 = torch.Tensor(t1, t2)
74
+ self.W2 = nn.Parameter(weight2)
75
+ nn.init.kaiming_uniform_(self.W2, nonlinearity='relu')
76
+
77
+ bias1 = torch.zeros((d2, t2))
78
+ self.B = nn.Parameter(bias1)
79
+ nn.init.constant_(self.B, 0)
80
+
81
+ self.activation = nn.ReLU()
82
+
83
+ def forward(self, x):
84
+
85
+ x = self.activation(self.W1 @ x @ self.W2 + self.B)
86
+
87
+ return x
88
+
89
+ class BiN_CTABL(nn.Module):
90
+ def __init__(self, d2, d1, t1, t2, d3, t3, d4, t4):
91
+ super().__init__()
92
+
93
+ self.BiN = BiN(d1, t1)
94
+ self.BL = BL_layer(d2, d1, t1, t2)
95
+ self.BL2 = BL_layer(d3, d2, t2, t3)
96
+ self.TABL = TABL_layer(d4, d3, t3, t4)
97
+ self.dropout = nn.Dropout(0.1)
98
+
99
+ def forward(self, x):
100
+ x = x.permute(0, 2, 1)
101
+ #first of all we pass the input to the BiN layer, then we use the C(TABL) architecture
102
+ x = self.BiN(x)
103
+
104
+ self.max_norm_(self.BL.W1.data)
105
+ self.max_norm_(self.BL.W2.data)
106
+ x = self.BL(x)
107
+ x = self.dropout(x)
108
+
109
+ self.max_norm_(self.BL2.W1.data)
110
+ self.max_norm_(self.BL2.W2.data)
111
+ x = self.BL2(x)
112
+ x = self.dropout(x)
113
+
114
+ self.max_norm_(self.TABL.W1.data)
115
+ self.max_norm_(self.TABL.W.data)
116
+ self.max_norm_(self.TABL.W2.data)
117
+ x = self.TABL(x)
118
+ x = torch.squeeze(x)
119
+ x = torch.softmax(x, 1)
120
+
121
+ return x
122
+
123
+ def max_norm_(self, w):
124
+ with torch.no_grad():
125
+ if (torch.linalg.matrix_norm(w) > 10.0):
126
+ norm = torch.linalg.matrix_norm(w)
127
+ desired = torch.clamp(norm, min=0.0, max=10.0)
128
+ w *= (desired / (1e-8 + norm))
129
+
models/deeplob.py ADDED
@@ -0,0 +1,102 @@
1
+ from torch import nn
2
+ import torch
3
+
4
+
5
+ class DeepLOB(nn.Module):
6
+ def __init__(self):
7
+ super().__init__()
8
+
9
+
10
+ # convolution blocks
11
+ self.conv1 = nn.Sequential(
12
+ nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(1, 2), stride=(1, 2)),
13
+ nn.LeakyReLU(negative_slope=0.01),
14
+ # nn.Tanh(),
15
+ nn.BatchNorm2d(32),
16
+ nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
17
+ nn.LeakyReLU(negative_slope=0.01),
18
+ nn.BatchNorm2d(32),
19
+ nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
20
+ nn.LeakyReLU(negative_slope=0.01),
21
+ nn.BatchNorm2d(32),
22
+ )
23
+
24
+ self.conv2 = nn.Sequential(
25
+ nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1, 2), stride=(1, 2)),
26
+ nn.Tanh(),
27
+ nn.BatchNorm2d(32),
28
+ nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
29
+ nn.Tanh(),
30
+ nn.BatchNorm2d(32),
31
+ nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
32
+ nn.Tanh(),
33
+ nn.BatchNorm2d(32),
34
+ )
35
+
36
+ self.conv3 = nn.Sequential(
37
+ nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(1, 10)),
38
+ nn.LeakyReLU(negative_slope=0.01),
39
+ nn.BatchNorm2d(32),
40
+ nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
41
+ nn.LeakyReLU(negative_slope=0.01),
42
+ nn.BatchNorm2d(32),
43
+ nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(4, 1)),
44
+ nn.LeakyReLU(negative_slope=0.01),
45
+ nn.BatchNorm2d(32),
46
+ )
47
+
48
+ # inception modules
49
+ self.inp1 = nn.Sequential(
50
+ nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1, 1), padding='same'),
51
+ nn.LeakyReLU(negative_slope=0.01),
52
+ nn.BatchNorm2d(64),
53
+ nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 1), padding='same'),
54
+ nn.LeakyReLU(negative_slope=0.01),
55
+ nn.BatchNorm2d(64),
56
+ )
57
+
58
+ self.inp2 = nn.Sequential(
59
+ nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1, 1), padding='same'),
60
+ nn.LeakyReLU(negative_slope=0.01),
61
+ nn.BatchNorm2d(64),
62
+ nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(5, 1), padding='same'),
63
+ nn.LeakyReLU(negative_slope=0.01),
64
+ nn.BatchNorm2d(64),
65
+ )
66
+
67
+ self.inp3 = nn.Sequential(
68
+ nn.MaxPool2d((3, 1), stride=(1, 1), padding=(1, 0)),
69
+ nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(1, 1), padding='same'),
70
+ nn.LeakyReLU(negative_slope=0.01),
71
+ nn.BatchNorm2d(64),
72
+ )
73
+
74
+ # lstm layers
75
+ self.lstm = nn.LSTM(input_size=192, hidden_size=64, num_layers=1, batch_first=True)
76
+ self.fc1 = nn.Linear(64, 3)
77
+
78
+ self.softmax = nn.Softmax(dim=1)
79
+
80
+ def forward(self, x):
81
+ x = x[:, None, :, :] # None adds the channel dimension
82
+
83
+ x = self.conv1(x)
84
+ x = self.conv2(x)
85
+ x = self.conv3(x)
86
+
87
+ x_inp1 = self.inp1(x)
88
+ x_inp2 = self.inp2(x)
89
+ x_inp3 = self.inp3(x)
90
+
91
+ x = torch.cat((x_inp1, x_inp2, x_inp3), dim=1)
92
+
93
+ # x = torch.transpose(x, 1, 2)
94
+ x = x.permute(0, 2, 1, 3)
95
+ x = torch.reshape(x, (-1, x.shape[1], x.shape[2]))
96
+
97
+ out, _ = self.lstm(x)
98
+
99
+ out = out[:, -1, :]
100
+ out = self.fc1(out)
101
+ out = self.softmax(out)
102
+ return out
models/engine.py ADDED
@@ -0,0 +1,294 @@
1
+ import random
2
+ from lightning import LightningModule
3
+ import numpy as np
4
+ from sklearn.metrics import classification_report, precision_recall_curve
5
+ from torch import nn
6
+ import os
7
+ import torch
8
+ import matplotlib.pyplot as plt
9
+ import wandb
10
+ import seaborn as sns
11
+ from lion_pytorch import Lion
12
+ from torch_ema import ExponentialMovingAverage
13
+ from utils.utils_model import pick_model
14
+ import constants as cst
15
+ from scipy.stats import mode
16
+
17
+ from visualizations.attentions import plot_mean_att_distance
18
+
19
+
20
+ class Engine(LightningModule):
21
+ def __init__(
22
+ self,
23
+ seq_size,
24
+ horizon,
25
+ max_epochs,
26
+ model_type,
27
+ is_wandb,
28
+ experiment_type,
29
+ lr,
30
+ optimizer,
31
+ filename_ckpt,
32
+ num_features,
33
+ dataset_type,
34
+ num_layers=4,
35
+ hidden_dim=256,
36
+ num_heads=8,
37
+ is_sin_emb=True,
38
+ len_test_dataloader=None,
39
+ plot_att=False
40
+ ):
41
+ super().__init__()
42
+ self.seq_size = seq_size
43
+ self.dataset_type = dataset_type
44
+ self.horizon = horizon
45
+ self.max_epochs = max_epochs
46
+ self.model_type = model_type
47
+ self.num_heads = num_heads
48
+ self.is_wandb = is_wandb
49
+ self.len_test_dataloader = len_test_dataloader
50
+ self.lr = lr
51
+ self.optimizer = optimizer
52
+ self.filename_ckpt = filename_ckpt
53
+ self.hidden_dim = hidden_dim
54
+ self.num_layers = num_layers
55
+ self.num_features = num_features
56
+ self.experiment_type = experiment_type
57
+ self.model = pick_model(model_type, hidden_dim, num_layers, seq_size, num_features, num_heads, is_sin_emb, dataset_type)
58
+ self.ema = ExponentialMovingAverage(self.parameters(), decay=0.999)
59
+ self.ema.to(cst.DEVICE)
60
+ self.loss_function = nn.CrossEntropyLoss()
61
+ self.train_losses = []
62
+ self.val_losses = []
63
+ self.test_losses = []
64
+ self.test_targets = []
65
+ self.test_predictions = []
66
+ self.test_proba = []
67
+ self.val_targets = []
68
+ self.val_loss = np.inf
69
+ self.val_predictions = []
70
+ self.min_loss = np.inf
71
+ self.save_hyperparameters()
72
+ self.last_path_ckpt = None
73
+ self.first_test = True
74
+ self.plot_att = plot_att
75
+
76
+ def forward(self, x, plot_this_att=False, batch_idx=None):
77
+ if self.model_type == "TLOB":
78
+ output, att_temporal, att_feature = self.model(x, plot_this_att)
79
+ else:
80
+ output = self.model(x)
81
+ if self.is_wandb and plot_this_att and self.model_type == "TLOB":
82
+ for l in range(len(att_temporal)):
83
+ for i in range(self.num_heads):
84
+ plt.figure(figsize=(10, 8))
85
+ sns.heatmap(att_temporal[l, i], fmt=".2f", cmap="viridis")
86
+ plt.title(f'Temporal Attention Layer {l} Head {i}')
87
+ wandb.log({f"Temporal Attention Layer {l} Head {i} for batch {batch_idx}": wandb.Image(plt)})
88
+ plt.close()
89
+ for l in range(len(att_feature)):
90
+ for i in range(self.num_heads):
91
+ plt.figure(figsize=(10, 8))
92
+ sns.heatmap(att_feature[l, i], fmt=".2f", cmap="viridis")
93
+ plt.title(f'Feature Attention Layer {l} Head {i}')
94
+ wandb.log({f"Feature Attention Layer {l} Head {i} for batch {batch_idx}": wandb.Image(plt)})
95
+ plt.close()
96
+ return output
97
+
98
+ def loss(self, y_hat, y):
99
+ return self.loss_function(y_hat, y)
100
+
101
+ def training_step(self, batch, batch_idx):
102
+ x, y = batch
103
+ y_hat = self.forward(x)
104
+ batch_loss = self.loss(y_hat, y)
105
+ batch_loss_mean = torch.mean(batch_loss)
106
+ self.train_losses.append(batch_loss_mean.item())
107
+ self.ema.update()
108
+ if batch_idx % 1000 == 0:
109
+ print(f'train loss: {sum(self.train_losses) / len(self.train_losses)}')
110
+ return batch_loss_mean
111
+
112
+ def on_train_epoch_start(self) -> None:
113
+ print(f'learning rate: {self.optimizer.param_groups[0]["lr"]}')
114
+
115
+ def validation_step(self, batch, batch_idx):
116
+ x, y = batch
117
+ # Validation: with EMA
118
+ with self.ema.average_parameters():
119
+ y_hat = self.forward(x)
120
+ batch_loss = self.loss(y_hat, y)
121
+ self.val_targets.append(y.cpu().numpy())
122
+ self.val_predictions.append(y_hat.argmax(dim=1).cpu().numpy())
123
+ batch_loss_mean = torch.mean(batch_loss)
124
+ self.val_losses.append(batch_loss_mean.item())
125
+ return batch_loss_mean
126
+
127
+ def on_test_epoch_start(self):
128
+ # Sample 5 random batch indices from the test dataloader
129
+ random_indices = random.sample(range(self.len_test_dataloader), 5)
130
+ print(f'Random indices: {random_indices}')
131
+ self.random_indices = random_indices # Store the random indices if needed
132
+ return
133
+
134
+
135
+ def test_step(self, batch, batch_idx):
136
+ x, y = batch
137
+ # Test: with EMA
138
+ if batch_idx in self.random_indices and self.model_type == "TLOB" and self.first_test and self.plot_att:
139
+ plot_this_att = True
140
+ print(f'Plotting attention for batch {batch_idx}')
141
+ else:
142
+ plot_this_att = False
143
+ if self.experiment_type == "TRAINING":
144
+ with self.ema.average_parameters():
145
+ y_hat = self.forward(x, plot_this_att, batch_idx)
146
+ batch_loss = self.loss(y_hat, y)
147
+ self.test_targets.append(y.cpu().numpy())
148
+ self.test_predictions.append(y_hat.argmax(dim=1).cpu().numpy())
149
+ self.test_proba.append(torch.softmax(y_hat, dim=1)[:, 1].cpu().numpy())
150
+ batch_loss_mean = torch.mean(batch_loss)
151
+ self.test_losses.append(batch_loss_mean.item())
152
+ else:
153
+ y_hat = self.forward(x, plot_this_att, batch_idx)
154
+ batch_loss = self.loss(y_hat, y)
155
+ self.test_targets.append(y.cpu().numpy())
156
+ self.test_predictions.append(y_hat.argmax(dim=1).cpu().numpy())
157
+ self.test_proba.append(torch.softmax(y_hat, dim=1)[:, 1].cpu().numpy())
158
+ batch_loss_mean = torch.mean(batch_loss)
159
+ self.test_losses.append(batch_loss_mean.item())
160
+ return batch_loss_mean
161
+
162
+ def on_validation_epoch_start(self) -> None:
163
+ loss = sum(self.train_losses) / len(self.train_losses)
164
+ self.train_losses = []
165
+ if self.is_wandb:
166
+ wandb.log({"train_loss": loss})
167
+ print(f'Train loss on epoch {self.current_epoch}: {loss}')
168
+
169
+ def on_validation_epoch_end(self) -> None:
170
+ self.val_loss = sum(self.val_losses) / len(self.val_losses)
171
+ self.val_losses = []
172
+
173
+ # model checkpointing
174
+ if self.val_loss < self.min_loss:
175
+ # if the improvement is smaller than 0.001, we halve the learning rate
176
+ if self.val_loss - self.min_loss > -0.001:
177
+ self.optimizer.param_groups[0]["lr"] /= 2
178
+ self.min_loss = self.val_loss
179
+ self.model_checkpointing(self.val_loss)
180
+ else:
181
+ self.optimizer.param_groups[0]["lr"] /= 2
182
+
183
+ self.log("val_loss", self.val_loss)
184
+ print(f'Validation loss on epoch {self.current_epoch}: {self.val_loss}')
185
+ targets = np.concatenate(self.val_targets)
186
+ predictions = np.concatenate(self.val_predictions)
187
+ class_report = classification_report(targets, predictions, digits=4, output_dict=True)
188
+ print(classification_report(targets, predictions, digits=4))
189
+ self.log("val_f1_score", class_report["macro avg"]["f1-score"])
190
+ self.log("val_accuracy", class_report["accuracy"])
191
+ self.log("val_precision", class_report["macro avg"]["precision"])
192
+ self.log("val_recall", class_report["macro avg"]["recall"])
193
+ self.val_targets = []
194
+ self.val_predictions = []
195
+
196
+
197
+ def on_test_epoch_end(self) -> None:
198
+ targets = np.concatenate(self.test_targets)
199
+ predictions = np.concatenate(self.test_predictions)
200
+ class_report = classification_report(targets, predictions, digits=4, output_dict=True)
201
+ print(classification_report(targets, predictions, digits=4))
202
+ self.log("test_loss", sum(self.test_losses) / len(self.test_losses))
203
+ self.log("f1_score", class_report["macro avg"]["f1-score"])
204
+ self.log("accuracy", class_report["accuracy"])
205
+ self.log("precision", class_report["macro avg"]["precision"])
206
+ self.log("recall", class_report["macro avg"]["recall"])
207
+ filename_ckpt = ("val_loss=" + str(round(self.val_loss, 3)) +
208
+ "_epoch=" + str(self.current_epoch) +
209
+ "_" + self.filename_ckpt +
210
+ "last.ckpt"
211
+ )
212
+ path_ckpt = cst.DIR_SAVED_MODEL + "/" + str(self.model_type) + "/" + filename_ckpt
213
+ self.test_targets = []
214
+ self.test_predictions = []
215
+ self.test_losses = []
216
+ self.first_test = False
217
+ test_proba = np.concatenate(self.test_proba)
218
+ precision, recall, _ = precision_recall_curve(targets, test_proba, pos_label=1)
219
+ self.plot_pr_curves(recall, precision, self.is_wandb)
220
+ with self.ema.average_parameters():
221
+ self.trainer.save_checkpoint(path_ckpt)
222
+ if self.model_type == "TLOB" and self.plot_att:
223
+ plot = plot_mean_att_distance(np.array(self.model.mean_att_distance_temporal).mean(axis=0))
224
+ if self.is_wandb:
225
+ wandb.log({"mean_att_distance": wandb.Image(plot)})
226
+
227
+ def configure_optimizers(self):
228
+ if self.model_type == "DEEPLOB":
229
+ eps = 1
230
+ else:
231
+ eps = 1e-8
232
+ if self.optimizer == 'Adam':
233
+ self.optimizer = torch.optim.Adam(self.parameters(), lr=self.lr, eps=eps)
234
+ elif self.optimizer == 'SGD':
235
+ self.optimizer = torch.optim.SGD(self.parameters(), lr=self.lr, momentum=0.9)
236
+ elif self.optimizer == 'Lion':
237
+ self.optimizer = Lion(self.parameters(), lr=self.lr)
238
+ return self.optimizer
239
+
240
+ def _define_log_metrics(self):
241
+ wandb.define_metric("val_loss", summary="min")
242
+
243
+ def model_checkpointing(self, loss):
244
+ if self.last_path_ckpt is not None:
245
+ os.remove(self.last_path_ckpt)
246
+ filename_ckpt = ("val_loss=" + str(round(loss, 3)) +
247
+ "_epoch=" + str(self.current_epoch) +
248
+ "_" + self.filename_ckpt +
249
+ ".ckpt"
250
+ )
251
+ path_ckpt = cst.DIR_SAVED_MODEL + "/" + str(self.model_type) + "/" + filename_ckpt
252
+ with self.ema.average_parameters():
253
+ self.trainer.save_checkpoint(path_ckpt)
254
+ self.last_path_ckpt = path_ckpt
255
+
256
+ def plot_pr_curves(self, recall, precision, is_wandb):
257
+ plt.figure(figsize=(20, 10), dpi=80)
258
+ plt.plot(recall, precision, label='Precision-Recall', color='black')
259
+ plt.xlabel('Recall')
260
+ plt.ylabel('Precision')
261
+ plt.title('Precision-Recall Curve')
262
+ if is_wandb:
263
+ wandb.log({f"precision_recall_curve_{self.dataset_type}": wandb.Image(plt)})
264
+ plt.savefig(cst.DIR_SAVED_MODEL + "/" + str(self.model_type) + "/" +f"precision_recall_curve_{self.dataset_type}.svg")
265
+ #plt.show()
266
+ plt.close()
267
+
268
+ def compute_most_attended(att_feature):
269
+ ''' att_feature: list of tensors of shape (num_samples, num_layers, 2, num_heads, num_features) '''
270
+ att_feature = np.stack(att_feature)
271
+ att_feature = att_feature.transpose(1, 3, 0, 2, 4) # Use transpose instead of permute
272
+ ''' att_feature: shape (num_layers, num_heads, num_samples, 2, num_features) '''
273
+ indices = att_feature[:, :, :, 1]
274
+ values = att_feature[:, :, :, 0]
275
+ most_frequent_indices = np.zeros((indices.shape[0], indices.shape[1], indices.shape[3]), dtype=int)
276
+ average_values = np.zeros((indices.shape[0], indices.shape[1], indices.shape[3]))
277
+ for layer in range(indices.shape[0]):
278
+ for head in range(indices.shape[1]):
279
+ for seq in range(indices.shape[3]):
280
+ # Extract the indices for the current layer and sequence element
281
+ current_indices = indices[layer, head, :, seq]
282
+ current_values = values[layer, head, :, seq]
283
+ # Find the most frequent index
284
+ most_frequent_index = mode(current_indices, keepdims=False)[0]
285
+ # Store the result
286
+ most_frequent_indices[layer, head, seq] = most_frequent_index
287
+ # Compute the average value for the most frequent index
288
+ avg_value = np.mean(current_values[current_indices == most_frequent_index])
289
+ # Store the average value
290
+ average_values[layer, head, seq] = avg_value
291
+ return most_frequent_indices, average_values
292
+
293
+
294
+
models/mlplob.py ADDED
@@ -0,0 +1,83 @@
1
+ from torch import nn
2
+ import torch
3
+ from models.bin import BiN
4
+
5
+ class MLPLOB(nn.Module):
6
+ def __init__(self,
7
+ hidden_dim: int,
8
+ num_layers: int,
9
+ seq_size: int,
10
+ num_features: int,
11
+ dataset_type: str
12
+ ) -> None:
13
+ super().__init__()
14
+
15
+ self.hidden_dim = hidden_dim
16
+ self.num_layers = num_layers
17
+ self.dataset_type = dataset_type
18
+ self.layers = nn.ModuleList()
19
+ self.order_type_embedder = nn.Embedding(3, 1)
20
+ self.first_layer = nn.Linear(num_features, hidden_dim)
21
+ self.norm_layer = BiN(num_features, seq_size)
22
+ self.layers.append(self.first_layer)
23
+ self.layers.append(nn.GELU())
24
+ for i in range(num_layers):
25
+ if i != num_layers-1:
26
+ self.layers.append(MLP(hidden_dim, hidden_dim*4, hidden_dim))
27
+ self.layers.append(MLP(seq_size, seq_size*4, seq_size))
28
+ else:
29
+ self.layers.append(MLP(hidden_dim, hidden_dim*2, hidden_dim//4))
30
+ self.layers.append(MLP(seq_size, seq_size*2, seq_size//4))
31
+
32
+ total_dim = (hidden_dim//4)*(seq_size//4)
33
+ self.final_layers = nn.ModuleList()
34
+ while total_dim > 128:
35
+ self.final_layers.append(nn.Linear(total_dim, total_dim//4))
36
+ self.final_layers.append(nn.GELU())
37
+ total_dim = total_dim//4
38
+ self.final_layers.append(nn.Linear(total_dim, 3))
39
+
40
+ def forward(self, input):
41
+ if self.dataset_type == "LOBSTER":
42
+ continuous_features = torch.cat([input[:, :, :41], input[:, :, 42:]], dim=2)
43
+ order_type = input[:, :, 41].long()
44
+ order_type_emb = self.order_type_embedder(order_type).detach()
45
+ x = torch.cat([continuous_features, order_type_emb], dim=2)
46
+ else:
47
+ x = input
48
+ x = x.permute(0, 2, 1)
49
+ x = self.norm_layer(x)
50
+ x = x.permute(0, 2, 1)
51
+ for layer in self.layers:
52
+ x = layer(x)
53
+ x = x.permute(0, 2, 1)
54
+ x = x.reshape(x.shape[0], -1)
55
+ for layer in self.final_layers:
56
+ x = layer(x)
57
+ return x
58
+
59
+
60
+ class MLP(nn.Module):
61
+ def __init__(self,
62
+ start_dim: int,
63
+ hidden_dim: int,
64
+ final_dim: int
65
+ ) -> None:
66
+ super().__init__()
67
+
68
+ self.layer_norm = nn.LayerNorm(final_dim)
69
+ self.fc = nn.Linear(start_dim, hidden_dim)
70
+ self.fc2 = nn.Linear(hidden_dim, final_dim)
71
+ self.gelu = nn.GELU()
72
+
73
+ def forward(self, x):
74
+ residual = x
75
+ x = self.fc(x)
76
+ x = self.gelu(x)
77
+ x = self.fc2(x)
78
+ if x.shape[2] == residual.shape[2]:
79
+ x = x + residual
80
+ x = self.layer_norm(x)
81
+ x = self.gelu(x)
82
+ return x
83
+
models/tlob.py ADDED
@@ -0,0 +1,177 @@
1
+ from torch import nn
2
+ import torch
3
+ from einops import rearrange
4
+ import constants as cst
5
+ from models.bin import BiN
6
+ from models.mlplob import MLP
7
+ import numpy as np
8
+ import matplotlib.pyplot as plt
9
+ import seaborn as sns
10
+
11
+
12
+ class ComputeQKV(nn.Module):
13
+ def __init__(self, hidden_dim: int, num_heads: int):
14
+ super().__init__()
15
+ self.hidden_dim = hidden_dim
16
+ self.num_heads = num_heads
17
+ self.q = nn.Linear(hidden_dim, hidden_dim*num_heads)
18
+ self.k = nn.Linear(hidden_dim, hidden_dim*num_heads)
19
+ self.v = nn.Linear(hidden_dim, hidden_dim*num_heads)
20
+
21
+ def forward(self, x):
22
+ q = self.q(x)
23
+ k = self.k(x)
24
+ v = self.v(x)
25
+ return q, k, v
26
+
27
+
28
+ class TransformerLayer(nn.Module):
29
+ def __init__(self, hidden_dim: int, num_heads: int, final_dim: int):
30
+ super().__init__()
31
+ self.hidden_dim = hidden_dim
32
+ self.num_heads = num_heads
33
+ self.norm = nn.LayerNorm(hidden_dim)
34
+ self.qkv = ComputeQKV(hidden_dim, num_heads)
35
+ self.attention = nn.MultiheadAttention(hidden_dim*num_heads, num_heads, batch_first=True, device=cst.DEVICE)
36
+ self.mlp = MLP(hidden_dim, hidden_dim*4, final_dim)
37
+ self.w0 = nn.Linear(hidden_dim*num_heads, hidden_dim)
38
+
39
+ def forward(self, x):
40
+ res = x
41
+ q, k, v = self.qkv(x)
42
+ x, att = self.attention(q, k, v, average_attn_weights=False, need_weights=True)
43
+ x = self.w0(x)
44
+ x = x + res
45
+ x = self.norm(x)
46
+ x = self.mlp(x)
47
+ if x.shape[-1] == res.shape[-1]:
48
+ x = x + res
49
+ return x, att
50
+
51
+
52
+ class TLOB(nn.Module):
53
+ def __init__(self,
54
+ hidden_dim: int,
55
+ num_layers: int,
56
+ seq_size: int,
57
+ num_features: int,
58
+ num_heads: int,
59
+ is_sin_emb: bool,
60
+ dataset_type: str
61
+ ) -> None:
62
+ super().__init__()
63
+
64
+ self.hidden_dim = hidden_dim
65
+ self.num_layers = num_layers
66
+ self.is_sin_emb = is_sin_emb
67
+ self.seq_size = seq_size
68
+ self.num_heads = num_heads
69
+ self.dataset_type = dataset_type
70
+ self.layers = nn.ModuleList()
71
+ self.first_branch = nn.ModuleList()
72
+ self.second_branch = nn.ModuleList()
73
+ self.order_type_embedder = nn.Embedding(3, 1)
74
+ self.norm_layer = BiN(num_features, seq_size)
75
+ self.emb_layer = nn.Linear(num_features, hidden_dim)
76
+ if is_sin_emb:
77
+ self.pos_encoder = sinusoidal_positional_embedding(seq_size, hidden_dim)
78
+ else:
79
+ self.pos_encoder = nn.Parameter(torch.randn(1, seq_size, hidden_dim))
80
+
81
+ for i in range(num_layers):
82
+ if i != num_layers-1:
83
+ self.layers.append(TransformerLayer(hidden_dim, num_heads, hidden_dim))
84
+ self.layers.append(TransformerLayer(seq_size, num_heads, seq_size))
85
+ else:
86
+ self.layers.append(TransformerLayer(hidden_dim, num_heads, hidden_dim//4))
87
+ self.layers.append(TransformerLayer(seq_size, num_heads, seq_size//4))
88
+ self.att_temporal = []
89
+ self.att_feature = []
90
+ self.mean_att_distance_temporal = []
91
+ total_dim = (hidden_dim//4)*(seq_size//4)
92
+ self.final_layers = nn.ModuleList()
93
+ while total_dim > 128:
94
+ self.final_layers.append(nn.Linear(total_dim, total_dim//4))
95
+ self.final_layers.append(nn.GELU())
96
+ total_dim = total_dim//4
97
+ self.final_layers.append(nn.Linear(total_dim, 3))
98
+
99
+
100
+ def forward(self, input, store_att=False):
101
+ if self.dataset_type == "LOBSTER":
102
+ continuous_features = torch.cat([input[:, :, :41], input[:, :, 42:]], dim=2)
103
+ order_type = input[:, :, 41].long()
104
+ order_type_emb = self.order_type_embedder(order_type).detach()
105
+ x = torch.cat([continuous_features, order_type_emb], dim=2)
106
+ else:
107
+ x = input
108
+ x = rearrange(x, 'b s f -> b f s')
109
+ x = self.norm_layer(x)
110
+ x = rearrange(x, 'b f s -> b s f')
111
+ x = self.emb_layer(x)
112
+ x = x[:] + self.pos_encoder
113
+ mean_att_distance_temporal = np.zeros((self.num_layers, self.num_heads))
114
+ att_max_temporal = np.zeros((self.num_layers, 2, self.num_heads, self.seq_size))
115
+ att_max_feature = np.zeros((self.num_layers-1, 2, self.num_heads, self.hidden_dim))
116
+ att_temporal = np.zeros((self.num_layers, self.num_heads, self.seq_size, self.seq_size))
117
+ att_feature = np.zeros((self.num_layers-1, self.num_heads, self.hidden_dim, self.hidden_dim))
118
+ for i in range(len(self.layers)):
119
+ x, att = self.layers[i](x)
120
+ att = att.detach()
121
+ x = x.permute(0, 2, 1)
122
+ if store_att:
123
+ if i % 2 == 0:
124
+ att_temporal[i//2] = att[0].cpu().numpy()
125
+ values, indices = att[0].max(dim=2)
126
+ mean_att_distance_temporal[i//2] = compute_mean_att_distance(att[0])
127
+ att_max_temporal[i//2, 0] = indices.cpu().numpy()
128
+ att_max_temporal[i//2, 1] = values.cpu().numpy()
129
+ elif i % 2 == 1 and i != len(self.layers)-1:
130
+ att_feature[i//2] = att[0].cpu().numpy()
131
+ values, indices = att[0].max(dim=2)
132
+ att_max_feature[i//2, 0] = indices.cpu().numpy()
133
+ att_max_feature[i//2, 1] = values.cpu().numpy()
134
+ self.mean_att_distance_temporal.append(mean_att_distance_temporal)
135
+ if store_att:
136
+ self.att_temporal.append(att_max_temporal)
137
+ self.att_feature.append(att_max_feature)
138
+ x = rearrange(x, 'b s f -> b (f s) 1')
139
+ x = x.reshape(x.shape[0], -1)
140
+ for layer in self.final_layers:
141
+ x = layer(x)
142
+ return x, att_temporal, att_feature
143
+
144
+
145
+ def sinusoidal_positional_embedding(token_sequence_size, token_embedding_dim, n=10000.0):
146
+
147
+ if token_embedding_dim % 2 != 0:
148
+ raise ValueError("Sinusoidal positional embedding cannot apply to odd token embedding dim (got dim={:d})".format(token_embedding_dim))
149
+
150
+ T = token_sequence_size
151
+ d = token_embedding_dim
152
+
153
+ positions = torch.arange(0, T).unsqueeze_(1)
154
+ embeddings = torch.zeros(T, d)
155
+
156
+ denominators = torch.pow(n, 2*torch.arange(0, d//2)/d) # 10000^(2i/d_model), i is the index of embedding
157
+ embeddings[:, 0::2] = torch.sin(positions/denominators) # sin(pos/10000^(2i/d_model))
158
+ embeddings[:, 1::2] = torch.cos(positions/denominators) # cos(pos/10000^(2i/d_model))
159
+
160
+ return embeddings.to(cst.DEVICE, non_blocking=True)
161
+
162
+
163
+ def count_parameters(layer):
164
+ print(f"Number of parameters: {sum(p.numel() for p in layer.parameters() if p.requires_grad)}")
165
+
166
+
167
+ def compute_mean_att_distance(att):
168
+ att_distances = np.zeros((att.shape[0], att.shape[1]))
169
+ for h in range(att.shape[0]):
170
+ for key in range(att.shape[2]):
171
+ for query in range(att.shape[1]):
172
+ distance = abs(query-key)
173
+ att_distances[h, key] += torch.abs(att[h, query, key]).cpu().item()*distance
174
+ mean_distances = att_distances.mean(axis=1)
175
+ return mean_distances
176
+
177
+
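Note: the forward pass above alternates temporal attention (over the sequence axis) with feature attention (over the hidden axis), adds the positional encoding, and flattens into a small MLP head. A minimal sketch of the two helpers it relies on, assuming they are module-level functions in models/tlob.py (the missing self argument suggests they are); the sizes below are placeholders, not values from the repo's configs.

import torch
from models.tlob import sinusoidal_positional_embedding, compute_mean_att_distance

pe = sinusoidal_positional_embedding(128, 40)         # (seq_size, hidden_dim) table, moved to cst.DEVICE
print(pe.shape)                                       # torch.Size([128, 40])

att = torch.softmax(torch.randn(2, 16, 16), dim=-1)   # toy (heads, queries, keys) attention
print(compute_mean_att_distance(att))                 # one mean attention distance per head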
preprocessing/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (2.61 kB). View file
 
preprocessing/__pycache__/dataset.cpython-311.pyc ADDED
Binary file (4.18 kB). View file
 
preprocessing/__pycache__/fi_2010.cpython-310.pyc ADDED
Binary file (1.56 kB). View file
 
preprocessing/__pycache__/fi_2010.cpython-311.pyc ADDED
Binary file (3.37 kB). View file
 
preprocessing/__pycache__/lobster.cpython-310.pyc ADDED
Binary file (9.2 kB). View file
 
preprocessing/__pycache__/lobster.cpython-311.pyc ADDED
Binary file (22.3 kB). View file
 
preprocessing/dataset.py ADDED
@@ -0,0 +1,87 @@
1
+ import torch
2
+ from torch.utils import data
3
+ import pytorch_lightning as pl
4
+ from torch.utils.data import DataLoader
5
+ import numpy as np
6
+ import constants as cst
7
+ import time
9
+ from utils.utils_data import one_hot_encoding_type, tanh_encoding_type
10
+
11
+ class Dataset(data.Dataset):
12
+ """Characterizes a dataset for PyTorch"""
13
+ def __init__(self, x, y, seq_size):
14
+ """Initialization"""
15
+ self.seq_size = seq_size
16
+ self.length = y.shape[0]
17
+ self.x = x
18
+ self.y = y
19
+ if type(self.x) == np.ndarray:
20
+ self.x = torch.from_numpy(x).float()
21
+ if type(self.y) == np.ndarray:
22
+ self.y = torch.from_numpy(y).long()
23
+ self.data = self.x
24
+
25
+ def __len__(self):
26
+ """Denotes the total number of samples"""
27
+ return self.length
28
+
29
+ def __getitem__(self, i):
30
+ input = self.x[i:i+self.seq_size, :]
31
+ return input, self.y[i]
32
+
33
+
34
+
35
+
36
+
37
+ class DataModule(pl.LightningDataModule):
38
+ def __init__(self, train_set, val_set, batch_size, test_batch_size, is_shuffle_train=True, test_set=None, num_workers=16):
39
+ super().__init__()
40
+
41
+ self.train_set = train_set
42
+ self.val_set = val_set
43
+ self.test_set = test_set
44
+ self.batch_size = batch_size
45
+ self.test_batch_size = test_batch_size
46
+ self.is_shuffle_train = is_shuffle_train
47
+ if train_set.data.device.type != cst.DEVICE: #this is true only when we are using a GPU but the data is still on the CPU
48
+ self.pin_memory = True
49
+ else:
50
+ self.pin_memory = False
51
+ self.num_workers = num_workers
52
+
53
+ def train_dataloader(self):
54
+ return DataLoader(
55
+ dataset=self.train_set,
56
+ batch_size=self.batch_size,
57
+ shuffle=self.is_shuffle_train,
58
+ pin_memory=self.pin_memory,
59
+ drop_last=False,
60
+ num_workers=self.num_workers,
61
+ persistent_workers=True
62
+ )
63
+
64
+ def val_dataloader(self):
65
+ return DataLoader(
66
+ dataset=self.val_set,
67
+ batch_size=self.test_batch_size,
68
+ shuffle=False,
69
+ pin_memory=self.pin_memory,
70
+ drop_last=False,
71
+ num_workers=self.num_workers,
72
+ persistent_workers=True
73
+ )
74
+
75
+ def test_dataloader(self):
76
+ return DataLoader(
77
+ dataset=self.test_set,
78
+ batch_size=self.test_batch_size,
79
+ shuffle=False,
80
+ pin_memory=self.pin_memory,
81
+ drop_last=False,
82
+ num_workers=self.num_workers,
83
+ persistent_workers=True
84
+ )
85
+
86
+
87
+
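Note: Dataset returns a rolling window x[i:i+seq_size] together with the label y[i], so the callers are expected to pass labels already aligned to the window that ends at i+seq_size-1 (fi_2010_load does this by slicing labels[seq_size-1:]). A small sketch with random stand-in data, not the real FI-2010 or LOBSTER arrays:

import torch
from preprocessing.dataset import Dataset, DataModule

seq_size = 10
x = torch.randn(1000, 40)                        # (time steps, features)
y = torch.randint(0, 3, (1000 - seq_size + 1,))  # one label per window
train_set = Dataset(x, y, seq_size)
window, label = train_set[0]                     # window == x[0:seq_size]
print(window.shape, label)                       # torch.Size([10, 40]) tensor(...)

dm = DataModule(train_set, train_set, batch_size=32, test_batch_size=128, num_workers=2)
batch_x, batch_y = next(iter(dm.train_dataloader()))   # (32, 10, 40) and (32,)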
preprocessing/fi_2010.py ADDED
@@ -0,0 +1,53 @@
1
+ import numpy as np
2
+ import constants as cst
3
+ import os
4
+ from torch.utils import data
5
+ import torch
6
+
7
+
8
+ def fi_2010_load(path, seq_size, horizon, all_features):
9
+ dec_data = np.loadtxt(path + "/Train_Dst_NoAuction_ZScore_CF_7.txt")
10
+ full_train = dec_data[:, :int(dec_data.shape[1] * cst.SPLIT_RATES[0])]
11
+ full_val = dec_data[:, int(dec_data.shape[1] * cst.SPLIT_RATES[0]):]
12
+ dec_test1 = np.loadtxt(path + '/Test_Dst_NoAuction_ZScore_CF_7.txt')
13
+ dec_test2 = np.loadtxt(path + '/Test_Dst_NoAuction_ZScore_CF_8.txt')
14
+ dec_test3 = np.loadtxt(path + '/Test_Dst_NoAuction_ZScore_CF_9.txt')
15
+ full_test = np.hstack((dec_test1, dec_test2, dec_test3))
16
+
17
+ if horizon == 1:
18
+ tmp = 5
19
+ elif horizon == 2:
20
+ tmp = 4
21
+ elif horizon == 3:
22
+ tmp = 3
23
+ elif horizon == 5:
24
+ tmp = 2
25
+ elif horizon == 10:
26
+ tmp = 1
27
+ else:
28
+ raise ValueError("Horizon not found")
29
+
30
+ train_labels = full_train[-tmp, :].flatten()
31
+ val_labels = full_val[-tmp, :].flatten()
32
+ test_labels = full_test[-tmp, :].flatten()
33
+
34
+ train_labels = train_labels[seq_size-1:] - 1
35
+ val_labels = val_labels[seq_size-1:] - 1
36
+ test_labels = test_labels[seq_size-1:] - 1
37
+ if all_features:
38
+ train_input = full_train[:144, :].T
39
+ val_input = full_val[:144, :].T
40
+ test_input = full_test[:144, :].T
41
+ else:
42
+ train_input = full_train[:40, :].T
43
+ val_input = full_val[:40, :].T
44
+ test_input = full_test[:40, :].T
45
+ train_input = torch.from_numpy(train_input).float()
46
+ train_labels = torch.from_numpy(train_labels).long()
47
+ val_input = torch.from_numpy(val_input).float()
48
+ val_labels = torch.from_numpy(val_labels).long()
49
+ test_input = torch.from_numpy(test_input).float()
50
+ test_labels = torch.from_numpy(test_labels).long()
51
+ return train_input, train_labels, val_input, val_labels, test_input, test_labels
52
+
53
+
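Note: in the FI-2010 matrices the last five rows carry the labels for horizons 1, 2, 3, 5 and 10 (row -5 is horizon 1, row -1 is horizon 10), which is what the tmp mapping above selects before shifting the classes from {1, 2, 3} to {0, 1, 2}. A hedged loading sketch, assuming the benchmark text files already sit under data/FI_2010 as run.py expects:

import constants as cst
from preprocessing.fi_2010 import fi_2010_load
from preprocessing.dataset import Dataset, DataModule

seq_size, horizon = 128, 10
tr_x, tr_y, va_x, va_y, te_x, te_y = fi_2010_load(cst.DATA_DIR + "/FI_2010", seq_size, horizon, all_features=False)
dm = DataModule(
    train_set=Dataset(tr_x, tr_y, seq_size),
    val_set=Dataset(va_x, va_y, seq_size),
    test_set=Dataset(te_x, te_y, seq_size),
    batch_size=32, test_batch_size=128, num_workers=4,
)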
preprocessing/lobster.py ADDED
@@ -0,0 +1,324 @@
1
+ import os
2
+ from utils.utils_data import z_score_orderbook, normalize_messages, preprocess_data, one_hot_encoding_type
3
+ import pandas as pd
4
+ import numpy as np
5
+ import torch
6
+ import constants as cst
7
+ from torch.utils import data
8
+
9
+
10
+ def lobster_load(path, all_features, len_smooth, h, seq_size):
11
+ set = np.load(path)
12
+ if h == 10:
13
+ tmp = 5
14
+ elif h == 20:
15
+ tmp = 4
16
+ elif h == 50:
17
+ tmp = 3
18
+ elif h == 100:
19
+ tmp = 2
20
+ elif h == 200:
21
+ tmp = 1
22
+ labels = set[seq_size-len_smooth:, -tmp]
23
+ labels = labels[np.isfinite(labels)]
24
+ labels = torch.from_numpy(labels).long()
25
+ if all_features:
26
+ input = set[:, cst.LEN_ORDER:cst.LEN_ORDER + 40]
27
+ orders = set[:, :cst.LEN_ORDER]
28
+ input = torch.from_numpy(input).float()
29
+ orders = torch.from_numpy(orders).float()
30
+ input = torch.cat((input, orders), dim=1)
31
+ else:
32
+ input = set[:, cst.LEN_ORDER:cst.LEN_ORDER + 40]
33
+ input = torch.from_numpy(input).float()
34
+
35
+ return input, labels
36
+
37
+
38
+ def labeling(X, len, h, stock):
39
+ # X is the orderbook
40
+ # len is the time window smoothing length
41
+ # h is the prediction horizon
42
+ [N, D] = X.shape
43
+
44
+ if h < len:
45
+ len = h
46
+ # Calculate previous and future mid-prices for all relevant indices
47
+ previous_ask_prices = np.lib.stride_tricks.sliding_window_view(X[:, 0], window_shape=len)[:-h]
48
+ previous_bid_prices = np.lib.stride_tricks.sliding_window_view(X[:, 2], window_shape=len)[:-h]
49
+ future_ask_prices = np.lib.stride_tricks.sliding_window_view(X[:, 0], window_shape=len)[h:]
50
+ future_bid_prices = np.lib.stride_tricks.sliding_window_view(X[:, 2], window_shape=len)[h:]
51
+
52
+ previous_mid_prices = (previous_ask_prices + previous_bid_prices) / 2
53
+ future_mid_prices = (future_ask_prices + future_bid_prices) / 2
54
+
55
+ previous_mid_prices = np.mean(previous_mid_prices, axis=1)
56
+ future_mid_prices = np.mean(future_mid_prices, axis=1)
57
+
58
+ # Compute percentage change
59
+ percentage_change = (future_mid_prices - previous_mid_prices) / previous_mid_prices
60
+
61
+ # alpha is half the mean absolute percentage change of the mid-price, used as the up/down threshold
62
+ alpha = np.abs(percentage_change).mean() / 2
63
+
64
+ # alternative: alpha as the average spread of the stock, as a percentage of the mid-price
65
+ #alpha = (X[:, 0] - X[:, 2]).mean() / ((X[:, 0] + X[:, 2]) / 2).mean()
66
+
67
+ print(f"Alpha: {alpha}")
68
+ labels = np.where(percentage_change < -alpha, 2, np.where(percentage_change > alpha, 0, 1))
69
+ print(f"Number of labels: {np.unique(labels, return_counts=True)}")
70
+ print(f"Percentage of labels: {np.unique(labels, return_counts=True)[1] / labels.shape[0]}")
71
+ return labels
72
+
73
+
74
+ class LOBSTERDataBuilder:
75
+ def __init__(
76
+ self,
77
+ stocks,
78
+ data_dir,
79
+ date_trading_days,
80
+ split_rates,
81
+ sampling_type,
82
+ sampling_time,
83
+ sampling_quantity,
84
+ ):
85
+ self.n_lob_levels = cst.N_LOB_LEVELS
86
+ self.data_dir = data_dir
87
+ self.date_trading_days = date_trading_days
88
+ self.stocks = stocks
89
+ self.split_rates = split_rates
90
+
91
+ self.sampling_type = sampling_type
92
+ self.sampling_time = sampling_time
93
+ self.sampling_quantity = sampling_quantity
94
+
95
+
96
+ def prepare_save_datasets(self):
97
+ for i in range(len(self.stocks)):
98
+ stock = self.stocks[i]
99
+ path = "{}/{}/{}_{}_{}".format(
100
+ self.data_dir,
101
+ stock,
102
+ stock,
103
+ self.date_trading_days[0],
104
+ self.date_trading_days[1],
105
+ )
106
+ self.dataframes = []
107
+ self._prepare_dataframes(path, stock)
108
+
109
+ path_where_to_save = "{}/{}".format(
110
+ self.data_dir,
111
+ stock,
112
+ )
113
+
114
+ self.train_input = pd.concat(self.dataframes[0], axis=1).values
115
+ self.val_input = pd.concat(self.dataframes[1], axis=1).values
116
+ self.test_input = pd.concat(self.dataframes[2], axis=1).values
117
+ self.train_set = pd.concat([pd.DataFrame(self.train_input), pd.DataFrame(self.train_labels_horizons)], axis=1).values
118
+ self.val_set = pd.concat([pd.DataFrame(self.val_input), pd.DataFrame(self.val_labels_horizons)], axis=1).values
119
+ self.test_set = pd.concat([pd.DataFrame(self.test_input), pd.DataFrame(self.test_labels_horizons)], axis=1).values
120
+ self._save(path_where_to_save)
121
+
122
+
123
+ def _prepare_dataframes(self, path, stock):
124
+ COLUMNS_NAMES = {"orderbook": ["sell1", "vsell1", "buy1", "vbuy1",
125
+ "sell2", "vsell2", "buy2", "vbuy2",
126
+ "sell3", "vsell3", "buy3", "vbuy3",
127
+ "sell4", "vsell4", "buy4", "vbuy4",
128
+ "sell5", "vsell5", "buy5", "vbuy5",
129
+ "sell6", "vsell6", "buy6", "vbuy6",
130
+ "sell7", "vsell7", "buy7", "vbuy7",
131
+ "sell8", "vsell8", "buy8", "vbuy8",
132
+ "sell9", "vsell9", "buy9", "vbuy9",
133
+ "sell10", "vsell10", "buy10", "vbuy10"],
134
+ "message": ["time", "event_type", "order_id", "size", "price", "direction"]}
135
+ self.num_trading_days = len(os.listdir(path))//2
136
+ split_days = self._split_days()
137
+ split_days = [i * 2 for i in split_days]
138
+ self._create_dataframes_splitted(path, split_days, COLUMNS_NAMES)
139
+ # divide all prices, in both the LOB and the messages, by 10000 to express them in dollars
140
+ for i in range(len(self.dataframes)):
141
+ self.dataframes[i][0]["price"] = self.dataframes[i][0]["price"] / 10000
142
+ self.dataframes[i][1].loc[:, ::2] /= 10000
143
+ train_input = self.dataframes[0][1].values
144
+ val_input = self.dataframes[1][1].values
145
+ test_input = self.dataframes[2][1].values
146
+ #create a dataframe for the labels
147
+ for i in range(len(cst.LOBSTER_HORIZONS)):
148
+ if i == 0:
149
+ train_labels = labeling(train_input, cst.LEN_SMOOTH, cst.LOBSTER_HORIZONS[i], stock)
150
+ val_labels = labeling(val_input, cst.LEN_SMOOTH, cst.LOBSTER_HORIZONS[i], stock)
151
+ test_labels = labeling(test_input, cst.LEN_SMOOTH, cst.LOBSTER_HORIZONS[i], stock)
152
+ train_labels = np.concatenate([train_labels, np.full(shape=(train_input.shape[0] - train_labels.shape[0]), fill_value=np.inf)])
153
+ val_labels = np.concatenate([val_labels, np.full(shape=(val_input.shape[0] - val_labels.shape[0]), fill_value=np.inf)])
154
+ test_labels = np.concatenate([test_labels, np.full(shape=(test_input.shape[0] - test_labels.shape[0]), fill_value=np.inf)])
155
+ self.train_labels_horizons = pd.DataFrame(train_labels, columns=["label_h{}".format(cst.LOBSTER_HORIZONS[i])])
156
+ self.val_labels_horizons = pd.DataFrame(val_labels, columns=["label_h{}".format(cst.LOBSTER_HORIZONS[i])])
157
+ self.test_labels_horizons = pd.DataFrame(test_labels, columns=["label_h{}".format(cst.LOBSTER_HORIZONS[i])])
158
+ else:
159
+ train_labels = labeling(train_input, cst.LEN_SMOOTH, cst.LOBSTER_HORIZONS[i], stock)
160
+ val_labels = labeling(val_input, cst.LEN_SMOOTH, cst.LOBSTER_HORIZONS[i], stock)
161
+ test_labels = labeling(test_input, cst.LEN_SMOOTH, cst.LOBSTER_HORIZONS[i], stock)
162
+ train_labels = np.concatenate([train_labels, np.full(shape=(train_input.shape[0] - train_labels.shape[0]), fill_value=np.inf)])
163
+ val_labels = np.concatenate([val_labels, np.full(shape=(val_input.shape[0] - val_labels.shape[0]), fill_value=np.inf)])
164
+ test_labels = np.concatenate([test_labels, np.full(shape=(test_input.shape[0] - test_labels.shape[0]), fill_value=np.inf)])
165
+ self.train_labels_horizons["label_h{}".format(cst.LOBSTER_HORIZONS[i])] = train_labels
166
+ self.val_labels_horizons["label_h{}".format(cst.LOBSTER_HORIZONS[i])] = val_labels
167
+ self.test_labels_horizons["label_h{}".format(cst.LOBSTER_HORIZONS[i])] = test_labels
168
+
169
+ #self._sparse_representation()
170
+
171
+ # to conclude the preprocessing we normalize the dataframes
172
+ self._normalize_dataframes()
173
+
174
+
175
+ def _sparse_representation(self):
176
+ tick_size = 0.01
177
+ for i in range(len(self.dataframes)):
178
+ dense_repr = self.dataframes[i][1].values
179
+ sparse_repr = np.zeros((dense_repr.shape[0], dense_repr.shape[1] + 1))
180
+ for row in range(dense_repr.shape[0]):
181
+ sparse_pos_ask = 0
182
+ sparse_pos_bid = 0
183
+ mid_price = (dense_repr[row][0] + dense_repr[row][2]) / 2
184
+ sparse_repr[row][-1] = mid_price
185
+ for col in range(0, dense_repr.shape[1], 2):
186
+ if col == 0:
187
+ start_ask = dense_repr[row][col]
188
+ elif col == 2:
189
+ start_bid = dense_repr[row][col]
190
+ elif col % 4 == 0:
191
+ if sparse_pos_ask < (sparse_repr.shape[1] - 1) / 2:
192
+ actual_ask = dense_repr[row][col]
193
+ for level in range(0, actual_ask-start_ask, -tick_size):
194
+ if sparse_pos_ask < (sparse_repr.shape[1] - 1) / 2:
195
+ if level == actual_ask - start_ask - tick_size:
196
+ sparse_repr[row][sparse_pos_ask] = dense_repr[row][col+1]
197
+ else:
198
+ sparse_repr[row][sparse_pos_ask] = 0
199
+ sparse_pos_ask += 1
200
+ else:
201
+ break
202
+ start_ask = actual_ask
203
+ else:
204
+ continue
205
+ elif col % 4 == 2:
206
+ if sparse_pos_bid < (sparse_repr.shape[1] - 1) / 2:
207
+ actual_bid = dense_repr[row][col]
208
+ for level in range(0, start_bid-actual_bid, -tick_size):
209
+ if sparse_pos_bid < (sparse_repr.shape[1] - 1) / 2:
210
+ if level == start_bid - actual_bid - tick_size:
211
+ sparse_repr[row][sparse_pos_bid] = dense_repr[row][col+1]
212
+ else:
213
+ sparse_repr[row][sparse_pos_bid] = 0
214
+ sparse_pos_bid += 1
215
+ else:
216
+ break
217
+ start_bid = actual_bid
218
+ else:
219
+ continue
220
+
221
+
222
+ def _create_dataframes_splitted(self, path, split_days, COLUMNS_NAMES):
223
+ # iterate over files in the data directory of self.STOCK_NAME
224
+ total_shape = 0
225
+ for i, filename in enumerate(sorted(os.listdir(path))):
226
+ f = os.path.join(path, filename)
227
+ print(f)
228
+ if os.path.isfile(f):
229
+ # then we create the df for the training set
230
+ if i < split_days[0]:
231
+ if (i % 2) == 0:
232
+ if i == 0:
233
+ train_messages = pd.read_csv(f, names=COLUMNS_NAMES["message"])
234
+ else:
235
+ train_message = pd.read_csv(f, names=COLUMNS_NAMES["message"])
236
+
237
+ else:
238
+ if i == 1:
239
+ train_orderbooks = pd.read_csv(f, names=COLUMNS_NAMES["orderbook"])
240
+ total_shape += train_orderbooks.shape[0]
241
+ train_orderbooks, train_messages = preprocess_data([train_messages, train_orderbooks], self.n_lob_levels, self.sampling_type, self.sampling_time, self.sampling_quantity)
242
+ if (len(train_orderbooks) != len(train_messages)):
243
+ raise ValueError("train_orderbook length is different than train_messages")
244
+ else:
245
+ train_orderbook = pd.read_csv(f, names=COLUMNS_NAMES["orderbook"])
246
+ total_shape += train_orderbook.shape[0]
247
+ train_orderbook, train_message = preprocess_data([train_message, train_orderbook], self.n_lob_levels, self.sampling_type, self.sampling_time, self.sampling_quantity)
248
+ train_messages = pd.concat([train_messages, train_message], axis=0)
249
+ train_orderbooks = pd.concat([train_orderbooks, train_orderbook], axis=0)
250
+
251
+ elif split_days[0] <= i < split_days[1]: # then we are creating the df for the validation set
252
+ if (i % 2) == 0:
253
+ if (i == split_days[0]):
254
+ self.dataframes.append([train_messages, train_orderbooks])
255
+ val_messages = pd.read_csv(f, names=COLUMNS_NAMES["message"])
256
+ else:
257
+ val_message = pd.read_csv(f, names=COLUMNS_NAMES["message"])
258
+ else:
259
+ if i == split_days[0] + 1:
260
+ val_orderbooks = pd.read_csv(f, names=COLUMNS_NAMES["orderbook"])
261
+ total_shape += val_orderbooks.shape[0]
262
+ val_orderbooks, val_messages = preprocess_data([val_messages, val_orderbooks], self.n_lob_levels, self.sampling_type, self.sampling_time, self.sampling_quantity)
263
+ if (len(val_orderbooks) != len(val_messages)):
264
+ raise ValueError("val_orderbook length is different than val_messages")
265
+ else:
266
+ val_orderbook = pd.read_csv(f, names=COLUMNS_NAMES["orderbook"])
267
+ total_shape += val_orderbook.shape[0]
268
+ val_orderbook, val_message = preprocess_data([val_message, val_orderbook], self.n_lob_levels, self.sampling_type, self.sampling_time, self.sampling_quantity)
269
+ val_messages = pd.concat([val_messages, val_message], axis=0)
270
+ val_orderbooks = pd.concat([val_orderbooks, val_orderbook], axis=0)
271
+
272
+ else: # then we are creating the df for the test set
273
+
274
+ if (i % 2) == 0:
275
+ if (i == split_days[1]):
276
+ self.dataframes.append([val_messages, val_orderbooks])
277
+ test_messages = pd.read_csv(f, names=COLUMNS_NAMES["message"])
278
+ else:
279
+ test_message = pd.read_csv(f, names=COLUMNS_NAMES["message"])
280
+
281
+ else:
282
+ if i == split_days[1] + 1:
283
+ test_orderbooks = pd.read_csv(f, names=COLUMNS_NAMES["orderbook"])
284
+ test_orderbooks, test_messages = preprocess_data([test_messages, test_orderbooks], self.n_lob_levels, self.sampling_type, self.sampling_time, self.sampling_quantity)
285
+ if (len(test_orderbooks) != len(test_messages)):
286
+ raise ValueError("test_orderbook length is different than test_messages")
287
+ else:
288
+ test_orderbook = pd.read_csv(f, names=COLUMNS_NAMES["orderbook"])
289
+ test_orderbook, test_message = preprocess_data([test_message, test_orderbook], self.n_lob_levels, self.sampling_type, self.sampling_time, self.sampling_quantity)
290
+ test_messages = pd.concat([test_messages, test_message], axis=0)
291
+ test_orderbooks = pd.concat([test_orderbooks, test_orderbook], axis=0)
292
+ else:
293
+ raise ValueError("File {} is not a file".format(f))
294
+ self.dataframes.append([test_messages, test_orderbooks])
295
+ print(f"Total shape of the orderbooks is {total_shape}")
296
+
297
+
298
+ def _normalize_dataframes(self):
299
+ #apply z score to orderbooks
300
+ for i in range(len(self.dataframes)):
301
+ if (i == 0):
302
+ self.dataframes[i][1], mean_size, mean_prices, std_size, std_prices = z_score_orderbook(self.dataframes[i][1])
303
+ else:
304
+ self.dataframes[i][1], _, _, _, _ = z_score_orderbook(self.dataframes[i][1], mean_size, mean_prices, std_size, std_prices)
305
+
306
+ #apply z-score to size and prices of messages with the statistics of the train set
307
+ for i in range(len(self.dataframes)):
308
+ if (i == 0):
309
+ self.dataframes[i][0], mean_size, mean_prices, std_size, std_prices, mean_time, std_time, mean_depth, std_depth = normalize_messages(self.dataframes[i][0])
310
+ else:
311
+ self.dataframes[i][0], _, _, _, _, _, _, _, _ = normalize_messages(self.dataframes[i][0], mean_size, mean_prices, std_size, std_prices, mean_time, std_time, mean_depth, std_depth)
312
+
313
+ def _save(self, path_where_to_save):
314
+ np.save(path_where_to_save + "/train.npy", self.train_set)
315
+ np.save(path_where_to_save + "/val.npy", self.val_set)
316
+ np.save(path_where_to_save + "/test.npy", self.test_set)
317
+
318
+
319
+ def _split_days(self):
320
+ train = int(self.num_trading_days * self.split_rates[0])
321
+ val = int(self.num_trading_days * self.split_rates[1]) + train
322
+ test = int(self.num_trading_days * self.split_rates[2]) + val
323
+ print(f"There are {train} days for training, {val - train} days for validation and {test - val} days for testing")
324
+ return [train, val, test]
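Note: the labeling above compares the smoothed mid-price over the next h events with the smoothed mid-price over the previous window and thresholds the percentage change at alpha (half the mean absolute change), mapping up/stationary/down to 0/1/2. A toy example with made-up numbers and a hand-picked alpha:

import numpy as np

previous_mid = np.array([100.00, 100.00, 100.00])
future_mid   = np.array([100.12, 100.00,  99.85])
pct_change   = (future_mid - previous_mid) / previous_mid
alpha = 0.001   # hand-picked here; in labeling() it is half the mean absolute change
labels = np.where(pct_change < -alpha, 2, np.where(pct_change > alpha, 0, 1))
print(labels)   # [0 1 2] -> up, stationary, down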
requirements.txt ADDED
@@ -0,0 +1,19 @@
1
+ einops
2
+ hydra-core
3
+ lightning
4
+ lion_pytorch
5
+ matplotlib
6
+ numpy
7
+ omegaconf
8
+ pandas
9
+ pytorch_lightning
10
+ Requests
11
+ scikit_learn
12
+ scipy
13
+ seaborn
14
+ torch
15
+ torch_ema
16
+ torchvision
17
+ transformers
18
+ wandb
19
+
run.py ADDED
@@ -0,0 +1,434 @@
1
+ import lightning as L
2
+ import omegaconf
3
+ import torch
4
+ from lightning.pytorch.loggers import WandbLogger
5
+ import wandb
6
+ from torch.utils.data import DataLoader
7
+ from lightning.pytorch.callbacks import TQDMProgressBar
8
+ from lightning.pytorch.callbacks.early_stopping import EarlyStopping
9
+ from config.config import Config
10
+ from models.engine import Engine
11
+ from preprocessing.fi_2010 import fi_2010_load
12
+ from preprocessing.lobster import lobster_load
13
+ from preprocessing.dataset import Dataset, DataModule
14
+ import constants as cst
15
+
16
+
17
+ def run(config: Config, accelerator, model=None):
18
+ run_name = ""
19
+ for param in config.model.keys():
20
+ value = config.model[param]
21
+ if param == "hyperparameters_sweep":
22
+ continue
23
+ if type(value) == omegaconf.dictconfig.DictConfig:
24
+ for key in value.keys():
25
+ run_name += str(key[:2]) + "_" + str(value[key]) + "_"
26
+ else:
27
+ run_name += str(param[:2]) + "_" + str(value.value) + "_"
28
+ run_name += f"seed_{config.experiment.seed}"
29
+ seq_size = config.model.hyperparameters_fixed["seq_size"]
30
+ horizon = config.experiment.horizon
31
+ training_stocks = config.experiment.training_stocks
32
+ dataset = config.experiment.dataset_type.value
33
+ if dataset == "LOBSTER":
34
+ config.experiment.filename_ckpt = f"{dataset}_{training_stocks}_seq_size_{seq_size}_horizon_{horizon}_{run_name}"
35
+ else:
36
+ config.experiment.filename_ckpt = f"{dataset}_seq_size_{seq_size}_horizon_{horizon}_{run_name}"
37
+ run_name = config.experiment.filename_ckpt
38
+
39
+ trainer = L.Trainer(
40
+ accelerator=accelerator,
41
+ precision=cst.PRECISION,
42
+ max_epochs=config.experiment.max_epochs,
43
+ callbacks=[
44
+ EarlyStopping(monitor="val_loss", mode="min", patience=2, verbose=True, min_delta=0.002),
45
+ TQDMProgressBar(refresh_rate=100)
46
+ ],
47
+ num_sanity_val_steps=0,
48
+ detect_anomaly=False,
49
+ profiler=None,
50
+ check_val_every_n_epoch=1
51
+ )
52
+ train(config, trainer)
53
+
54
+
55
+ def train(config: Config, trainer: L.Trainer, run=None):
56
+ print_setup(config)
57
+ dataset_type = config.experiment.dataset_type.value
58
+ seq_size = config.model.hyperparameters_fixed["seq_size"]
59
+ horizon = config.experiment.horizon
60
+ model_type = config.model.type
61
+ training_stocks = config.experiment.training_stocks
62
+ testing_stocks = config.experiment.testing_stocks
63
+ dataset_type = config.experiment.dataset_type.value
64
+ if dataset_type == "FI-2010":
65
+ path = cst.DATA_DIR + "/FI_2010"
66
+ train_input, train_labels, val_input, val_labels, test_input, test_labels = fi_2010_load(path, seq_size, horizon, config.model.hyperparameters_fixed["all_features"])
67
+ data_module = DataModule(
68
+ train_set=Dataset(train_input, train_labels, seq_size),
69
+ val_set=Dataset(val_input, val_labels, seq_size),
70
+ test_set=Dataset(test_input, test_labels, seq_size),
71
+ batch_size=config.experiment.batch_size,
72
+ test_batch_size=config.experiment.batch_size*4,
73
+ num_workers=4
74
+ )
75
+ test_loaders = [data_module.test_dataloader()]
76
+ else:
77
+ for i in range(len(training_stocks)):
78
+ if i == 0:
79
+ for j in range(2):
80
+ if j == 0:
81
+ path = cst.DATA_DIR + "/" + training_stocks[i] + "/train.npy"
82
+ train_input, train_labels = lobster_load(path, config.model.hyperparameters_fixed["all_features"], cst.LEN_SMOOTH, horizon, seq_size)
83
+ if j == 1:
84
+ path = cst.DATA_DIR + "/" + training_stocks[i] + "/val.npy"
85
+ val_input, val_labels = lobster_load(path, config.model.hyperparameters_fixed["all_features"], cst.LEN_SMOOTH, horizon, seq_size)
86
+ else:
87
+ for j in range(2):
88
+ if j == 0:
89
+ path = cst.DATA_DIR + "/" + training_stocks[i] + "/train.npy"
90
+ train_labels = torch.cat((train_labels, torch.zeros(seq_size+horizon-1, dtype=torch.long)), 0)
91
+ train_input_tmp, train_labels_tmp = lobster_load(path, config.model.hyperparameters_fixed["all_features"], cst.LEN_SMOOTH, horizon, seq_size)
92
+ train_input = torch.cat((train_input, train_input_tmp), 0)
93
+ train_labels = torch.cat((train_labels, train_labels_tmp), 0)
94
+ if j == 1:
95
+ path = cst.DATA_DIR + "/" + training_stocks[i] + "/val.npy"
96
+ val_labels = torch.cat((val_labels, torch.zeros(seq_size+horizon-1, dtype=torch.long)), 0)
97
+ val_input_tmp, val_labels_tmp = lobster_load(path, config.model.hyperparameters_fixed["all_features"], cst.LEN_SMOOTH, horizon, seq_size)
98
+ val_input = torch.cat((val_input, val_input_tmp), 0)
99
+ val_labels = torch.cat((val_labels, val_labels_tmp), 0)
100
+ test_loaders = []
101
+ for i in range(len(testing_stocks)):
102
+ path = cst.DATA_DIR + "/" + testing_stocks[i] + "/test.npy"
103
+ test_input, test_labels = lobster_load(path, config.model.hyperparameters_fixed["all_features"], cst.LEN_SMOOTH, horizon, seq_size)
104
+ test_set = Dataset(test_input, test_labels, seq_size)
105
+ test_dataloader = DataLoader(
106
+ dataset=test_set,
107
+ batch_size=config.experiment.batch_size*4,
108
+ shuffle=False,
109
+ pin_memory=True,
110
+ drop_last=False,
111
+ num_workers=4,
112
+ persistent_workers=True
113
+ )
114
+ test_loaders.append(test_dataloader)
115
+ train_set = Dataset(train_input, train_labels, seq_size)
116
+ val_set = Dataset(val_input, val_labels, seq_size)
117
+ counts_train = torch.unique(train_labels, return_counts=True)
118
+ counts_val = torch.unique(val_labels, return_counts=True)
119
+ print("Train set shape: ", train_input.shape)
120
+ print("Val set shape: ", val_input.shape)
121
+ print("Classes counts in train set: ", counts_train[1])
122
+ print("Classes counts in val set: ", counts_val[1])
123
+ print(f"Classes distribution in train set: up {counts_train[1][0]/train_labels.shape[0]} stat {counts_train[1][1]/train_labels.shape[0]} down {counts_train[1][2]/train_labels.shape[0]} ", )
124
+ print(f"Classes distribution in val set: up {counts_val[1][0]/val_labels.shape[0]} stat {counts_val[1][1]/val_labels.shape[0]} down {counts_val[1][2]/val_labels.shape[0]} ", )
125
+ data_module = DataModule(
126
+ train_set=train_set,
127
+ val_set=val_set,
128
+ batch_size=config.experiment.batch_size,
129
+ test_batch_size=config.experiment.batch_size*4,
130
+ num_workers=4
131
+ )
132
+
133
+ experiment_type = config.experiment.type
134
+ if "FINETUNING" in experiment_type or "EVALUATION" in experiment_type:
135
+ checkpoint = torch.load(config.experiment.checkpoint_reference, map_location=cst.DEVICE)
136
+ print("Loading model from checkpoint: ", config.experiment.checkpoint_reference)
137
+ lr = checkpoint["hyper_parameters"]["lr"]
138
+ filename_ckpt = checkpoint["hyper_parameters"]["filename_ckpt"]
139
+ hidden_dim = checkpoint["hyper_parameters"]["hidden_dim"]
140
+ num_layers = checkpoint["hyper_parameters"]["num_layers"]
141
+ optimizer = checkpoint["hyper_parameters"]["optimizer"]
142
+ model_type = checkpoint["hyper_parameters"]["model_type"]#.value
143
+ max_epochs = checkpoint["hyper_parameters"]["max_epochs"]
144
+ horizon = checkpoint["hyper_parameters"]["horizon"]
145
+ seq_size = checkpoint["hyper_parameters"]["seq_size"]
146
+ if model_type == "MLPLOB":
147
+ model = Engine.load_from_checkpoint(
148
+ config.experiment.checkpoint_reference,
149
+ seq_size=seq_size,
150
+ horizon=horizon,
151
+ max_epochs=max_epochs,
152
+ model_type=model_type,
153
+ is_wandb=config.experiment.is_wandb,
154
+ experiment_type=experiment_type,
155
+ lr=lr,
156
+ optimizer=optimizer,
157
+ filename_ckpt=filename_ckpt,
158
+ hidden_dim=hidden_dim,
159
+ num_layers=num_layers,
160
+ num_features=train_input.shape[1],
161
+ dataset_type=dataset_type,
162
+ map_location=cst.DEVICE,
163
+ )
164
+ elif model_type == "TLOB":
165
+ model = Engine.load_from_checkpoint(
166
+ config.experiment.checkpoint_reference,
167
+ seq_size=seq_size,
168
+ horizon=horizon,
169
+ max_epochs=max_epochs,
170
+ model_type=model_type,
171
+ is_wandb=config.experiment.is_wandb,
172
+ experiment_type=experiment_type,
173
+ lr=lr,
174
+ optimizer=optimizer,
175
+ filename_ckpt=filename_ckpt,
176
+ hidden_dim=hidden_dim,
177
+ num_layers=num_layers,
178
+ num_features=train_input.shape[1],
179
+ dataset_type=dataset_type,
180
+ num_heads=checkpoint["hyper_parameters"]["num_heads"],
181
+ is_sin_emb=checkpoint["hyper_parameters"]["is_sin_emb"],
182
+ map_location=cst.DEVICE,
183
+ len_test_dataloader=len(test_loaders[0])
184
+ )
185
+ elif model_type == "BINCTABL":
186
+ model = Engine.load_from_checkpoint(
187
+ config.experiment.checkpoint_reference,
188
+ seq_size=seq_size,
189
+ horizon=horizon,
190
+ max_epochs=max_epochs,
191
+ model_type=model_type,
192
+ is_wandb=config.experiment.is_wandb,
193
+ experiment_type=experiment_type,
194
+ lr=lr,
195
+ optimizer=optimizer,
196
+ filename_ckpt=filename_ckpt,
197
+ num_features=train_input.shape[1],
198
+ dataset_type=dataset_type,
199
+ map_location=cst.DEVICE,
200
+ len_test_dataloader=len(test_loaders[0])
201
+ )
202
+ elif model_type == "DEEPLOB":
203
+ model = Engine.load_from_checkpoint(
204
+ config.experiment.checkpoint_reference,
205
+ seq_size=seq_size,
206
+ horizon=horizon,
207
+ max_epochs=max_epochs,
208
+ model_type=model_type,
209
+ is_wandb=config.experiment.is_wandb,
210
+ experiment_type=experiment_type,
211
+ lr=lr,
212
+ optimizer=optimizer,
213
+ filename_ckpt=filename_ckpt,
214
+ num_features=train_input.shape[1],
215
+ dataset_type=dataset_type,
216
+ map_location=cst.DEVICE,
217
+ len_test_dataloader=len(test_loaders[0])
218
+ )
219
+
220
+ else:
221
+ if model_type == cst.ModelType.MLPLOB:
222
+ model = Engine(
223
+ seq_size=seq_size,
224
+ horizon=horizon,
225
+ max_epochs=config.experiment.max_epochs,
226
+ model_type=config.model.type.value,
227
+ is_wandb=config.experiment.is_wandb,
228
+ experiment_type=experiment_type,
229
+ lr=config.model.hyperparameters_fixed["lr"],
230
+ optimizer=config.experiment.optimizer,
231
+ filename_ckpt=config.experiment.filename_ckpt,
232
+ hidden_dim=config.model.hyperparameters_fixed["hidden_dim"],
233
+ num_layers=config.model.hyperparameters_fixed["num_layers"],
234
+ num_features=train_input.shape[1],
235
+ dataset_type=dataset_type,
236
+ len_test_dataloader=len(test_loaders[0])
237
+ )
238
+ elif model_type == cst.ModelType.TLOB:
239
+ model = Engine(
240
+ seq_size=seq_size,
241
+ horizon=horizon,
242
+ max_epochs=config.experiment.max_epochs,
243
+ model_type=config.model.type.value,
244
+ is_wandb=config.experiment.is_wandb,
245
+ experiment_type=experiment_type,
246
+ lr=config.model.hyperparameters_fixed["lr"],
247
+ optimizer=config.experiment.optimizer,
248
+ filename_ckpt=config.experiment.filename_ckpt,
249
+ hidden_dim=config.model.hyperparameters_fixed["hidden_dim"],
250
+ num_layers=config.model.hyperparameters_fixed["num_layers"],
251
+ num_features=train_input.shape[1],
252
+ dataset_type=dataset_type,
253
+ num_heads=config.model.hyperparameters_fixed["num_heads"],
254
+ is_sin_emb=config.model.hyperparameters_fixed["is_sin_emb"],
255
+ len_test_dataloader=len(test_loaders[0])
256
+ )
257
+ elif model_type == cst.ModelType.BINCTABL:
258
+ model = Engine(
259
+ seq_size=seq_size,
260
+ horizon=horizon,
261
+ max_epochs=config.experiment.max_epochs,
262
+ model_type=config.model.type.value,
263
+ is_wandb=config.experiment.is_wandb,
264
+ experiment_type=experiment_type,
265
+ lr=config.model.hyperparameters_fixed["lr"],
266
+ optimizer=config.experiment.optimizer,
267
+ filename_ckpt=config.experiment.filename_ckpt,
268
+ num_features=train_input.shape[1],
269
+ dataset_type=dataset_type,
270
+ len_test_dataloader=len(test_loaders[0])
271
+ )
272
+ elif model_type == cst.ModelType.DEEPLOB:
273
+ model = Engine(
274
+ seq_size=seq_size,
275
+ horizon=horizon,
276
+ max_epochs=config.experiment.max_epochs,
277
+ model_type=config.model.type.value,
278
+ is_wandb=config.experiment.is_wandb,
279
+ experiment_type=experiment_type,
280
+ lr=config.model.hyperparameters_fixed["lr"],
281
+ optimizer=config.experiment.optimizer,
282
+ filename_ckpt=config.experiment.filename_ckpt,
283
+ num_features=train_input.shape[1],
284
+ dataset_type=dataset_type,
285
+ len_test_dataloader=len(test_loaders[0])
286
+ )
287
+
288
+ print("total number of parameters: ", sum(p.numel() for p in model.parameters()))
289
+ train_dataloader, val_dataloader = data_module.train_dataloader(), data_module.val_dataloader()
290
+
291
+ if "TRAINING" in experiment_type or "FINETUNING" in experiment_type:
292
+ trainer.fit(model, train_dataloader, val_dataloader)
293
+ best_model_path = model.last_path_ckpt
294
+ print("Best model path: ", best_model_path)
295
+ try:
296
+ best_model = Engine.load_from_checkpoint(best_model_path, map_location=cst.DEVICE)
297
+ except Exception:
298
+ print("no checkpoint has been saved, selecting the last model")
299
+ best_model = model
300
+ best_model.experiment_type = "EVALUATION"
301
+ for i in range(len(test_loaders)):
302
+ test_dataloader = test_loaders[i]
303
+ output = trainer.test(best_model, test_dataloader)
304
+ if run is not None and dataset_type == "LOBSTER":
305
+ run.log({f"f1 {testing_stocks[i]} best": output[0]["f1_score"]}, commit=False)
306
+ elif run is not None and dataset_type == "FI-2010":
307
+ run.log({f"f1 FI-2010 ": output[0]["f1_score"]}, commit=False)
308
+ else:
309
+ for i in range(len(test_loaders)):
310
+ test_dataloader = test_loaders[i]
311
+ output = trainer.test(model, test_dataloader)
312
+ if run is not None and dataset_type == "LOBSTER":
313
+ run.log({f"f1 {testing_stocks[i]} best": output[0]["f1_score"]}, commit=False)
314
+ elif run is not None and dataset_type == "FI-2010":
315
+ run.log({f"f1 FI-2010 ": output[0]["f1_score"]}, commit=False)
316
+
317
+
318
+
319
+ def run_wandb(config: Config, accelerator):
320
+ def wandb_sweep_callback():
321
+ wandb_logger = WandbLogger(project=cst.PROJECT_NAME, log_model=False, save_dir=cst.DIR_SAVED_MODEL)
322
+ run_name = None
323
+ if not config.experiment.is_sweep:
324
+ run_name = ""
325
+ for param in config.model.keys():
326
+ value = config.model[param]
327
+ if param == "hyperparameters_sweep":
328
+ continue
329
+ if type(value) == omegaconf.dictconfig.DictConfig:
330
+ for key in value.keys():
331
+ run_name += str(key[:2]) + "_" + str(value[key]) + "_"
332
+ else:
333
+ run_name += str(param[:2]) + "_" + str(value.value) + "_"
334
+
335
+ run = wandb.init(project=cst.PROJECT_NAME, name=run_name, entity="") # set entity to your wandb username
336
+
337
+ if config.experiment.is_sweep:
338
+ model_params = run.config
339
+ else:
340
+ model_params = config.model.hyperparameters_fixed
341
+ wandb_instance_name = ""
342
+ for param in config.model.hyperparameters_fixed.keys():
343
+ if param in model_params:
344
+ config.model.hyperparameters_fixed[param] = model_params[param]
345
+ wandb_instance_name += str(param) + "_" + str(model_params[param]) + "_"
346
+
347
+ #wandb_instance_name += f"seed_{cst.SEED}"
348
+
349
+ run.name = wandb_instance_name
350
+ seq_size = config.model.hyperparameters_fixed["seq_size"]
351
+ horizon = config.experiment.horizon
352
+ dataset = config.experiment.dataset_type.value
353
+ training_stocks = config.experiment.training_stocks
354
+ if dataset == "LOBSTER":
355
+ config.experiment.filename_ckpt = f"{dataset}_{training_stocks}_seq_size_{seq_size}_horizon_{horizon}_{run_name}"
356
+ else:
357
+ config.experiment.filename_ckpt = f"{dataset}_seq_size_{seq_size}_horizon_{horizon}_{run_name}"
358
+ wandb_instance_name = config.experiment.filename_ckpt
359
+
360
+ trainer = L.Trainer(
361
+ accelerator=accelerator,
362
+ precision=cst.PRECISION,
363
+ max_epochs=config.experiment.max_epochs,
364
+ callbacks=[
365
+ EarlyStopping(monitor="val_loss", mode="min", patience=2, verbose=True, min_delta=0.002),
366
+ TQDMProgressBar(refresh_rate=1000)
367
+ ],
368
+ num_sanity_val_steps=0,
369
+ logger=wandb_logger,
370
+ detect_anomaly=False,
371
+ check_val_every_n_epoch=1,
372
+ )
373
+
374
+ # log simulation details in WANDB console
375
+ run.log({"model": config.model.type.value}, commit=False)
376
+ run.log({"dataset": config.experiment.dataset_type.value}, commit=False)
377
+ run.log({"seed": config.experiment.seed}, commit=False)
378
+ run.log({"all_features": config.model.hyperparameters_fixed["all_features"]}, commit=False)
379
+ if config.experiment.dataset_type == cst.Dataset.LOBSTER:
380
+ for i in range(len(config.experiment.training_stocks)):
381
+ run.log({f"training stock{i}": config.experiment.training_stocks[i]}, commit=False)
382
+ for i in range(len(config.experiment.testing_stocks)):
383
+ run.log({f"testing stock{i}": config.experiment.testing_stocks[i]}, commit=False)
384
+ run.log({"sampling_type": config.experiment.sampling_type}, commit=False)
385
+ if config.experiment.sampling_type == "time":
386
+ run.log({"sampling_time": config.experiment.sampling_time}, commit=False)
387
+ else:
388
+ run.log({"sampling_quantity": config.experiment.sampling_quantity}, commit=False)
389
+ train(config, trainer, run)
390
+ run.finish()
391
+
392
+ return wandb_sweep_callback
393
+
394
+
395
+ def sweep_init(config: Config):
396
+ # put your wandb key here
397
+ wandb.login()
398
+ parameters = {}
399
+ for key in config.model.hyperparameters_sweep.keys():
400
+ parameters[key] = {'values': list(config.model.hyperparameters_sweep[key])}
401
+ sweep_config = {
402
+ 'method': 'grid',
403
+ 'metric': {
404
+ 'goal': 'minimize',
405
+ 'name': 'val_loss'
406
+ },
407
+ 'early_terminate': {
408
+ 'type': 'hyperband',
409
+ 'min_iter': 3,
410
+ 'eta': 1.5
411
+ },
412
+ 'run_cap': 100,
413
+ 'parameters': {**parameters}
414
+ }
415
+ return sweep_config
416
+
417
+
418
+ def print_setup(config: Config):
419
+ print("Model type: ", config.model.type)
420
+ print("Dataset: ", config.experiment.dataset_type)
421
+ print("Seed: ", config.experiment.seed)
422
+ print("Sequence size: ", config.model.hyperparameters_fixed["seq_size"])
423
+ print("Horizon: ", config.experiment.horizon)
424
+ print("All features: ", config.model.hyperparameters_fixed["all_features"])
425
+ print("Is data preprocessed: ", config.experiment.is_data_preprocessed)
426
+ print("Is wandb: ", config.experiment.is_wandb)
427
+ print("Is sweep: ", config.experiment.is_sweep)
428
+ print(config.experiment.type)
429
+ print("Is debug: ", config.experiment.is_debug)
430
+ if config.experiment.dataset_type == cst.Dataset.LOBSTER:
431
+ print("Training stocks: ", config.experiment.training_stocks)
432
+ print("Testing stocks: ", config.experiment.testing_stocks)
433
+
434
+
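Note: run_wandb returns a callback and sweep_init returns a grid-search sweep config, so a driver (main.py, which is outside this excerpt, so the exact wiring below is an assumption) could tie them together with the standard wandb sweep API:

import wandb
import constants as cst
from run import run_wandb, sweep_init

def launch(config, accelerator):
    if config.experiment.is_sweep:
        sweep_id = wandb.sweep(sweep_init(config), project=cst.PROJECT_NAME)
        wandb.agent(sweep_id, function=run_wandb(config, accelerator), count=10)
    else:
        run_wandb(config, accelerator)()   # single run: invoke the callback once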
tslaintc.png ADDED
utils/__pycache__/utils_data.cpython-311.pyc ADDED
Binary file (12.5 kB). View file
 
utils/__pycache__/utils_model.cpython-310.pyc ADDED
Binary file (827 Bytes). View file
 
utils/__pycache__/utils_model.cpython-311.pyc ADDED
Binary file (1.2 kB). View file
 
utils/utils_data.py ADDED
@@ -0,0 +1,238 @@
1
+ import pandas as pd
2
+ import numpy as np
3
+ import os
4
+
5
+ import torch
6
+ import pandas
7
+ import constants as cst
8
+
9
+
10
+ def z_score_orderbook(data, mean_size=None, mean_prices=None, std_size=None, std_prices=None):
11
+ """Apply z-score normalization to the orderbook; pass in the training-set mean/std to normalize the validation and test sets with the same statistics."""
12
+ if (mean_size is None) or (std_size is None):
13
+ mean_size = data.iloc[:, 1::2].stack().mean()
14
+ std_size = data.iloc[:, 1::2].stack().std()
15
+
16
+ #do the same thing for prices
17
+ if (mean_prices is None) or (std_prices is None):
18
+ mean_prices = data.iloc[:, 0::2].stack().mean() #price
19
+ std_prices = data.iloc[:, 0::2].stack().std() #price
20
+
21
+ # identify price and size columns, then z-score them in place after casting to float64
22
+ price_cols = data.columns[0::2]
23
+ size_cols = data.columns[1::2]
24
+
25
+ #apply the z score to the original data
26
+ for col in size_cols:
27
+ data[col] = data[col].astype("float64")
28
+ data[col] = (data[col] - mean_size) / std_size
29
+
30
+ for col in price_cols:
31
+ data[col] = data[col].astype("float64")
32
+ data[col] = (data[col] - mean_prices) / std_prices
33
+
34
+ # check if there are null values, then raise value error
35
+ if data.isnull().values.any():
36
+ raise ValueError("data contains null value")
37
+
38
+ return data, mean_size, mean_prices, std_size, std_prices
39
+
40
+
41
+ def normalize_messages(data, mean_size=None, mean_prices=None, std_size=None, std_prices=None, mean_time=None, std_time=None, mean_depth=None, std_depth=None):
42
+
43
+ #apply z score to prices and size column
44
+ if (mean_size is None) or (std_size is None):
45
+ mean_size = data["size"].mean()
46
+ std_size = data["size"].std()
47
+
48
+ if (mean_prices is None) or (std_prices is None):
49
+ mean_prices = data["price"].mean()
50
+ std_prices = data["price"].std()
51
+
52
+ if (mean_time is None) or (std_time is None):
53
+ mean_time = data["time"].mean()
54
+ std_time = data["time"].std()
55
+
56
+ if (mean_depth is None) or (std_depth is None):
57
+ mean_depth = data["depth"].mean()
58
+ std_depth = data["depth"].std()
59
+
60
+ #apply the z score to the original data
61
+ data["time"] = (data["time"] - mean_time) / std_time
62
+ data["size"] = (data["size"] - mean_size) / std_size
63
+ data["price"] = (data["price"] - mean_prices) / std_prices
64
+ data["depth"] = (data["depth"] - mean_depth) / std_depth
65
+ # check if there are null values, then raise value error
66
+ if data.isnull().values.any():
67
+ raise ValueError("data contains null value")
68
+
69
+ data["event_type"] = data["event_type"]-1.0
70
+ data["event_type"] = data["event_type"].replace(2, 1)
71
+ data["event_type"] = data["event_type"].replace(3, 2)
72
+ # order_type = 0 -> limit order
73
+ # order_type = 1 -> cancel order
74
+ # order_type = 2 -> market order
75
+ return data, mean_size, mean_prices, std_size, std_prices, mean_time, std_time, mean_depth, std_depth
76
+
77
+
78
+ def reset_indexes(dataframes):
79
+ # reset the indexes of the messages and orderbooks
80
+ dataframes[0] = dataframes[0].reset_index(drop=True)
81
+ dataframes[1] = dataframes[1].reset_index(drop=True)
82
+ return dataframes
83
+
84
+
85
+ def sampling_quantity(dataframes, quantity=1000):
86
+ messages_df, orderbook_df = dataframes[0], dataframes[1]
87
+
88
+ # Calculate cumulative sum and create boolean mask
89
+ cumsum = messages_df['size'].cumsum()
90
+ sample_mask = (cumsum % quantity < messages_df['size'])
91
+
92
+ # Get indices where we need to sample
93
+ sampled_indices = messages_df.index[sample_mask].tolist()
94
+
95
+ # Update both dataframes efficiently using loc
96
+ messages_df = messages_df.loc[sampled_indices].reset_index(drop=True)
97
+ orderbook_df = orderbook_df.loc[sampled_indices].reset_index(drop=True)
98
+
99
+ return [messages_df, orderbook_df]
100
+
101
+
102
+ def sampling_time(dataframes, time):
103
+ # Convert the time column to datetime format if it's not already
104
+ dataframes[0]['time'] = pd.to_datetime(dataframes[0]['time'], unit='s')
105
+
106
+ # Resample the messages dataframe at the chosen time frequency
107
+ resampled_messages = dataframes[0].set_index('time').resample(time).first().dropna().reset_index()
108
+
109
+ # Resample the orderbook dataframe at the same time frequency
110
+ resampled_orderbook = dataframes[1].set_index(dataframes[0]['time']).resample(time).first().dropna().reset_index(drop=True)
111
+
112
+ # Update the dataframes with the resampled data
113
+ dataframes[0] = resampled_messages
114
+
115
+ # Transform the time column to seconds
116
+ dataframes[0]['time'] = dataframes[0]['time'].dt.second + dataframes[0]['time'].dt.minute * 60 + dataframes[0]['time'].dt.hour * 3600 + dataframes[0]['time'].dt.microsecond / 1e6
117
+ dataframes[1] = resampled_orderbook
118
+
119
+ return dataframes
120
+
121
+
122
+ def preprocess_data(dataframes, n_lob_levels, sampling_type, time=None, quantity=None):
123
+ dataframes = reset_indexes(dataframes)
124
+ # take only the first n_lob_levels levels of the orderbook and drop the others
125
+ dataframes[1] = dataframes[1].iloc[:, :n_lob_levels * cst.LEN_LEVEL]
126
+
127
+ # take the indexes of the dataframes that are of type
128
+ # 2 (partial deletion), 5 (execution of a hidden limit order),
129
+ # 6 (cross trade), 7 (trading halt) and drop it
130
+ indexes_to_drop = dataframes[0][dataframes[0]["event_type"].isin([2, 5, 6, 7])].index
131
+ dataframes[0] = dataframes[0].drop(indexes_to_drop)
132
+ dataframes[1] = dataframes[1].drop(indexes_to_drop)
133
+
134
+ dataframes = reset_indexes(dataframes)
135
+
136
+ # sample the dataframes according to the sampling type
137
+ if sampling_type == "time":
138
+ dataframes = sampling_time(dataframes, time)
139
+ elif sampling_type == "quantity":
140
+ dataframes = sampling_quantity(dataframes, quantity)
141
+
142
+ dataframes = reset_indexes(dataframes)
143
+
144
+ # drop the order_id column in messages
145
+ dataframes[0] = dataframes[0].drop(columns=["order_id"])
146
+
147
+ # replace the raw timestamps in messages with row-over-row time differences
148
+ # Store the initial value of the "time" column
149
+ first_time = dataframes[0]["time"].values[0]
150
+ # Calculate the difference using diff
151
+ dataframes[0]["time"] = dataframes[0]["time"].diff()
152
+ # Set the first value directly
153
+ dataframes[0].iat[0, dataframes[0].columns.get_loc("time")] = first_time - 34200  # 34200 s = 09:30, the market open
154
+
155
+ # add depth column to messages
156
+ dataframes[0]["depth"] = 0
157
+
158
+ # we compute the depth of the orders with respect to the orderbook
159
+ # Extract necessary columns
160
+ prices = dataframes[0]["price"].values
161
+ directions = dataframes[0]["direction"].values
162
+ event_types = dataframes[0]["event_type"].values
163
+ bid_sides = dataframes[1].iloc[:, 2::4].values
164
+ ask_sides = dataframes[1].iloc[:, 0::4].values
165
+
166
+ # Initialize depth array
167
+ depths = np.zeros(dataframes[0].shape[0], dtype=int)
168
+
169
+ # Compute the depth of the orders with respect to the orderbook
170
+ for j in range(1, len(prices)):
171
+ order_price = prices[j]
172
+ direction = directions[j]
173
+ event_type = event_types[j]
174
+
175
+ index = j if event_type == 1 else j - 1
176
+
177
+ if direction == 1:
178
+ bid_price = bid_sides[index, 0]
179
+ depth = (bid_price - order_price) // 100
180
+ else:
181
+ ask_price = ask_sides[index, 0]
182
+ depth = (order_price - ask_price) // 100
183
+
184
+ depths[j] = max(depth, 0)
185
+
186
+ # Assign the computed depths back to the DataFrame
187
+ dataframes[0]["depth"] = depths
188
+
189
+ # we eliminate the first row of every dataframe because we can't deduce the depth
190
+ dataframes[0] = dataframes[0].iloc[1:, :]
191
+ dataframes[1] = dataframes[1].iloc[1:, :]
192
+ dataframes = reset_indexes(dataframes)
193
+
194
+ dataframes[0]["direction"] = dataframes[0]["direction"] * dataframes[0]["event_type"].apply(
195
+ lambda x: -1 if x == 4 else 1)
196
+
197
+ return dataframes[1], dataframes[0]
198
+
199
+
200
+ def unnormalize(x, mean, std):
201
+ return x * std + mean
202
+
203
+
204
+ def one_hot_encoding_type(data):
205
+ encoded_data = torch.zeros(data.shape[0], data.shape[1] + 2, dtype=torch.float32)
206
+ encoded_data[:, 0] = data[:, 0]
207
+ # encoding order type
208
+ one_hot_order_type = torch.nn.functional.one_hot((data[:, 1]).to(torch.int64), num_classes=3).to(
209
+ torch.float32)
210
+ encoded_data[:, 1:4] = one_hot_order_type
211
+ encoded_data[:, 4:] = data[:, 2:]
212
+ return encoded_data
213
+
214
+
215
+ def tanh_encoding_type(data):
216
+ data[:, 1] = torch.where(data[:, 1] == 1.0, 2.0, torch.where(data[:, 1] == 2.0, 1.0, data[:, 1]))
217
+ data[:, 1] = data[:, 1] - 1
218
+ return data
219
+
220
+
221
+ def to_sparse_representation(lob, n_levels):
222
+ if not isinstance(lob, np.ndarray):
223
+ lob = np.array(lob)
224
+ sparse_lob = np.zeros(n_levels * 2)
225
+ for j in range(lob.shape[0] // 2):
226
+ if j % 2 == 0:
227
+ ask_price = lob[0]
228
+ current_ask_price = lob[j*2]
229
+ depth = (current_ask_price - ask_price) // 100
230
+ if depth < n_levels and int(lob[j*2]) != 0:
231
+ sparse_lob[2*int(depth)] = lob[j*2+1]
232
+ else:
233
+ bid_price = lob[2]
234
+ current_bid_price = lob[j*2]
235
+ depth = (bid_price - current_bid_price) // 100
236
+ if depth < n_levels and int(lob[j*2]) != 0:
237
+ sparse_lob[2*int(depth)+1] = lob[j*2+1]
238
+ return sparse_lob
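Note: z_score_orderbook and normalize_messages fit their statistics on the first (training) dataframe and reuse them for the others, which is how _normalize_dataframes calls them. A toy sketch of that contract with a made-up two-row book:

import pandas as pd
from utils.utils_data import z_score_orderbook

train_book = pd.DataFrame({"sell1": [100.1, 100.2], "vsell1": [10, 12],
                           "buy1":  [100.0, 100.1], "vbuy1":  [11,  9]})
val_book = train_book.copy()

train_book, m_size, m_price, s_size, s_price = z_score_orderbook(train_book)
val_book, _, _, _, _ = z_score_orderbook(val_book, m_size, m_price, s_size, s_price)  # reuse train stats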
utils/utils_model.py ADDED
@@ -0,0 +1,18 @@
1
+ from models.mlplob import MLPLOB
2
+ from models.tlob import TLOB
3
+ from models.binctabl import BiN_CTABL
4
+ from models.deeplob import DeepLOB
6
+
7
+
8
+ def pick_model(model_type, hidden_dim, num_layers, seq_size, num_features, num_heads=8, is_sin_emb=False, dataset_type=None):
9
+ if model_type == "MLPLOB":
10
+ return MLPLOB(hidden_dim, num_layers, seq_size, num_features, dataset_type)
11
+ elif model_type == "TLOB":
12
+ return TLOB(hidden_dim, num_layers, seq_size, num_features, num_heads, is_sin_emb, dataset_type)
13
+ elif model_type == "BINCTABL":
14
+ return BiN_CTABL(60, num_features, seq_size, seq_size, 120, 5, 3, 1)
15
+ elif model_type == "DEEPLOB":
16
+ return DeepLOB()
17
+ else:
18
+ raise ValueError("Model not found")
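Note: a short usage sketch for pick_model; the hyperparameter values below are placeholders, not the repo's configured defaults.

from utils.utils_model import pick_model

model = pick_model("TLOB", hidden_dim=40, num_layers=4, seq_size=128,
                   num_features=40, num_heads=1, is_sin_emb=True,
                   dataset_type="FI-2010")
print(type(model).__name__)   # TLOB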
visualizations/__pycache__/attentions.cpython-311.pyc ADDED
Binary file (2.02 kB). View file
 
visualizations/attentions.py ADDED
@@ -0,0 +1,30 @@
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+
4
+ def plot_mean_att_distance(mean_att_dist):
5
+ """mean_att_dist shape: (num_layers, num_heads)"""
6
+ num_layers = mean_att_dist.shape[0]
7
+ num_heads = mean_att_dist.shape[1]
8
+ # Create the plot
9
+ plt.figure(figsize=(10, 6))
10
+
11
+ for head in range(num_heads):
12
+ values = mean_att_dist[:, head]
13
+ plt.scatter(range(num_layers), values, label=f'Head {head}', s=20)
14
+
15
+ plt.xlabel('Network depth (layer)')
16
+ plt.ylabel('Mean attention distance (time steps)')
17
+ plt.xlim(0, num_layers - 1)
18
+ plt.ylim(0, 128)
19
+
20
+ # Customize legend
21
+ plt.legend(loc='lower right', ncol=2, fontsize='small')
22
+
23
+ # Add ellipsis to legend
24
+ handles, labels = plt.gca().get_legend_handles_labels()
25
+ handles.append(plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='gray', markersize=5))
26
+ labels.append('...')
27
+ plt.legend(handles, labels, loc='lower right', ncol=2, fontsize='small')
28
+ plt.tight_layout()
29
+
30
+ return plt
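Note: the per-layer buffers TLOB accumulates in mean_att_distance_temporal have exactly the (num_layers, num_heads) shape this plot expects; random numbers stand in for a real run here.

import numpy as np
from visualizations.attentions import plot_mean_att_distance

mean_att_dist = np.abs(np.random.randn(4, 8)) * 30   # (num_layers, num_heads), toy values
plt = plot_mean_att_distance(mean_att_dist)
plt.savefig("mean_attention_distance.png")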