Upload folder using huggingface_hub

- apogee/data/aggregation.py +12 -0
- apogee/model.py +3 -1
- apogee/tokenizer.py +29 -16
- assets/candles_binance.BTCUSDT_1m.png +0 -0
- assets/candles_binance.BTCUSDT_8h.png +0 -0
- assets/candles_binance.DOGEUSDT_2h.png +0 -0
- ckpt.pt +2 -2
- handler.py +15 -6
apogee/data/aggregation.py
ADDED

@@ -0,0 +1,12 @@
+
+
+freq2sec = {
+    "1m": 1 * 60,
+    "5m": 5 * 60,
+    "30m": 30 * 60,
+    "2h": 2 * 60 * 60,
+    "8h": 8 * 60 * 60,
+    "1d": 24 * 60 * 60,
+}
+
+sec2freq = {v: k for k, v in freq2sec.items()}
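A quick sanity check of the new mapping and its inverse (a self-contained sketch with the dict inlined):

```python
# Sketch: the mapping added in apogee/data/aggregation.py and its inverse.
freq2sec = {"1m": 60, "5m": 300, "30m": 1800, "2h": 7200, "8h": 28800, "1d": 86400}
sec2freq = {v: k for k, v in freq2sec.items()}

assert freq2sec["2h"] == 2 * 60 * 60 == 7200
assert sec2freq[7200] == "2h"  # inverse lookup round-trips
# Sorted by duration, this yields the ordering the Tokenizer uses for its `freqs` list:
assert sorted(freq2sec, key=freq2sec.get) == ["1m", "5m", "30m", "2h", "8h", "1d"]
```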
apogee/model.py
CHANGED

@@ -108,6 +108,7 @@ class Block(torch.nn.Module):
 
 @dataclass
 class ModelConfig:
+    meta_size: int
     block_size: int
     vocab_size: int
     n_layer: int = 3
@@ -137,6 +138,7 @@ class GPT(torch.nn.Module):
             wbe = torch.nn.Embedding(4, config.n_embd),
             wce = torch.nn.Embedding(5, config.n_embd),
             wpe = torch.nn.Embedding(config.block_size // 20, config.n_embd),
+            wme = torch.nn.Embedding(config.meta_size, config.n_embd),
             drop = torch.nn.Dropout(config.dropout),
             h = torch.nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
             ln_f = LayerNorm(config.n_embd, bias=config.bias),
@@ -186,7 +188,7 @@ class GPT(torch.nn.Module):
         # forward the GPT model itself
         tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
         # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        pos_emb = self.transformer.wbe(pos % 4) + self.transformer.wce(pos // 4 % 5) + self.transformer.wpe(pos // 20)
+        pos_emb = torch.cat([self.transformer.wme(pos[:self.config.meta_size]), self.transformer.wbe(pos[:-self.config.meta_size] % 4) + self.transformer.wce(pos[:-self.config.meta_size] // 4 % 5) + self.transformer.wpe(pos[:-self.config.meta_size] // 20)], dim=0)
         x = self.transformer.drop(tok_emb + pos_emb)
         for block in self.transformer.h:
             x = block(x)
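The positional-embedding change is the heart of this commit: each candle spans 4 bytes × 5 channels = 20 tokens, so a flat position factors into a byte index (`wbe`, `pos % 4`), a channel index (`wce`, `pos // 4 % 5`) and a candle index (`wpe`, `pos // 20`), while the leading metadata tokens now get their own `wme` table. Note the slicing trick: `pos[:-meta_size]` yields values `0..t-meta_size-1`, which re-bases the candle tokens at position 0. A minimal sketch of the decomposition, assuming `meta_size` is 11 to match the tokenizer's `meta_context_len` (illustrative values only):

```python
import torch

meta_size = 11                    # assumption: matches Tokenizer.meta_context_len
t = meta_size + 2 * 20            # 11 metadata tokens + 2 candles of 20 tokens each
pos = torch.arange(t)

meta_pos = pos[:meta_size]        # 0..10 -> looked up in wme
candle_pos = pos[:-meta_size]     # 0..39 -> candle positions re-based at 0
byte_idx = candle_pos % 4         # byte within a float32 value -> wbe (4 entries)
chan_idx = candle_pos // 4 % 5    # O/H/L/C/V channel           -> wce (5 entries)
candle_idx = candle_pos // 20     # candle within the window    -> wpe

assert meta_pos.tolist() == list(range(meta_size))
assert byte_idx[:8].tolist() == [0, 1, 2, 3, 0, 1, 2, 3]
assert chan_idx[:8].tolist() == [0, 0, 0, 0, 1, 1, 1, 1]
assert candle_idx[19].item() == 0 and candle_idx[20].item() == 1
```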
apogee/tokenizer.py
CHANGED

@@ -3,39 +3,52 @@ import torch
 
 import numpy as np
 
-from typing import Union
+from typing import List, Tuple, Union
+from .data.aggregation import freq2sec
 
 class Tokenizer:
-
-
-
-
-
-
-    def tokens_per_candle(self) -> int:
-        """Return the number of tokens per candle"""
-        return 4 * 5
+    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 " # Allowed characters in pair names
+    freqs = [k for k, _ in sorted(freq2sec.items(), key=lambda x: x[1])] # Sort freq2sec by duration and extract the keys
+    pair_name_max_len: int = 10
+    vocabulary_size: int = 256 + 1 + len(letters) + len(freqs) # 256 possible bytes + 1 for <BOS> token + len(letters) for pair name
+    tokens_per_candle: int = 4 * 5
+    meta_context_len: int = 11
 
-    def encode(self, candles: Union[np.array, torch.Tensor]) -> torch.Tensor:
+    def encode(self, key: str, freq: str, candles: Union[np.array, torch.Tensor]) -> torch.Tensor:
         """Tokenize candles into tokens."""
+        _, pair = key.split(".") # Split the key into exchange and pair
+        meta = torch.tensor([257 + Tokenizer.letters.index(letter) for letter in pair.upper().ljust(self.pair_name_max_len)] + [257 + len(Tokenizer.letters) + Tokenizer.freqs.index(freq)], dtype=torch.uint16) # Encode the pair name and frequency
         if isinstance(candles, np.ndarray): # Wrap into a tensor
             candles = torch.tensor(candles)
         candles = (candles.view(torch.int32) << 1).view(torch.float32) # Erase the sign bit to fit the exponent into the first byte
-        if sys.byteorder == 'little':
+        if sys.byteorder == 'little': # On little-endian systems, we need to byteswap the data so that msb is first
            candles.untyped_storage().byteswap(torch.float32)
         buffer = candles.view(torch.uint8) # Interpret the data as bytes ("tokenization" step)
         buffer = buffer.view(-1).to(torch.uint16) # Flatten the data and convert to uint16 because otherwise <BOS> will overflow
-        buffer = torch.cat([torch.tensor([256], dtype=torch.uint16), buffer]) # Prepend <BOS> (Begin of Series) token
+        buffer = torch.cat([meta, torch.tensor([256], dtype=torch.uint16), buffer]) # Prepend the metadata and <BOS> (Begin of Series) token
         return buffer
 
-    def decode(self, tokens: torch.Tensor) -> torch.Tensor:
+    def decode(self, tokens: torch.Tensor) -> Tuple[Union[List[str], str], Union[List[str], str], torch.Tensor]:
         """Decode tokens into candles."""
         tokens = tokens.long()
-        candles_tokens = tokens[..., 1:] # Remove <BOS> token
+        meta_tokens, candles_tokens = tokens[..., :Tokenizer.meta_context_len], tokens[..., Tokenizer.meta_context_len + 1:] # Split off the metadata and remove the <BOS> token
         candles_tokens = candles_tokens.to(torch.uint8).view(*tokens.shape[:-1], -1, self.tokens_per_candle) # Convert back to uint8 and reshape
         candles_tokens = candles_tokens.view(torch.float32) # Interpret the data as floats
         if sys.byteorder == 'little': # On little-endian systems, we need to byteswap the data back
             # candles_tokens.untyped_storage().byteswap(torch.float32) # <-- This segfaults for some reason
             candles_tokens = candles_tokens.view(torch.uint8).view(*candles_tokens.shape, 4).flip(-1).view(torch.float32).squeeze(-1) # Workaround
         candles_tokens = -((candles_tokens.view(torch.int32) >> 1) | (1 << 31)).view(torch.float32) # Restore the sign bit
-        return candles_tokens
+        if meta_tokens.ndim == 1:
+            meta_tokens = meta_tokens.unsqueeze(0)
+            squeeze = True
+        else:
+            squeeze = False
+        pair_meta, freq_meta = meta_tokens[..., :-1], meta_tokens[..., -1] # Extract pair and frequency tokens
+        pairs = []
+        freqs = []
+        for i in range(len(pair_meta)):
+            pairs.append("".join(Tokenizer.letters[token - 257] for token in pair_meta[i].tolist()).rstrip(" "))
+            freqs.append(Tokenizer.freqs[freq_meta[i] - (257 + len(Tokenizer.letters))])
+        pair = pairs[0] if squeeze else pairs
+        freq = freqs[0] if squeeze else freqs
+        return pair, freq, candles_tokens
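The metadata block explains the new vocabulary arithmetic: token ids 0..255 are the raw bytes of the sign-stripped, big-endian float32 values, 256 is `<BOS>`, 257..293 cover the 37 pair-name characters, and 294..299 the six frequencies, giving a vocabulary of 300. The pair name is space-padded to 10 characters, so the metadata context is 11 tokens. A dependency-free sketch of that layout, using a hypothetical pair and mirroring the encode/decode offsets:

```python
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "  # 37 allowed pair-name characters
freqs = ["1m", "5m", "30m", "2h", "8h", "1d"]      # freq2sec keys sorted by duration

# Token id layout: 0..255 raw bytes, 256 <BOS>, 257..293 letters, 294..299 frequencies.
assert 256 + 1 + len(letters) + len(freqs) == 300  # == Tokenizer.vocabulary_size

pair, freq = "BTCUSDT", "1m"                       # hypothetical pair and frequency
meta = [257 + letters.index(c) for c in pair.ljust(10)] \
     + [257 + len(letters) + freqs.index(freq)]
assert len(meta) == 11                             # == Tokenizer.meta_context_len

# Decoding reverses the offsets and strips the space padding:
name = "".join(letters[tok - 257] for tok in meta[:-1]).rstrip(" ")
assert name == pair and freqs[meta[-1] - (257 + len(letters))] == freq
```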
assets/candles_binance.BTCUSDT_1m.png
CHANGED

assets/candles_binance.BTCUSDT_8h.png
CHANGED

assets/candles_binance.DOGEUSDT_2h.png
CHANGED
ckpt.pt
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2c9e6255ab969e62fd9e2b4fc5422af7d85ecd6148b823ab48ffa9845a745c5a
+size 256702350
handler.py
CHANGED

@@ -48,13 +48,15 @@ class EndpointHandler:
         self.model(torch.randint(0, self.tokenizer.vocabulary_size, (1, self.config.block_size), device=self.device))
         print("Model ready ! ✅")
         # Precompute useful values
-        self.max_candles = self.config.block_size // self.tokenizer.tokens_per_candle
+        self.max_candles = (self.config.block_size - self.config.meta_size) // self.tokenizer.tokens_per_candle
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
         Args:
             data (Dict[str, Any]):
-                inputs: Dict[str, List[float]] with keys:
+                inputs: Dict[str, Union[str, List[float]]] with keys:
+                    pair: Pair symbol
+                    frequency: Frequency of the time series (1m, 5m, 30m, 2h, 8h, 1d)
                     timestamps: Timestamps of the time series
                     open: Open prices
                     high: High prices
@@ -77,9 +79,13 @@
         # Unpack input data
         inputs = data.pop("inputs", data)
         # Validate the inputs
-        assert "timestamps" in inputs and "open" in inputs and "high" in inputs and "low" in inputs and "close" in inputs and "volume" in inputs, "Required keys: timestamps, open, high, low, close, volume"
-        assert isinstance(inputs["timestamps"], list) and isinstance(inputs["open"], list) and isinstance(inputs["high"], list) and isinstance(inputs["low"], list) and isinstance(inputs["close"], list) and isinstance(inputs["volume"], list), "Inputs must be lists"
+        assert "pair" in inputs and "frequency" in inputs and "timestamps" in inputs and "open" in inputs and "high" in inputs and "low" in inputs and "close" in inputs and "volume" in inputs, "Required keys: pair, frequency, timestamps, open, high, low, close, volume"
+        assert isinstance(inputs["pair"], str) and isinstance(inputs["frequency"], str) and isinstance(inputs["timestamps"], list) and isinstance(inputs["open"], list) and isinstance(inputs["high"], list) and isinstance(inputs["low"], list) and isinstance(inputs["close"], list) and isinstance(inputs["volume"], list), "Inputs must be lists"
+        assert inputs["frequency"] in ["1m", "5m", "30m", "2h", "8h", "1d"], "Invalid frequency"
         assert len(inputs["timestamps"]) == len(inputs["open"]) == len(inputs["high"]) == len(inputs["low"]) == len(inputs["close"]) == len(inputs["volume"]), "Inputs must have the same length"
+        pair, freq = inputs["pair"], inputs["frequency"]
+        pair = "".join(pair.split("/"))
+        pair = f"binance.{pair.upper()}" if "." not in pair else pair
         timestamps = torch.tensor(inputs["timestamps"])
         samples = torch.tensor([inputs["open"], inputs["high"], inputs["low"], inputs["close"], inputs["volume"]], dtype=torch.float32).T.contiguous()
         steps = data.pop("steps", 4)
@@ -94,7 +100,7 @@
             torch.cuda.manual_seed(seed)
         # Generate scenarios
         samples = samples[-self.max_candles + steps:] # Keep only the last candles that fit in the model's context
-        tokens = self.tokenizer.encode(samples) # Encode the samples into tokens
+        tokens = self.tokenizer.encode(pair, freq, samples) # Encode the samples into tokens
         tokens = tokens.to(self.device).unsqueeze(0).long() # Add a batch dimension
         with torch.no_grad(), self.ctx:
             for _ in range(steps * self.tokenizer.tokens_per_candle):
@@ -112,7 +118,8 @@
                 # append sampled index to the running sequence and continue
                 tokens = torch.cat((tokens, next_tokens), dim=1)
         # Decode the tokens back into samples
-        scenarios = self.tokenizer.decode(tokens)
+        _, _, scenarios = self.tokenizer.decode(tokens)
+        scenarios = scenarios[:, -steps:]
         print(f"Generated {n_scenarios} scenarios in {time.time() - t_start:.2f} seconds ⏱")
         return {
             "timestamps": (timestamps[-1] + torch.arange(1, steps+1) * torch.median(torch.diff(timestamps)).item()).tolist(),
@@ -131,6 +138,8 @@ if __name__ == "__main__":
        data = pd.read_csv(f)
    y = handler({
        "inputs": {
+           "pair": "binance.BTCUSDT",
+           "frequency": "1m",
            "timestamps": data[data.columns[0]].tolist(),
            "open": data[data.columns[1]].tolist(),
            "high": data[data.columns[2]].tolist(),
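For callers, the request shape gains two string fields. A minimal sketch of a payload for the updated endpoint (hypothetical numbers; `steps` is optional and defaults to 4):

```python
payload = {
    "inputs": {
        "pair": "BTC/USDT",     # "/" is stripped; "binance." is prefixed when no "." is given
        "frequency": "1m",      # one of: 1m, 5m, 30m, 2h, 8h, 1d
        "timestamps": [1700000000, 1700000060, 1700000120],  # hypothetical values
        "open":   [37000.0, 37010.0, 37005.0],
        "high":   [37020.0, 37015.0, 37030.0],
        "low":    [36990.0, 37000.0, 37001.0],
        "close":  [37010.0, 37005.0, 37025.0],
        "volume": [12.3, 9.8, 15.1],
    },
    "steps": 4,                 # optional: number of candles to forecast
}
# handler = EndpointHandler(...)  # construction arguments as in the repo's __main__
# y = handler(payload)            # -> {"timestamps": [...], ...} plus generated scenarios
```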