Theob committed · verified
Commit 8ecbf61 · 1 Parent(s): c46eb4d

Upload folder using huggingface_hub

apogee/data/aggregation.py ADDED
@@ -0,0 +1,12 @@
+
+
+freq2sec = {
+    "1m": 1 * 60,
+    "5m": 5 * 60,
+    "30m": 30 * 60,
+    "2h": 2 * 60 * 60,
+    "8h": 8 * 60 * 60,
+    "1d": 24 * 60 * 60,
+}
+
+sec2freq = {v: k for k, v in freq2sec.items()}
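
A quick usage sketch (not part of the commit): the two tables invert each other because every frequency maps to a unique duration in seconds.

    from apogee.data.aggregation import freq2sec, sec2freq

    assert freq2sec["30m"] == 1800                             # 30 minutes in seconds
    assert sec2freq[24 * 60 * 60] == "1d"                      # reverse lookup
    assert all(sec2freq[freq2sec[f]] == f for f in freq2sec)   # round-trip
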
apogee/model.py CHANGED
@@ -108,6 +108,7 @@ class Block(torch.nn.Module):
 
 @dataclass
 class ModelConfig:
+    meta_size: int
     block_size: int
     vocab_size: int
     n_layer: int = 3
@@ -137,6 +138,7 @@ class GPT(torch.nn.Module):
             wbe = torch.nn.Embedding(4, config.n_embd),
             wce = torch.nn.Embedding(5, config.n_embd),
             wpe = torch.nn.Embedding(config.block_size // 20, config.n_embd),
+            wme = torch.nn.Embedding(config.meta_size, config.n_embd),
             drop = torch.nn.Dropout(config.dropout),
             h = torch.nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
             ln_f = LayerNorm(config.n_embd, bias=config.bias),
@@ -186,7 +188,7 @@ class GPT(torch.nn.Module):
         # forward the GPT model itself
         tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
         # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        pos_emb = self.transformer.wbe(pos % 4) + self.transformer.wce(pos // 4 % 5) + self.transformer.wpe(pos // 20)
+        pos_emb = torch.cat([self.transformer.wme(pos[:self.config.meta_size]), self.transformer.wbe(pos[:-self.config.meta_size] % 4) + self.transformer.wce(pos[:-self.config.meta_size] // 4 % 5) + self.transformer.wpe(pos[:-self.config.meta_size] // 20)], dim=0)
         x = self.transformer.drop(tok_emb + pos_emb)
         for block in self.transformer.h:
             x = block(x)
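
Note on the new pos_emb (an explanatory sketch, not from the repo): the first meta_size positions get the new wme metadata embedding, while the rest keep the factored byte/channel/candle scheme (4 bytes per float32, 5 OHLCV channels, 20 tokens per candle). Assuming pos = torch.arange(t), the slice pos[:-self.config.meta_size] works because dropping meta_size entries from the end of an arange yields exactly the zero-based local positions of the remaining tokens:

    import torch

    meta_size, t = 11, 51                      # hypothetical sizes for illustration
    pos = torch.arange(t)
    assert torch.equal(pos[:-meta_size], pos[meta_size:] - meta_size)  # same local positions

    local = pos[:-meta_size]
    byte_idx = local % 4         # byte within a float32 -> wbe (4 rows)
    chan_idx = local // 4 % 5    # OHLCV channel         -> wce (5 rows)
    candle_idx = local // 20     # candle index          -> wpe (block_size // 20 rows)
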
apogee/tokenizer.py CHANGED
@@ -3,39 +3,52 @@ import torch
 
 import numpy as np
 
-from typing import Union
+from typing import List, Tuple, Union
+from .data.aggregation import freq2sec
 
 class Tokenizer:
-    @property
-    def vocabulary_size(self) -> int:
-        """Return the size of the vocabulary"""
-        return 257
-
-    @property
-    def tokens_per_candle(self) -> int:
-        """Return the number of tokens per candle"""
-        return 4 * 5
+    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 " # Allowed characters in pair names
+    freqs = [k for k, _ in sorted(freq2sec.items(), key=lambda x: x[1])] # Sort the freq2sec by duration and extract the keys
+    pair_name_max_len: int = 10
+    vocabulary_size: int = 256 + 1 + len(letters) + len(freqs) # 256 possible bytes + 1 for <BOS> token + len(letters) for pair name
+    tokens_per_candle: int = 4 * 5
+    meta_context_len: int = 11
 
-    def encode(self, candles: Union[np.array, torch.Tensor]) -> torch.Tensor:
+    def encode(self, key: str, freq: str, candles: Union[np.array, torch.Tensor]) -> torch.Tensor:
         """Tokenize candles into tokens."""
+        _, pair = key.split(".") # Split the key into exchange and pair
+        meta = torch.tensor([257 + Tokenizer.letters.index(letter) for letter in pair.upper().ljust(self.pair_name_max_len)] + [257 + len(Tokenizer.letters) + Tokenizer.freqs.index(freq)], dtype=torch.uint16) # Encode the pair name and frequency
         if isinstance(candles, np.ndarray): # Wrap into a tensor
             candles = torch.tensor(candles)
         candles = (candles.view(torch.int32) << 1).view(torch.float32) # Erase the sign bit to fit the exponent into the first byte
-        if sys.byteorder == 'little':# On little-endian systems, we need to byteswap the data so that msb is first
+        if sys.byteorder == 'little': # On little-endian systems, we need to byteswap the data so that msb is first
            candles.untyped_storage().byteswap(torch.float32)
        buffer = candles.view(torch.uint8) # Interpret the data as bytes ("tokenization" step)
        buffer = buffer.view(-1).to(torch.uint16) # Flatten the data and convert to uint16 because otherwise <BOS> will overflow
-        buffer = torch.cat([torch.tensor([256], dtype=torch.uint16), buffer]) # Prepend <BOS> (Begin of Series) token
+        buffer = torch.cat([meta, torch.tensor([256], dtype=torch.uint16), buffer]) # Prepend <BOS> (Begin of Series) token
        return buffer
 
-    def decode(self, tokens: torch.Tensor) -> torch.Tensor:
+    def decode(self, tokens: torch.Tensor) -> Tuple[Union[List[str], str], Union[List[str], str], torch.Tensor]:
        """Decode tokens into candles."""
        tokens = tokens.long()
-        candles_tokens = tokens[..., 1:] # Remove <BOS> token
+        meta_tokens, candles_tokens = tokens[..., :Tokenizer.meta_context_len], tokens[..., Tokenizer.meta_context_len + 1:] # Remove <BOS> token
        candles_tokens = candles_tokens.to(torch.uint8).view(*tokens.shape[:-1], -1, self.tokens_per_candle) # Convert back to uint8 and reshape
        candles_tokens = candles_tokens.view(torch.float32) # Interpret the data as floats
        if sys.byteorder == 'little': # On little-endian systems, we need to byteswap the data back
            # candles_tokens.untyped_storage().byteswap(torch.float32) # <-- This segfaults for some reason
            candles_tokens = candles_tokens.view(torch.uint8).view(*candles_tokens.shape, 4).flip(-1).view(torch.float32).squeeze(-1) # Workaround
        candles_tokens = -((candles_tokens.view(torch.int32) >> 1) | (1 << 31)).view(torch.float32) # Restore the sign bit
-        return candles_tokens
+        if meta_tokens.ndim == 1:
+            meta_tokens = meta_tokens.unsqueeze(0)
+            squeeze = True
+        else:
+            squeeze = False
+        pair_meta, freq_meta = meta_tokens[..., :-1], meta_tokens[..., -1] # Extract pair and frequency tokens
+        pairs = []
+        freqs = []
+        for i in range(len(pair_meta)):
+            pairs.append("".join(Tokenizer.letters[token - 257] for token in pair_meta[i].tolist()).rstrip(" "))
+            freqs.append(Tokenizer.freqs[freq_meta[i] - (257 + len(Tokenizer.letters))])
+        pair = pairs[0] if squeeze else pairs
+        freq = freqs[0] if squeeze else freqs
+        return pair, freq, candles_tokens
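
For illustration, a standalone sketch (not from the repo) of the 11-token metadata header this encoder builds: 10 pair-name letters padded with spaces, then 1 frequency token, all offset past the 256 byte values and the <BOS> token at 256. The hardcoded freqs list below assumes the freq2sec keys sorted by duration:

    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "
    freqs = ["1m", "5m", "30m", "2h", "8h", "1d"]

    pair, freq = "BTCUSDT", "5m"
    meta = [257 + letters.index(c) for c in pair.ljust(10)] + [257 + len(letters) + freqs.index(freq)]
    assert len(meta) == 11  # meta_context_len

    # Decoding inverts the offsets:
    decoded_pair = "".join(letters[t - 257] for t in meta[:-1]).rstrip(" ")
    decoded_freq = freqs[meta[-1] - (257 + len(letters))]
    assert (decoded_pair, decoded_freq) == (pair, freq)
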
assets/candles_binance.BTCUSDT_1m.png CHANGED
assets/candles_binance.BTCUSDT_8h.png CHANGED
assets/candles_binance.DOGEUSDT_2h.png CHANGED
ckpt.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b4e2b9a1f19de392b95c9d86be8f175e613cc27454a8d7a47f7dfdc4f3a8ab3d
-size 256452450
+oid sha256:2c9e6255ab969e62fd9e2b4fc5422af7d85ecd6148b823ab48ffa9845a745c5a
+size 256702350
handler.py CHANGED
@@ -48,13 +48,15 @@ class EndpointHandler:
         self.model(torch.randint(0, self.tokenizer.vocabulary_size, (1, self.config.block_size), device=self.device))
         print("Model ready ! ✅")
         # Precompute useful values
-        self.max_candles = self.config.block_size // self.tokenizer.tokens_per_candle
+        self.max_candles = (self.config.block_size - self.config.meta_size) // self.tokenizer.tokens_per_candle
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
         Args:
             data (Dict[str, Any]):
-                inputs: Dict[str, List[float]] with keys:
+                inputs: Dict[str, Union[str, List[float]]] with keys:
+                    pair: Pair symbol
+                    frequency: Frequency of the time serie (1m, 5m, 30m, 2h, 8h, 1d)
                     timestamps: Timestamps of the time serie
                     open: Open prices
                     high: High prices
@@ -77,9 +79,13 @@ class EndpointHandler:
         # Unpack input data
         inputs = data.pop("inputs", data)
         # Validate the inputs
-        assert "timestamps" in inputs and "open" in inputs and "high" in inputs and "low" in inputs and "close" in inputs and "volume" in inputs, "Required keys: timestamps, open, high, low, close, volume"
-        assert isinstance(inputs["timestamps"], list) and isinstance(inputs["open"], list) and isinstance(inputs["high"], list) and isinstance(inputs["low"], list) and isinstance(inputs["close"], list) and isinstance(inputs["volume"], list), "Inputs must be lists"
+        assert "pair" in inputs and "frequency" in inputs and "timestamps" in inputs and "open" in inputs and "high" in inputs and "low" in inputs and "close" in inputs and "volume" in inputs, "Required keys: pair, frequency, timestamps, open, high, low, close, volume"
+        assert isinstance(inputs["pair"], str) and isinstance(inputs["frequency"], str) and isinstance(inputs["timestamps"], list) and isinstance(inputs["open"], list) and isinstance(inputs["high"], list) and isinstance(inputs["low"], list) and isinstance(inputs["close"], list) and isinstance(inputs["volume"], list), "Inputs must be lists"
+        assert inputs["frequency"] in ["1m", "5m", "30m", "2h", "8h", "1d"], "Invalid frequency"
         assert len(inputs["timestamps"]) == len(inputs["open"]) == len(inputs["high"]) == len(inputs["low"]) == len(inputs["close"]) == len(inputs["volume"]), "Inputs must have the same length"
+        pair, freq = inputs["pair"], inputs["frequency"]
+        pair = "".join(pair.split("/"))
+        pair = f"binance.{pair.upper()}" if "." not in pair else pair
         timestamps = torch.tensor(inputs["timestamps"])
         samples = torch.tensor([inputs["open"], inputs["high"], inputs["low"], inputs["close"], inputs["volume"]], dtype=torch.float32).T.contiguous()
         steps = data.pop("steps", 4)
@@ -94,7 +100,7 @@ class EndpointHandler:
             torch.cuda.manual_seed(seed)
         # Generate scenarios
         samples = samples[-self.max_candles + steps:] # Keep only the last candles that fit in the model's context
-        tokens = self.tokenizer.encode(samples) # Encode the samples into tokens
+        tokens = self.tokenizer.encode(pair, freq, samples) # Encode the samples into tokens
        tokens = tokens.to(self.device).unsqueeze(0).long() # Add a batch dimension
        with torch.no_grad(), self.ctx:
            for _ in range(steps * self.tokenizer.tokens_per_candle):
@@ -112,7 +118,8 @@ class EndpointHandler:
                # append sampled index to the running sequence and continue
                tokens = torch.cat((tokens, next_tokens), dim=1)
        # Decode the tokens back into samples
-        scenarios = self.tokenizer.decode(tokens)[:, -steps:]
+        _, _, scenarios = self.tokenizer.decode(tokens)
+        scenarios = scenarios[:, -steps:]
        print(f"Generated {n_scenarios} scenarios in {time.time() - t_start:.2f} seconds ⏱")
        return {
            "timestamps": (timestamps[-1] + torch.arange(1, steps+1) * torch.median(torch.diff(timestamps)).item()).tolist(),
@@ -131,6 +138,8 @@ if __name__ == "__main__":
     data = pd.read_csv(f)
     y = handler({
         "inputs": {
+            "pair": "binance.BTCUSDT",
+            "frequency": "1m",
             "timestamps": data[data.columns[0]].tolist(),
             "open": data[data.columns[1]].tolist(),
             "high": data[data.columns[2]].tolist(),