Upload folder using huggingface_hub

- apogee/data/aggregation.py +12 -0
- apogee/model.py +3 -1
- apogee/tokenizer.py +29 -16
- assets/candles_binance.BTCUSDT_1m.png +0 -0
- assets/candles_binance.BTCUSDT_8h.png +0 -0
- assets/candles_binance.DOGEUSDT_2h.png +0 -0
- ckpt.pt +2 -2
- handler.py +15 -6
apogee/data/aggregation.py
ADDED

@@ -0,0 +1,12 @@
+
+
+freq2sec = {
+    "1m": 1 * 60,
+    "5m": 5 * 60,
+    "30m": 30 * 60,
+    "2h": 2 * 60 * 60,
+    "8h": 8 * 60 * 60,
+    "1d": 24 * 60 * 60,
+}
+
+sec2freq = {v: k for k, v in freq2sec.items()}
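A quick sanity check of the new mapping and its inverse (a self-contained sketch with the dict inlined):

```python
# Sketch: the mapping added in apogee/data/aggregation.py and its inverse.
freq2sec = {"1m": 60, "5m": 300, "30m": 1800, "2h": 7200, "8h": 28800, "1d": 86400}
sec2freq = {v: k for k, v in freq2sec.items()}

assert freq2sec["2h"] == 2 * 60 * 60 == 7200
assert sec2freq[7200] == "2h"  # inverse lookup round-trips
# Sorted by duration, this yields the ordering the Tokenizer uses for its `freqs` list:
assert sorted(freq2sec, key=freq2sec.get) == ["1m", "5m", "30m", "2h", "8h", "1d"]
```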
apogee/model.py
CHANGED

@@ -108,6 +108,7 @@ class Block(torch.nn.Module):
 
 @dataclass
 class ModelConfig:
+    meta_size: int
     block_size: int
     vocab_size: int
     n_layer: int = 3
@@ -137,6 +138,7 @@ class GPT(torch.nn.Module):
             wbe = torch.nn.Embedding(4, config.n_embd),
             wce = torch.nn.Embedding(5, config.n_embd),
             wpe = torch.nn.Embedding(config.block_size // 20, config.n_embd),
+            wme = torch.nn.Embedding(config.meta_size, config.n_embd),
             drop = torch.nn.Dropout(config.dropout),
             h = torch.nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
             ln_f = LayerNorm(config.n_embd, bias=config.bias),
@@ -186,7 +188,7 @@ class GPT(torch.nn.Module):
         # forward the GPT model itself
         tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
         # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        pos_emb = self.transformer.wbe(pos % 4) + self.transformer.wce(pos // 4 % 5) + self.transformer.wpe(pos // 20)
+        pos_emb = torch.cat([self.transformer.wme(pos[:self.config.meta_size]), self.transformer.wbe(pos[:-self.config.meta_size] % 4) + self.transformer.wce(pos[:-self.config.meta_size] // 4 % 5) + self.transformer.wpe(pos[:-self.config.meta_size] // 20)], dim=0)
         x = self.transformer.drop(tok_emb + pos_emb)
         for block in self.transformer.h:
             x = block(x)
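The positional-embedding change is the heart of this commit: each candle spans 4 bytes × 5 channels = 20 tokens, so a flat position factors into a byte index (`wbe`, `pos % 4`), a channel index (`wce`, `pos // 4 % 5`) and a candle index (`wpe`, `pos // 20`), while the leading metadata tokens now get their own `wme` table. Note the slicing trick: `pos[:-meta_size]` yields values `0..t-meta_size-1`, which re-bases the candle tokens at position 0. A minimal sketch of the decomposition, assuming `meta_size` is 11 to match the tokenizer's `meta_context_len` (illustrative values only):

```python
import torch

meta_size = 11                    # assumption: matches Tokenizer.meta_context_len
t = meta_size + 2 * 20            # 11 metadata tokens + 2 candles of 20 tokens each
pos = torch.arange(t)

meta_pos = pos[:meta_size]        # 0..10 -> looked up in wme
candle_pos = pos[:-meta_size]     # 0..39 -> candle positions re-based at 0
byte_idx = candle_pos % 4         # byte within a float32 value -> wbe (4 entries)
chan_idx = candle_pos // 4 % 5    # O/H/L/C/V channel           -> wce (5 entries)
candle_idx = candle_pos // 20     # candle within the window    -> wpe

assert meta_pos.tolist() == list(range(meta_size))
assert byte_idx[:8].tolist() == [0, 1, 2, 3, 0, 1, 2, 3]
assert chan_idx[:8].tolist() == [0, 0, 0, 0, 1, 1, 1, 1]
assert candle_idx[19].item() == 0 and candle_idx[20].item() == 1
```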
apogee/tokenizer.py
CHANGED

@@ -3,39 +3,52 @@ import torch
 
 import numpy as np
 
-from typing import Union
+from typing import List, Tuple, Union
+from .data.aggregation import freq2sec
 
 class Tokenizer:
-
-
-
-
-
-
-    def tokens_per_candle(self) -> int:
-        """Return the number of tokens per candle"""
-        return 4 * 5
+    letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 " # Allowed characters in pair names
+    freqs = [k for k, _ in sorted(freq2sec.items(), key=lambda x: x[1])] # Sort freq2sec by duration and extract the keys
+    pair_name_max_len: int = 10
+    vocabulary_size: int = 256 + 1 + len(letters) + len(freqs) # 256 possible bytes + 1 for <BOS> token + len(letters) for pair name
+    tokens_per_candle: int = 4 * 5
+    meta_context_len: int = 11
 
-    def encode(self, candles: Union[np.array, torch.Tensor]) -> torch.Tensor:
+    def encode(self, key: str, freq: str, candles: Union[np.array, torch.Tensor]) -> torch.Tensor:
         """Tokenize candles into tokens."""
+        _, pair = key.split(".") # Split the key into exchange and pair
+        meta = torch.tensor([257 + Tokenizer.letters.index(letter) for letter in pair.upper().ljust(self.pair_name_max_len)] + [257 + len(Tokenizer.letters) + Tokenizer.freqs.index(freq)], dtype=torch.uint16) # Encode the pair name and frequency
         if isinstance(candles, np.ndarray): # Wrap into a tensor
             candles = torch.tensor(candles)
         candles = (candles.view(torch.int32) << 1).view(torch.float32) # Erase the sign bit to fit the exponent into the first byte
-        if sys.byteorder == 'little':
+        if sys.byteorder == 'little': # On little-endian systems, we need to byteswap the data so that msb is first
            candles.untyped_storage().byteswap(torch.float32)
         buffer = candles.view(torch.uint8) # Interpret the data as bytes ("tokenization" step)
         buffer = buffer.view(-1).to(torch.uint16) # Flatten the data and convert to uint16 because otherwise <BOS> will overflow
-        buffer = torch.cat([torch.tensor([256], dtype=torch.uint16), buffer]) # Prepend <BOS> (Begin of Series) token
+        buffer = torch.cat([meta, torch.tensor([256], dtype=torch.uint16), buffer]) # Prepend the metadata and <BOS> (Begin of Series) token
         return buffer
 
-    def decode(self, tokens: torch.Tensor) -> torch.Tensor:
+    def decode(self, tokens: torch.Tensor) -> Tuple[Union[List[str], str], Union[List[str], str], torch.Tensor]:
         """Decode tokens into candles."""
         tokens = tokens.long()
-        candles_tokens = tokens[..., 1:] # Remove <BOS> token
+        meta_tokens, candles_tokens = tokens[..., :Tokenizer.meta_context_len], tokens[..., Tokenizer.meta_context_len + 1:] # Split off the metadata and remove the <BOS> token
         candles_tokens = candles_tokens.to(torch.uint8).view(*tokens.shape[:-1], -1, self.tokens_per_candle) # Convert back to uint8 and reshape
         candles_tokens = candles_tokens.view(torch.float32) # Interpret the data as floats
         if sys.byteorder == 'little': # On little-endian systems, we need to byteswap the data back
             # candles_tokens.untyped_storage().byteswap(torch.float32) # <-- This segfaults for some reason
             candles_tokens = candles_tokens.view(torch.uint8).view(*candles_tokens.shape, 4).flip(-1).view(torch.float32).squeeze(-1) # Workaround
         candles_tokens = -((candles_tokens.view(torch.int32) >> 1) | (1 << 31)).view(torch.float32) # Restore the sign bit
-        return candles_tokens
+        if meta_tokens.ndim == 1:
+            meta_tokens = meta_tokens.unsqueeze(0)
+            squeeze = True
+        else:
+            squeeze = False
+        pair_meta, freq_meta = meta_tokens[..., :-1], meta_tokens[..., -1] # Extract pair and frequency tokens
+        pairs = []
+        freqs = []
+        for i in range(len(pair_meta)):
+            pairs.append("".join(Tokenizer.letters[token - 257] for token in pair_meta[i].tolist()).rstrip(" "))
+            freqs.append(Tokenizer.freqs[freq_meta[i] - (257 + len(Tokenizer.letters))])
+        pair = pairs[0] if squeeze else pairs
+        freq = freqs[0] if squeeze else freqs
+        return pair, freq, candles_tokens
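The metadata block explains the new vocabulary arithmetic: token ids 0..255 are the raw bytes of the sign-stripped, big-endian float32 values, 256 is `<BOS>`, 257..293 cover the 37 pair-name characters, and 294..299 the six frequencies, giving a vocabulary of 300. The pair name is space-padded to 10 characters, so the metadata context is 11 tokens. A dependency-free sketch of that layout, using a hypothetical pair and mirroring the encode/decode offsets:

```python
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 "  # 37 allowed pair-name characters
freqs = ["1m", "5m", "30m", "2h", "8h", "1d"]      # freq2sec keys sorted by duration

# Token id layout: 0..255 raw bytes, 256 <BOS>, 257..293 letters, 294..299 frequencies.
assert 256 + 1 + len(letters) + len(freqs) == 300  # == Tokenizer.vocabulary_size

pair, freq = "BTCUSDT", "1m"                       # hypothetical pair and frequency
meta = [257 + letters.index(c) for c in pair.ljust(10)] \
     + [257 + len(letters) + freqs.index(freq)]
assert len(meta) == 11                             # == Tokenizer.meta_context_len

# Decoding reverses the offsets and strips the space padding:
name = "".join(letters[tok - 257] for tok in meta[:-1]).rstrip(" ")
assert name == pair and freqs[meta[-1] - (257 + len(letters))] == freq
```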
assets/candles_binance.BTCUSDT_1m.png
CHANGED

assets/candles_binance.BTCUSDT_8h.png
CHANGED

assets/candles_binance.DOGEUSDT_2h.png
CHANGED
ckpt.pt
CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:2c9e6255ab969e62fd9e2b4fc5422af7d85ecd6148b823ab48ffa9845a745c5a
+size 256702350
handler.py
CHANGED

@@ -48,13 +48,15 @@ class EndpointHandler:
         self.model(torch.randint(0, self.tokenizer.vocabulary_size, (1, self.config.block_size), device=self.device))
         print("Model ready ! ✅")
         # Precompute useful values
-        self.max_candles = self.config.block_size // self.tokenizer.tokens_per_candle
+        self.max_candles = (self.config.block_size - self.config.meta_size) // self.tokenizer.tokens_per_candle
 
     def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
         """
         Args:
             data (Dict[str, Any]):
-                inputs: Dict[str, List[float]] with keys:
+                inputs: Dict[str, Union[str, List[float]]] with keys:
+                    pair: Pair symbol
+                    frequency: Frequency of the time series (1m, 5m, 30m, 2h, 8h, 1d)
                     timestamps: Timestamps of the time series
                     open: Open prices
                     high: High prices
@@ -77,9 +79,13 @@
         # Unpack input data
         inputs = data.pop("inputs", data)
         # Validate the inputs
-        assert "timestamps" in inputs and "open" in inputs and "high" in inputs and "low" in inputs and "close" in inputs and "volume" in inputs, "Required keys: timestamps, open, high, low, close, volume"
-        assert isinstance(inputs["timestamps"], list) and isinstance(inputs["open"], list) and isinstance(inputs["high"], list) and isinstance(inputs["low"], list) and isinstance(inputs["close"], list) and isinstance(inputs["volume"], list), "Inputs must be lists"
+        assert "pair" in inputs and "frequency" in inputs and "timestamps" in inputs and "open" in inputs and "high" in inputs and "low" in inputs and "close" in inputs and "volume" in inputs, "Required keys: pair, frequency, timestamps, open, high, low, close, volume"
+        assert isinstance(inputs["pair"], str) and isinstance(inputs["frequency"], str) and isinstance(inputs["timestamps"], list) and isinstance(inputs["open"], list) and isinstance(inputs["high"], list) and isinstance(inputs["low"], list) and isinstance(inputs["close"], list) and isinstance(inputs["volume"], list), "Inputs must be lists"
+        assert inputs["frequency"] in ["1m", "5m", "30m", "2h", "8h", "1d"], "Invalid frequency"
         assert len(inputs["timestamps"]) == len(inputs["open"]) == len(inputs["high"]) == len(inputs["low"]) == len(inputs["close"]) == len(inputs["volume"]), "Inputs must have the same length"
+        pair, freq = inputs["pair"], inputs["frequency"]
+        pair = "".join(pair.split("/"))
+        pair = f"binance.{pair.upper()}" if "." not in pair else pair
         timestamps = torch.tensor(inputs["timestamps"])
         samples = torch.tensor([inputs["open"], inputs["high"], inputs["low"], inputs["close"], inputs["volume"]], dtype=torch.float32).T.contiguous()
         steps = data.pop("steps", 4)
@@ -94,7 +100,7 @@
             torch.cuda.manual_seed(seed)
         # Generate scenarios
         samples = samples[-self.max_candles + steps:] # Keep only the last candles that fit in the model's context
-        tokens = self.tokenizer.encode(samples) # Encode the samples into tokens
+        tokens = self.tokenizer.encode(pair, freq, samples) # Encode the samples into tokens
         tokens = tokens.to(self.device).unsqueeze(0).long() # Add a batch dimension
         with torch.no_grad(), self.ctx:
             for _ in range(steps * self.tokenizer.tokens_per_candle):
@@ -112,7 +118,8 @@
                 # append sampled index to the running sequence and continue
                 tokens = torch.cat((tokens, next_tokens), dim=1)
         # Decode the tokens back into samples
-        scenarios = self.tokenizer.decode(tokens)
+        _, _, scenarios = self.tokenizer.decode(tokens)
+        scenarios = scenarios[:, -steps:]
         print(f"Generated {n_scenarios} scenarios in {time.time() - t_start:.2f} seconds ⏱")
         return {
             "timestamps": (timestamps[-1] + torch.arange(1, steps+1) * torch.median(torch.diff(timestamps)).item()).tolist(),
@@ -131,6 +138,8 @@ if __name__ == "__main__":
        data = pd.read_csv(f)
    y = handler({
        "inputs": {
+           "pair": "binance.BTCUSDT",
+           "frequency": "1m",
            "timestamps": data[data.columns[0]].tolist(),
            "open": data[data.columns[1]].tolist(),
            "high": data[data.columns[2]].tolist(),
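For callers, the request shape gains two string fields. A minimal sketch of a payload for the updated endpoint (hypothetical numbers; `steps` is optional and defaults to 4):

```python
payload = {
    "inputs": {
        "pair": "BTC/USDT",     # "/" is stripped; "binance." is prefixed when no "." is given
        "frequency": "1m",      # one of: 1m, 5m, 30m, 2h, 8h, 1d
        "timestamps": [1700000000, 1700000060, 1700000120],  # hypothetical values
        "open":   [37000.0, 37010.0, 37005.0],
        "high":   [37020.0, 37015.0, 37030.0],
        "low":    [36990.0, 37000.0, 37001.0],
        "close":  [37010.0, 37005.0, 37025.0],
        "volume": [12.3, 9.8, 15.1],
    },
    "steps": 4,                 # optional: number of candles to forecast
}
# handler = EndpointHandler(...)  # construction arguments as in the repo's __main__
# y = handler(payload)            # -> {"timestamps": [...], ...} plus generated scenarios
```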