Update model (ChatTS-14B-0801) for better reasoning and Chinese QA ability.
This new model introduces minor bug fixes and code changes. We added a position_embedding to the TimeSeriesEmbedding module for a better representation of positions within each time series. We also retrained the model with additional reasoning and Chinese entries in the training dataset to improve its reasoning and Chinese capabilities.
If you want to reproduce the results in the paper, please download the old version of ChatTS-14B. Compared with the old model, this new model achieves nearly identical evaluation results on categorical metrics and better results on statistical and reasoning metrics.
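For reference, a minimal loading sketch (not part of this commit): the local path is illustrative, and passing the tokenizer into the processor is an assumption about Qwen2TSProcessor's constructor. The processor is resolved through the `auto_map` entries added to `processor_config.json` and `tokenizer_config.json` in this commit, so `trust_remote_code=True` is required.

```python
from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer

MODEL_PATH = "./ChatTS-14B"  # illustrative path to this checkpoint

# trust_remote_code lets transformers load Qwen2TSForCausalLM (modeling_qwen2.py)
# and Qwen2TSProcessor (processing_qwen2_ts.py) shipped with this repo.
# To reproduce the paper numbers, load the previous checkpoint instead
# (e.g. by pinning an earlier revision via the `revision` argument).
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, trust_remote_code=True, device_map="auto", torch_dtype="float16"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
processor = AutoProcessor.from_pretrained(
    MODEL_PATH, trust_remote_code=True, tokenizer=tokenizer  # tokenizer kwarg is an assumption
)
```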
- config.json +8 -5
- configuration_qwen2.py +1 -5
- generation_config.json +1 -1
- modeling_qwen2.py +110 -13
- processing_qwen2_ts.py +1 -1
- processor_config.json +6 -0
- pytorch_model-00001-of-00006.bin +1 -1
- pytorch_model-00002-of-00006.bin +1 -1
- pytorch_model-00003-of-00006.bin +1 -1
- pytorch_model-00004-of-00006.bin +1 -1
- pytorch_model-00005-of-00006.bin +1 -1
- pytorch_model-00006-of-00006.bin +2 -2
- pytorch_model.bin.index.json +3 -2
- tokenizer_config.json +5 -1
config.json
CHANGED
@@ -1,5 +1,4 @@
 {
-  "_name_or_path": "chatts_release",
   "architectures": [
     "Qwen2TSForCausalLM"
   ],
@@ -19,7 +18,7 @@
   "intermediate_size": 13824,
   "max_position_embeddings": 32768,
   "max_window_layers": 70,
-  "model_type": "
+  "model_type": "qwen2",
   "num_attention_heads": 40,
   "num_hidden_layers": 48,
   "num_key_value_heads": 8,
@@ -29,13 +28,17 @@
   "sliding_window": 131072,
   "tie_word_embeddings": false,
   "torch_dtype": "float16",
-  "transformers_version": "4.
+  "transformers_version": "4.52.4",
   "ts": {
+    "embedding_dim": 16,
     "hidden_size": 5120,
+    "max_length": 32768,
+    "max_sequence_length": 32768,
     "num_features": 2,
     "num_layers": 5,
-    "patch_size": 
-    "
+    "patch_size": 8,
+    "use_position_embedding": true,
+    "use_position_idx": false
   },
   "ts_token_end_index": 151666,
   "ts_token_start_index": 151665,
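For orientation, a small sketch (not from the repo) of what the new `ts` settings imply for the encoder's per-patch input width; the formula mirrors the `input_size` selection in `TimeSeriesEmbedding.__init__` in `modeling_qwen2.py` below.

```python
# Values from the updated "ts" section of config.json.
patch_size = 8
embedding_dim = 16
use_position_embedding = True

# Each patch contributes patch_size raw values, plus patch_size position
# embeddings of width embedding_dim when use_position_embedding is enabled.
if use_position_embedding:
    input_size = 1 * patch_size + embedding_dim * patch_size  # 8 + 128 = 136
else:
    input_size = 1 * patch_size  # 8

print(input_size)  # 136
```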
configuration_qwen2.py
CHANGED
@@ -1,5 +1,4 @@
 # coding=utf-8
-# The following code are reused from the QWen project (https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) of Alibaba Cloud.
 # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,9 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-# The code is modified by ByteDance and Tsinghua University from the original implementation of Qwen:
-# - We changed Qwen2Config to Qwen2TSConfig to support time series modeling.
 """ Qwen2 model configuration"""
 
 from transformers import PretrainedConfig
@@ -93,7 +89,7 @@ class Qwen2TSConfig(PretrainedConfig):
     >>> configuration = model.config
     ```"""
 
-    model_type = "
+    model_type = "qwen2"
     keys_to_ignore_at_inference = ["past_key_values"]
 
     def __init__(
generation_config.json
CHANGED
@@ -10,5 +10,5 @@
   "temperature": 0.7,
   "top_k": 20,
   "top_p": 0.8,
-  "transformers_version": "4.
+  "transformers_version": "4.52.4"
 }
modeling_qwen2.py
CHANGED
@@ -106,6 +106,7 @@ class Qwen2TSCausalLMOutputWithPast(ModelOutput):
     attentions: Optional[Tuple[torch.FloatTensor]] = None
     attention_mask: Optional[torch.FloatTensor] = None
 
+
 ########################Naive TS Embedding#####################
 class TimeSeriesEmbedding(nn.Module):
     def __init__(self, config):
@@ -114,10 +115,23 @@ class TimeSeriesEmbedding(nn.Module):
         self.num_layers = config['num_layers']
         self.hidden_size = config['hidden_size']
         self.num_features = config['num_features']
-
+        self.max_sequence_length = config['max_sequence_length']  # Maximum time series length
+        self.use_position_embedding = config.get('use_position_embedding', False)
+        self.use_position_idx = config.get('use_position_idx', False)
+        self.embedding_dim = config.get('embedding_dim', 16)  # Embedding dimension
+
+        if self.use_position_embedding:
+            # Extended vocabulary: [0, max_sequence_length) for real positions, max_sequence_length for padding
+            self.position_embedding = nn.Embedding(self.max_sequence_length + 1, self.embedding_dim)
+            self.padding_idx = self.max_sequence_length  # Special index for padding
+            input_size = 1 * self.patch_size + self.embedding_dim * self.patch_size
+        elif self.use_position_idx:
+            input_size = 2 * self.patch_size
+        else:
+            input_size = 1 * self.patch_size
+
+        # Build MLP layers
         layers = []
-        input_size = 1 * self.patch_size
-
         for _ in range(self.num_layers - 1):
             layers.append(nn.Linear(input_size, self.hidden_size))
             layers.append(nn.GELU())
@@ -130,30 +144,100 @@ class TimeSeriesEmbedding(nn.Module):
         batch_size = x.size(0)
         x = x.reshape(batch_size, -1, self.num_features)
 
+        # Extract mask and calculate valid lengths
         mask = x[:, :, -1].long()
-        valid_lengths = mask.sum(dim=1).long()
-
-        patch_cnt = (valid_lengths + self.patch_size - 1) // self.patch_size  # round up
+        valid_lengths = mask.sum(dim=1).long()
+        patch_cnt = (valid_lengths + self.patch_size - 1) // self.patch_size
 
         patches_list = []
+        # Collect position indices for batch embedding lookup
+        all_position_indices = []
+        patch_info_list = []  # Store metadata for each patch group
+
         for i in range(batch_size):
            vl = valid_lengths[i].item()
            pc = patch_cnt[i].item()
            if pc == 0:
                continue
-
+
+            # Extract time series data (excluding mask)
+            xi = x[i, :vl, :1]  # Time-series data
            total_padded_length = pc * self.patch_size
            padding_length = total_padded_length - vl
+
+            # Create position indices: real positions for actual data, special index for padding
+            position_indices = torch.arange(vl, device=x.device)
+
            if padding_length > 0:
-
+                # Pad with last value
+                last_value = xi[-1:, :]
+                padding = last_value.repeat(padding_length, 1)
                xi = torch.cat([xi, padding], dim=0)
-
-
-
+
+                # Use special padding index for padding positions
+                padding_positions = torch.full((padding_length,), self.padding_idx, device=x.device)
+                position_indices = torch.cat([position_indices, padding_positions], dim=0)
+
+            # Reshape to patches
+            xi = xi.reshape(pc, self.patch_size)  # (num_patches, patch_size)
+            position_indices = position_indices.reshape(pc, self.patch_size)  # (num_patches, patch_size)
+
+            if self.use_position_embedding:
+                # Collect position indices instead of calling embedding immediately
+                all_position_indices.append(position_indices)
+                patch_info_list.append({
+                    'xi': xi,
+                    'pc': pc,
+                    'sample_idx': i
+                })
+            elif self.use_position_idx:
+                # Normalize position indices
+                pos_indices = torch.arange(vl, device=x.device).unsqueeze(1)
+                pos_indices = pos_indices / max(1, valid_lengths.max().item() - 1)
+                if padding_length > 0:
+                    # Use -1 for padding positions
+                    padding_indices = torch.full((padding_length, 1), -1, device=x.device)
+                    pos_indices = torch.cat([pos_indices, padding_indices], dim=0)
+                # Combine time series data with position indices
+                xi_combined = torch.cat([xi.reshape(-1, 1), pos_indices], dim=1)
+                patch_input = xi_combined.reshape(pc, self.patch_size * 2)
+                patches_list.append(patch_input)
+            else:
+                # No position embedding, use raw patches
+                patch_input = xi
+                patches_list.append(patch_input)
+
+        # Batch process position embeddings if needed
+        if self.use_position_embedding and all_position_indices:
+            # Concatenate all position indices for batch embedding lookup
+            batch_position_indices = torch.cat(all_position_indices, dim=0)
+            # print(f"{x.shape=}, {x.device=}, {len(all_position_indices)=}, {batch_position_indices=}")
+            batch_pos_emb = self.position_embedding(batch_position_indices)  # Single embedding call
+
+            # Split embeddings back and create patch inputs
+            emb_start_idx = 0
+            for patch_info in patch_info_list:
+                xi = patch_info['xi']
+                pc = patch_info['pc']
+
+                # Extract corresponding embeddings
+                pos_emb = batch_pos_emb[emb_start_idx:emb_start_idx + pc]
+                emb_start_idx += pc
+
+                # Flatten and concatenate
+                xi = xi.unsqueeze(-1)  # (num_patches, patch_size, 1)
+                patch_input = torch.cat([
+                    xi.flatten(1),       # (num_patches, patch_size)
+                    pos_emb.flatten(1)   # (num_patches, patch_size * embedding_dim)
+                ], dim=1)
+                patches_list.append(patch_input)
+
+        # Process all patches through MLP
        if patches_list:
-            x_patches = torch.cat(patches_list, dim=0)
+            x_patches = torch.cat(patches_list, dim=0)
            x = self.mlp(x_patches)
        else:
+            # Handle empty case
            x = torch.empty(0, self.hidden_size, device=x.device)
 
        return x, patch_cnt
@@ -1389,6 +1473,17 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
+
+        # if input_ids is not None and timeseries is not None:
+        #     # Batch decode the input
+        #     input_text = self.tokenizer.batch_decode(input_ids, skip_special_tokens=False)
+        #     # Print the input text
+        #     print("=================================================================")
+        #     print("Input text:", input_text)
+        #     print("Timeseries shape:", timeseries.shape)
+        #     print("=================================================================\n\n")
+        # else:
+        #     print("Time series is None!!!!")
 
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -1401,12 +1496,14 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
 
        if timeseries is not None and timeseries.shape[0] > 0:
            # use_cache = False
+            # print(f"timeseries shape: {timeseries.shape=}, {input_ids.shape=}")
            ts_features, patch_cnt = self.ts_encoder(timeseries)
            inputs_embeds = inputs_embeds.to(ts_features.dtype)
 
            inputs_embeds, attention_mask, position_ids, labels = self._merge_input_ids_with_time_series_features(
                ts_features, inputs_embeds, input_ids, attention_mask, labels, patch_cnt
            )
+            # print(f"{inputs_embeds.shape=}, {attention_mask.shape=}, {position_ids.shape=}, {labels.shape=}")
 
            outputs = self.model(
                attention_mask=attention_mask,
@@ -1700,4 +1797,4 @@ class Qwen2ForSequenceClassification(Qwen2PreTrainedModel):
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
-        )
+        )
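A quick illustration of the patching logic above, using toy numbers rather than repo code: a valid length of 20 with patch_size=8 yields ceil(20/8)=3 patches, 4 padded steps that repeat the last value, and position indices that use the reserved padding index (max_sequence_length) for the padded slots.

```python
import torch

patch_size = 8
max_sequence_length = 32768          # from the "ts" section of config.json
padding_idx = max_sequence_length    # special index reserved for padding

vl = 20                                      # toy valid length
pc = (vl + patch_size - 1) // patch_size     # ceil division -> 3 patches
padding_length = pc * patch_size - vl        # 4 padded steps

position_indices = torch.arange(vl)
position_indices = torch.cat(
    [position_indices, torch.full((padding_length,), padding_idx)]
)
print(pc, padding_length, position_indices.reshape(pc, patch_size).shape)
# 3 4 torch.Size([3, 8])
```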
processing_qwen2_ts.py
CHANGED
@@ -41,7 +41,7 @@ def sp_encoding(timeseries: np.ndarray, eots_token: bool = True) -> Tuple[np.nda
     scale_factor = np.max(np.abs(scaled_timeseries)) / 3.0
     scaled_timeseries /= scale_factor
 
-    prompt = f"[
+    prompt = f"[offset={-mean:.4f}|scaling={scale_factor:.4f}|length={len(timeseries)}|max={max(timeseries):.4f}|min={min(timeseries):.4f}|left={timeseries[0]:.4f}|right={timeseries[-1]:.4f}]<ts>"
     if eots_token:
         prompt += '<ts/>'
 
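A toy illustration of the value-preserving prompt that sp_encoding now emits. The mean/scale handling below is a simplified stand-in (the real normalization happens in the unchanged part of the function); it is only meant to show the string format.

```python
import numpy as np

timeseries = np.array([1.0, 2.0, 4.0, 8.0, 4.0, 2.0])

# Simplified stand-ins for the normalization done earlier in sp_encoding (assumption).
mean = timeseries.mean()
scaled_timeseries = timeseries - mean
scale_factor = np.max(np.abs(scaled_timeseries)) / 3.0

prompt = (f"[offset={-mean:.4f}|scaling={scale_factor:.4f}|length={len(timeseries)}|"
          f"max={max(timeseries):.4f}|min={min(timeseries):.4f}|"
          f"left={timeseries[0]:.4f}|right={timeseries[-1]:.4f}]<ts>")
print(prompt)
# [offset=-3.5000|scaling=1.5000|length=6|max=8.0000|min=1.0000|left=1.0000|right=2.0000]<ts>
```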
processor_config.json
ADDED
@@ -0,0 +1,6 @@
+{
+  "auto_map": {
+    "AutoProcessor": "processing_qwen2_ts.Qwen2TSProcessor"
+  },
+  "processor_class": "Qwen2TSProcessor"
+}
pytorch_model-00001-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:9fe54d5f99a2398419df0b9f4f8c48f1db148b3a074b28024569751fa99780a6
 size 4986229446
pytorch_model-00002-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:aab4c20520e32cfce07b6b860c8a72b1ef11e02f187fbf3d054ebee67a2c67bd
 size 4954871698
pytorch_model-00003-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c46a806289c2b53ab2158a06c3c2d6fb99a8be5e6061e5434976c79129df29d3
 size 4954871762
pytorch_model-00004-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e038cc5ed0ba96941ae800272311084affa493262cde6a20fad1b6ad8a253610
 size 4954871762
pytorch_model-00005-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:db8502c0a92a3545639afbb260c529efcd0733f5b972163462114c08dc5386dd
 size 4954871762
pytorch_model-00006-of-00006.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size 
+oid sha256:1b28791c5b49a4899902a7e079460d7a41e4377214ad18f6f5b88b1d53a40394
+size 4946759662
pytorch_model.bin.index.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "metadata": {
-    "total_size": 
+    "total_size": 29752274976
   },
   "weight_map": {
     "lm_head.weight": "pytorch_model-00006-of-00006.bin",
@@ -591,6 +591,7 @@
     "ts_encoder.mlp.6.bias": "pytorch_model-00006-of-00006.bin",
     "ts_encoder.mlp.6.weight": "pytorch_model-00006-of-00006.bin",
     "ts_encoder.mlp.8.bias": "pytorch_model-00006-of-00006.bin",
-    "ts_encoder.mlp.8.weight": "pytorch_model-00006-of-00006.bin"
+    "ts_encoder.mlp.8.weight": "pytorch_model-00006-of-00006.bin",
+    "ts_encoder.position_embedding.weight": "pytorch_model-00006-of-00006.bin"
   }
 }
tokenizer_config.json
CHANGED
@@ -199,14 +199,18 @@
     "<ts>",
     "<ts/>"
   ],
+  "auto_map": {
+    "AutoProcessor": "processing_qwen2_ts.Qwen2TSProcessor"
+  },
   "bos_token": null,
-  "chat_template": "{% set system_message = 'You are a helpful assistant.' %}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\n' }}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
+  "extra_special_tokens": {},
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
+  "processor_class": "Qwen2TSProcessor",
   "split_special_tokens": false,
   "tokenizer_class": "Qwen2Tokenizer",
   "unk_token": null