Commit 8db7cd3 · eustlb (HF Staff) committed
Parent: 5be685f

transformers integration
README.md CHANGED
@@ -51,6 +51,78 @@ The predicted text token timestamps can be recovered by subtracting the model's
 
 See the [GitHub repository](https://github.com/kyutai-labs/delayed-streams-modeling/).
 
+ ### Use with transformers
+
+ Install transformers from source:
+ ```bash
+ pip install git+https://github.com/huggingface/transformers
+ ```
+
+ Inference:
+ ```python
+ import torch
+ from datasets import load_dataset, Audio
+ from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
+
+ # 1. load the model and the processor
+ torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+ model_id = "kyutai/stt-2.6b-en_fr"
+
+ processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
+ model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+
+ # 2. load audio samples
+ ds = load_dataset(
+     "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+ )
+ ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+
+ # 3. prepare the model inputs
+ inputs = processor(
+     ds[0]["audio"]["array"],
+ )
+ inputs = inputs.to(torch_device)
+
+ # 4. infer the model
+ output_tokens = model.generate(**inputs)
+
+ # 5. decode the generated tokens
+ print(processor.batch_decode(output_tokens, skip_special_tokens=True))
+ ```
+
+ Batched inference:
+ ```python
+ import torch
+ from datasets import load_dataset, Audio
+ from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration
+
+ # 1. load the model and the processor
+ torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+ model_id = "kyutai/stt-2.6b-en_fr"
+
+ processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
+ model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
+
+ # 2. load audio samples
+ ds = load_dataset(
+     "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
+ )
+ ds = ds.cast_column("audio", Audio(sampling_rate=24000))
+
+ # 3. prepare the model inputs
+ audio_arrays = [ds[i]["audio"]["array"] for i in range(4)]
+ inputs = processor(audio_arrays, return_tensors="pt", padding=True)
+ inputs = inputs.to(torch_device)
+
+ # 4. infer the model
+ output_tokens = model.generate(**inputs)
+
+ # 5. decode the generated tokens
+ decoded_outputs = processor.batch_decode(output_tokens, skip_special_tokens=True)
+ for output in decoded_outputs:
+     print(output)
+ ```
+
 ## Training Details
 
 ### Training Data
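The snippets above pull audio from a dummy LibriSpeech dataset. As a quick sanity check against a local recording, a minimal sketch follows; it reuses the processor and model exactly as in the README, while the file path and the `torchaudio` resampling step are illustrative assumptions, not part of the committed card.

```python
import torch
import torchaudio
from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration

torch_device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "kyutai/stt-2.6b-en_fr"

processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)

# load a local recording (hypothetical path) and collapse to mono
waveform, sr = torchaudio.load("my_recording.wav")
waveform = waveform.mean(dim=0)

# the feature extractor expects 24 kHz audio (see preprocessor_config.json below)
if sr != 24000:
    waveform = torchaudio.functional.resample(waveform, sr, 24000)

inputs = processor(waveform.numpy(), return_tensors="pt", padding=True)
inputs = inputs.to(torch_device)

output_tokens = model.generate(**inputs)
print(processor.batch_decode(output_tokens, skip_special_tokens=True)[0])
```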
config.json CHANGED
@@ -75,5 +75,80 @@
   },
   "model_type": "stt",
   "mimi_name": "[email protected]",
- "tokenizer_name": "tokenizer_en_fr_audio_8000.model"
+ "tokenizer_name": "tokenizer_en_fr_audio_8000.model",
+ "architectures": [
+   "KyutaiSpeechToTextForConditionalGeneration"
+ ],
+ "attention_dropout": 0.0,
+ "audio_bos_token_id": 2048,
+ "audio_pad_token_id": 69569,
+ "bos_token_id": 48000,
+ "codebook_vocab_size": 2049,
+ "codec_config": {
+   "_frame_rate": null,
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "audio_channels": 1,
+   "codebook_dim": 256,
+   "codebook_size": 2048,
+   "compress": 2,
+   "dilation_growth_rate": 2,
+   "head_dim": 64,
+   "hidden_act": "gelu",
+   "hidden_size": 512,
+   "initializer_range": 0.02,
+   "intermediate_size": 2048,
+   "kernel_size": 7,
+   "last_kernel_size": 3,
+   "layer_scale_initial_scale": 0.01,
+   "max_position_embeddings": 8000,
+   "model_type": "mimi",
+   "norm_eps": 1e-05,
+   "num_attention_heads": 8,
+   "num_filters": 64,
+   "num_hidden_layers": 8,
+   "num_key_value_heads": 8,
+   "num_quantizers": 32,
+   "num_residual_layers": 1,
+   "num_semantic_quantizers": 1,
+   "pad_mode": "constant",
+   "residual_kernel_size": 3,
+   "rope_theta": 10000.0,
+   "sampling_rate": 24000,
+   "sliding_window": 250,
+   "trim_right_ratio": 1.0,
+   "upsample_groups": 512,
+   "upsampling_ratios": [
+     8,
+     6,
+     5,
+     4
+   ],
+   "use_cache": false,
+   "use_causal_conv": true,
+   "use_conv_shortcut": false,
+   "use_streaming": false,
+   "vector_quantization_hidden_dimension": 256
+ },
+ "ffn_dim": 11264,
+ "frame_size": 1920,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 2048,
+ "initializer_range": 0.02,
+ "max_position_embeddings": 375,
+ "num_attention_heads": 16,
+ "num_codebooks": 32,
+ "num_hidden_layers": 16,
+ "num_key_value_heads": 16,
+ "pad_token_id": 3,
+ "rms_norm_eps": 1e-08,
+ "rope_theta": 100000.0,
+ "sliding_window": 375,
+ "tie_word_embeddings": false,
+ "torch_dtype": "bfloat16",
+ "transformers_version": "4.53.0.dev0",
+ "use_cache": true,
+ "vocab_size": 8001,
+ "transformers_weights": "transformers.safetensors"
 }
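With `architectures`, `model_type`, and the nested `codec_config` now in place, the checkpoint's configuration resolves through the transformers Auto classes. A small sketch for inspecting the new fields; it assumes the source install from the README and that the nested Mimi codec config is exposed as a `codec_config` attribute:

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("kyutai/stt-2.6b-en_fr")

print(config.architectures)               # ['KyutaiSpeechToTextForConditionalGeneration']
print(config.num_codebooks)               # 32
print(config.codec_config.model_type)     # 'mimi'
print(config.codec_config.sampling_rate)  # 24000
```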
generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "audio_window_size": 1,
+   "bos_token_id": 48000,
+   "cache_implementation": "sliding_window",
+   "codec_cache_implementation": "sliding_window",
+   "pad_token_id": 3,
+   "transformers_version": "4.53.0.dev0"
+ }
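These defaults are what `model.generate(**inputs)` in the README picks up implicitly: a sliding-window cache for both the text and codec streams, plus the bos/pad token ids. A sketch of loading and overriding them explicitly, assuming the standard `GenerationConfig` API:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("kyutai/stt-2.6b-en_fr")
print(gen_config.cache_implementation)  # 'sliding_window'
print(gen_config.bos_token_id)          # 48000

# the loaded defaults can be overridden per call:
# model.generate(**inputs, generation_config=gen_config)
```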
preprocessor_config.json ADDED
@@ -0,0 +1,13 @@
+ {
+   "audio_delay_seconds": 0.5,
+   "audio_silence_prefix_seconds": 0.0,
+   "chunk_length_s": null,
+   "feature_extractor_type": "KyutaiSpeechToTextFeatureExtractor",
+   "feature_size": 1,
+   "overlap": null,
+   "padding_side": "right",
+   "padding_value": 0.0,
+   "processor_class": "KyutaiSpeechToTextProcessor",
+   "return_attention_mask": true,
+   "sampling_rate": 24000
+ }
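These numbers line up with the model config: at a 24 kHz sampling rate and a `frame_size` of 1920 samples (from config.json above), each codec frame covers 80 ms, so the 0.5 s `audio_delay_seconds` spans 6.25 frames of trailing audio that let the delayed text stream catch up. A worked check, plain arithmetic only; the extractor's internal padding logic may differ in detail:

```python
sampling_rate = 24000        # preprocessor_config.json
frame_size = 1920            # config.json
audio_delay_seconds = 0.5    # preprocessor_config.json

frame_ms = 1000 * frame_size / sampling_rate
delay_frames = audio_delay_seconds * sampling_rate / frame_size
print(frame_ms)      # 80.0 -> milliseconds of audio per codec frame
print(delay_frames)  # 6.25 -> codec frames spanned by the 0.5 s delay
```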
special_tokens_map.json ADDED
@@ -0,0 +1,3 @@
+ {
+   "unk_token": "<unk>"
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,50 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token_id": null,
+   "chat_template": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token_id": null,
+   "extra_special_tokens": {},
+   "model_input_names": [
+     "input_ids",
+     "attention_mask"
+   ],
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token_id": null,
+   "processor_class": "KyutaiSpeechToTextProcessor",
+   "tokenizer_class": "PreTrainedTokenizerFast",
+   "unk_token": "<unk>"
+ }
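The four entries in `added_tokens_decoder` (ids 0-3) are the only special tokens, and id 3 (`<pad>`) matches `pad_token_id` in config.json and generation_config.json. A quick check with the standard `AutoTokenizer` API, assuming the files above resolve through it:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("kyutai/stt-2.6b-en_fr")
print(tok.unk_token, tok.unk_token_id)          # <unk> 0
print(tok.convert_ids_to_tokens([0, 1, 2, 3]))  # ['<unk>', '<s>', '</s>', '<pad>']
```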
transformers.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59a6da960020ea4e1436118c63d7cd73f013e0c62466de12d7ad4b64454fd035
+ size 2697201444
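This is a git-LFS pointer: the ~2.7 GB weight file it stands for is what `"transformers_weights": "transformers.safetensors"` in config.json refers to. With a local clone that has the LFS object materialized, the tensor names can be listed lazily without loading the weights, assuming the standard `safetensors` API:

```python
from safetensors import safe_open

# requires the actual LFS object, not the pointer file
with safe_open("transformers.safetensors", framework="pt") as f:
    names = list(f.keys())

print(len(names), names[:3])
```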