ik committed on
Commit
1cc592d
·
verified ·
1 Parent(s): f15136a

Upload averaged RVQ (2025-09-02)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ twi_semantic_tokens_min.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ language:
4
+ - tw
5
+ - ak
6
+ library_name: pytorch
7
+ tags:
8
+ - speechless
9
+ - rvq
10
+ - whisper
11
+ - twi
12
+ - akan
13
+ - vector-quantization
14
+ - semantic-tokens
15
+ ---
16
+
17
+ # Speechless TWI — Stage 1 (RVQ for Whisper Encoder)
18
+
19
+ Trained RVQ that discretizes Whisper encoder features into semantic tokens for **Twi/Akan**.
20
+
21
+ ## Files
22
+ - `rvq_final.pt` — state dict
23
+ - `config_stage1.json` — training/config params
24
+ - `rvq_wrapper.py` — tiny module defining `RVQWrapper`
25
+
26
+ ## Usage (example)
27
+ ```python
28
+ import torch, json
29
+ from huggingface_hub import hf_hub_download
30
+ from rvq_wrapper import RVQWrapper
31
+
32
+ cfg = json.load(open(hf_hub_download("ik/speechless-twi-stage1-rvq-whisper-medium", "config_stage1.json"), "r"))
33
+ ckpt = torch.load(hf_hub_download("ik/speechless-twi-stage1-rvq-whisper-medium", "rvq_final.pt"), map_location="cpu")
34
+
35
+ rvq = RVQWrapper(cfg["rvq_dim"], cfg["rvq_num_quantizers"], cfg["rvq_codebook_size"])
36
+ rvq.load_state_dict(ckpt["rvq"])
37
+ rvq.eval()
38
+ ```
config_stage1.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cv_version": "fsicoli/twi",
3
+ "cv_lang": "tw",
4
+ "split_train": "train",
5
+ "split_eval": "validation",
6
+ "sample_rate": 16000,
7
+ "max_audio_seconds": 30.0,
8
+ "whisper_ckpt": "femursmith/intermediate-asr-ashanti-twi",
9
+ "batch_size": 2,
10
+ "num_workers": 0,
11
+ "rvq_dim": 1280,
12
+ "rvq_num_quantizers": 12,
13
+ "rvq_codebook_size": 2048,
14
+ "rvq_commitment_weight": 0.5,
15
+ "lr": 0.0003,
16
+ "epochs": 3,
17
+ "warmup_steps": 200,
18
+ "save_every": 1000,
19
+ "ckpt_file": "rvq_averaged.pt",
20
+ "ckpt_note": "averaged from multiple checkpoints"
21
+ }
rvq_averaged.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88826673b566b95f637e478b66ff6243bce7b9c374b3d00df99e55ac7ad56213
3
+ size 264990123
rvq_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a3d66420f6aa832bb656a820e81b10fa87717aec738202670ac99181886c43b
3
+ size 264993023
rvq_final.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44789a82b97a1636437c097340141b3a6470352e7d486e04d3b068dce98d9c21
3
+ size 264993082
rvq_step1000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:798b246a2623d74e4054979e4c3f5465e8a5bd6d544fe35c34c721a42628bca8
3
+ size 264993259
rvq_step2000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f89dee7a6f40c0790455252d1d31264769b1debef6b5616d75624e998a8fd6b7
3
+ size 264993259
rvq_step3000.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d154b844a11b5fea028aa664a0644f1af8e616cfdc6e34f0d45c05c32d3d2ffe
3
+ size 264993259
rvq_wrapper.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Thin wrapper around a residual vector quantizer with learned in/out projections."""
import torch
import torch.nn as nn
from vector_quantize_pytorch import ResidualVQ


class RVQWrapper(nn.Module):
    """Normalize and project features, residual-vector-quantize, project back.

    Attribute names (``ln_in``, ``proj_in``, ``rvq``, ``ln_out``, ``proj_out``)
    are part of the checkpoint contract — published state dicts are loaded by
    these keys, so they must not be renamed.
    """

    def __init__(self, dim, num_quantizers, codebook_size):
        super().__init__()
        # Pre-quantization: LayerNorm followed by a same-dimension linear map.
        self.ln_in = nn.LayerNorm(dim)
        self.proj_in = nn.Linear(dim, dim)
        # Residual VQ stack: `num_quantizers` codebooks of `codebook_size` entries.
        self.rvq = ResidualVQ(dim=dim, num_quantizers=num_quantizers, codebook_size=codebook_size)
        # Post-quantization: LayerNorm followed by a same-dimension linear map.
        self.ln_out = nn.LayerNorm(dim)
        self.proj_out = nn.Linear(dim, dim)

    def forward(self, x):
        """Quantize *x* and return (output features, code indices, commitment loss)."""
        projected = self.proj_in(self.ln_in(x))
        quantized, codes, commit_loss = self.rvq(projected)
        out = self.proj_out(self.ln_out(quantized))
        return out, codes, commit_loss
twi_semantic_tokens_min.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c094d7e334c0da342bac1470459f7e4fb364ff1a9c97e6795a8b81fcc7194be7
3
+ size 11269340