Spaces:
Sleeping
Sleeping
update
Browse files- toolbox/k2_sherpa/examples.py +32 -0
- toolbox/k2_sherpa/nn_models.py +119 -53
toolbox/k2_sherpa/examples.py
CHANGED
@@ -19,4 +19,36 @@ examples = [
|
|
19 |
"Yes",
|
20 |
"./data/test_wavs/librispeech/1089-134686-0001.wav",
|
21 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
]
|
|
|
19 |
"Yes",
|
20 |
"./data/test_wavs/librispeech/1089-134686-0001.wav",
|
21 |
],
|
22 |
+
[
|
23 |
+
"Chinese+English",
|
24 |
+
"csukuangfj/sherpa-onnx-streaming-zipformer-bilingual-zh-en-2023-02-20",
|
25 |
+
"greedy_search",
|
26 |
+
4,
|
27 |
+
"Yes",
|
28 |
+
"./data/test_wavs/tal_csasr/0.wav",
|
29 |
+
],
|
30 |
+
[
|
31 |
+
"Chinese+English+Cantonese",
|
32 |
+
"csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en",
|
33 |
+
"greedy_search",
|
34 |
+
4,
|
35 |
+
"Yes",
|
36 |
+
"./data/test_wavs/cantonese/2.wav",
|
37 |
+
],
|
38 |
+
[
|
39 |
+
"Cantonese",
|
40 |
+
"zrjin/icefall-asr-mdcc-zipformer-2024-03-11",
|
41 |
+
"greedy_search",
|
42 |
+
4,
|
43 |
+
"Yes",
|
44 |
+
"./data/test_wavs/cantonese/1.wav",
|
45 |
+
],
|
46 |
+
[
|
47 |
+
"Tibetan",
|
48 |
+
"syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
|
49 |
+
"greedy_search",
|
50 |
+
4,
|
51 |
+
"No",
|
52 |
+
"./data/test_wavs/tibetan/a_0_cacm-A70_31117.wav",
|
53 |
+
],
|
54 |
]
|
toolbox/k2_sherpa/nn_models.py
CHANGED
@@ -65,7 +65,7 @@ model_map = {
|
|
65 |
"joiner_model_file_sub_folder": ".",
|
66 |
"tokens_file": "tokens.txt",
|
67 |
"tokens_file_sub_folder": ".",
|
68 |
-
"loader": "
|
69 |
},
|
70 |
{
|
71 |
"repo_id": "zrjin/icefall-asr-aishell-zipformer-large-2023-10-24",
|
@@ -77,7 +77,7 @@ model_map = {
|
|
77 |
"joiner_model_file_sub_folder": "exp",
|
78 |
"tokens_file": "tokens.txt",
|
79 |
"tokens_file_sub_folder": "data/lang_char",
|
80 |
-
"loader": "
|
81 |
},
|
82 |
{
|
83 |
"repo_id": "zrjin/icefall-asr-aishell-zipformer-small-2023-10-24",
|
@@ -89,7 +89,7 @@ model_map = {
|
|
89 |
"joiner_model_file_sub_folder": "exp",
|
90 |
"tokens_file": "tokens.txt",
|
91 |
"tokens_file_sub_folder": "data/lang_char",
|
92 |
-
"loader": "
|
93 |
},
|
94 |
{
|
95 |
"repo_id": "zrjin/icefall-asr-aishell-zipformer-2023-10-24",
|
@@ -101,7 +101,7 @@ model_map = {
|
|
101 |
"joiner_model_file_sub_folder": "exp",
|
102 |
"tokens_file": "tokens.txt",
|
103 |
"tokens_file_sub_folder": "data/lang_char",
|
104 |
-
"loader": "
|
105 |
},
|
106 |
{
|
107 |
"repo_id": "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
|
@@ -158,7 +158,7 @@ model_map = {
|
|
158 |
"decoder_model_file_sub_folder": ".",
|
159 |
"tokens_file": "tiny.en-tokens.txt",
|
160 |
"tokens_file_sub_folder": ".",
|
161 |
-
"loader": "
|
162 |
},
|
163 |
{
|
164 |
"repo_id": "csukuangfj/sherpa-onnx-whisper-base.en",
|
@@ -168,7 +168,7 @@ model_map = {
|
|
168 |
"decoder_model_file_sub_folder": ".",
|
169 |
"tokens_file": "base.en-tokens.txt",
|
170 |
"tokens_file_sub_folder": ".",
|
171 |
-
"loader": "
|
172 |
},
|
173 |
{
|
174 |
"repo_id": "csukuangfj/sherpa-onnx-whisper-small.en",
|
@@ -178,7 +178,7 @@ model_map = {
|
|
178 |
"decoder_model_file_sub_folder": ".",
|
179 |
"tokens_file": "small.en-tokens.txt",
|
180 |
"tokens_file_sub_folder": ".",
|
181 |
-
"loader": "
|
182 |
},
|
183 |
{
|
184 |
"repo_id": "csukuangfj/sherpa-onnx-paraformer-en-2024-03-09",
|
@@ -198,7 +198,7 @@ model_map = {
|
|
198 |
"joiner_model_file_sub_folder": "exp",
|
199 |
"tokens_file": "tokens.txt",
|
200 |
"tokens_file_sub_folder": "data/lang_bpe_500",
|
201 |
-
"loader": "
|
202 |
},
|
203 |
{
|
204 |
"repo_id": "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
|
@@ -332,7 +332,7 @@ model_map = {
|
|
332 |
"joiner_model_file_sub_folder": ".",
|
333 |
"tokens_file": "tokens.txt",
|
334 |
"tokens_file_sub_folder": ".",
|
335 |
-
"loader": "
|
336 |
},
|
337 |
{
|
338 |
"repo_id": "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28",
|
@@ -359,16 +359,58 @@ model_map = {
|
|
359 |
"loader": "load_sherpa_offline_recognizer",
|
360 |
},
|
361 |
],
|
362 |
-
"Chinese+Cantonese
|
363 |
{
|
364 |
"repo_id": "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en",
|
365 |
"nn_model_file": "model.int8.onnx",
|
366 |
"nn_model_file_sub_folder": ".",
|
367 |
"tokens_file": "tokens.txt",
|
368 |
"tokens_file_sub_folder": ".",
|
369 |
-
"loader": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
370 |
},
|
371 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
}
|
373 |
|
374 |
|
@@ -456,13 +498,13 @@ def load_sherpa_offline_recognizer(nn_model_file: str,
|
|
456 |
return recognizer
|
457 |
|
458 |
|
459 |
-
def
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
|
467 |
paraformer=nn_model_file,
|
468 |
tokens=tokens_file,
|
@@ -475,16 +517,16 @@ def load_sherpa_offline_recognizer_from_paraformer(nn_model_file: str,
|
|
475 |
return recognizer
|
476 |
|
477 |
|
478 |
-
def
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
|
489 |
encoder=encoder_model_file,
|
490 |
decoder=decoder_model_file,
|
@@ -499,11 +541,11 @@ def load_sherpa_offline_recognizer_from_transducer(encoder_model_file: str,
|
|
499 |
return recognizer
|
500 |
|
501 |
|
502 |
-
def
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
|
508 |
encoder=encoder_model_file,
|
509 |
decoder=decoder_model_file,
|
@@ -513,17 +555,17 @@ def load_sherpa_offline_recognizer_from_whisper(encoder_model_file: str,
|
|
513 |
return recognizer
|
514 |
|
515 |
|
516 |
-
def
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
recognizer = sherpa_onnx.
|
527 |
encoder=encoder_model_file,
|
528 |
decoder=decoder_model_file,
|
529 |
joiner=joiner_model_file,
|
@@ -537,6 +579,26 @@ def load_sherpa_online_recognizer_from_transducer(encoder_model_file: str,
|
|
537 |
return recognizer
|
538 |
|
539 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
540 |
def load_recognizer(local_model_dir: Path,
|
541 |
decoding_method: str = "greedy_search",
|
542 |
num_active_paths: int = 4,
|
@@ -577,22 +639,26 @@ def load_recognizer(local_model_dir: Path,
|
|
577 |
num_active_paths=num_active_paths,
|
578 |
**kwargs_
|
579 |
)
|
580 |
-
elif loader == "
|
581 |
-
recognizer =
|
582 |
decoding_method=decoding_method,
|
583 |
**kwargs_
|
584 |
)
|
585 |
-
elif loader == "
|
586 |
-
recognizer =
|
587 |
decoding_method=decoding_method,
|
588 |
**kwargs_
|
589 |
)
|
590 |
-
elif loader == "
|
591 |
-
recognizer =
|
|
|
|
|
|
|
|
|
592 |
**kwargs_
|
593 |
)
|
594 |
-
elif loader == "
|
595 |
-
recognizer =
|
596 |
**kwargs_
|
597 |
)
|
598 |
else:
|
|
|
65 |
"joiner_model_file_sub_folder": ".",
|
66 |
"tokens_file": "tokens.txt",
|
67 |
"tokens_file_sub_folder": ".",
|
68 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
|
69 |
},
|
70 |
{
|
71 |
"repo_id": "zrjin/icefall-asr-aishell-zipformer-large-2023-10-24",
|
|
|
77 |
"joiner_model_file_sub_folder": "exp",
|
78 |
"tokens_file": "tokens.txt",
|
79 |
"tokens_file_sub_folder": "data/lang_char",
|
80 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
|
81 |
},
|
82 |
{
|
83 |
"repo_id": "zrjin/icefall-asr-aishell-zipformer-small-2023-10-24",
|
|
|
89 |
"joiner_model_file_sub_folder": "exp",
|
90 |
"tokens_file": "tokens.txt",
|
91 |
"tokens_file_sub_folder": "data/lang_char",
|
92 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
|
93 |
},
|
94 |
{
|
95 |
"repo_id": "zrjin/icefall-asr-aishell-zipformer-2023-10-24",
|
|
|
101 |
"joiner_model_file_sub_folder": "exp",
|
102 |
"tokens_file": "tokens.txt",
|
103 |
"tokens_file_sub_folder": "data/lang_char",
|
104 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
|
105 |
},
|
106 |
{
|
107 |
"repo_id": "desh2608/icefall-asr-alimeeting-pruned-transducer-stateless7",
|
|
|
158 |
"decoder_model_file_sub_folder": ".",
|
159 |
"tokens_file": "tiny.en-tokens.txt",
|
160 |
"tokens_file_sub_folder": ".",
|
161 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_whisper",
|
162 |
},
|
163 |
{
|
164 |
"repo_id": "csukuangfj/sherpa-onnx-whisper-base.en",
|
|
|
168 |
"decoder_model_file_sub_folder": ".",
|
169 |
"tokens_file": "base.en-tokens.txt",
|
170 |
"tokens_file_sub_folder": ".",
|
171 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_whisper",
|
172 |
},
|
173 |
{
|
174 |
"repo_id": "csukuangfj/sherpa-onnx-whisper-small.en",
|
|
|
178 |
"decoder_model_file_sub_folder": ".",
|
179 |
"tokens_file": "small.en-tokens.txt",
|
180 |
"tokens_file_sub_folder": ".",
|
181 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_whisper",
|
182 |
},
|
183 |
{
|
184 |
"repo_id": "csukuangfj/sherpa-onnx-paraformer-en-2024-03-09",
|
|
|
198 |
"joiner_model_file_sub_folder": "exp",
|
199 |
"tokens_file": "tokens.txt",
|
200 |
"tokens_file_sub_folder": "data/lang_bpe_500",
|
201 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
|
202 |
},
|
203 |
{
|
204 |
"repo_id": "wgb14/icefall-asr-gigaspeech-pruned-transducer-stateless2",
|
|
|
332 |
"joiner_model_file_sub_folder": ".",
|
333 |
"tokens_file": "tokens.txt",
|
334 |
"tokens_file_sub_folder": ".",
|
335 |
+
"loader": "load_sherpa_onnx_online_recognizer_from_transducer",
|
336 |
},
|
337 |
{
|
338 |
"repo_id": "csukuangfj/sherpa-onnx-paraformer-zh-2023-03-28",
|
|
|
359 |
"loader": "load_sherpa_offline_recognizer",
|
360 |
},
|
361 |
],
|
362 |
+
"Chinese+English+Cantonese": [
|
363 |
{
|
364 |
"repo_id": "csukuangfj/sherpa-onnx-paraformer-trilingual-zh-cantonese-en",
|
365 |
"nn_model_file": "model.int8.onnx",
|
366 |
"nn_model_file_sub_folder": ".",
|
367 |
"tokens_file": "tokens.txt",
|
368 |
"tokens_file_sub_folder": ".",
|
369 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_paraformer",
|
370 |
+
},
|
371 |
+
{
|
372 |
+
"repo_id": "csukuangfj/sherpa-onnx-streaming-paraformer-trilingual-zh-cantonese-en",
|
373 |
+
"encoder_model_file": "encoder.int8.onnx",
|
374 |
+
"encoder_model_file_sub_folder": ".",
|
375 |
+
"decoder_model_file": "decoder.int8.onnx",
|
376 |
+
"decoder_model_file_sub_folder": ".",
|
377 |
+
"tokens_file": "tokens.txt",
|
378 |
+
"tokens_file_sub_folder": ".",
|
379 |
+
"loader": "load_sherpa_onnx_online_recognizer_from_paraformer",
|
380 |
+
},
|
381 |
+
],
|
382 |
+
"Cantonese": [
|
383 |
+
{
|
384 |
+
"repo_id": "zrjin/icefall-asr-mdcc-zipformer-2024-03-11",
|
385 |
+
"encoder_model_file": "encoder-epoch-45-avg-35.int8.onnx",
|
386 |
+
"encoder_model_file_sub_folder": "exp",
|
387 |
+
"decoder_model_file": "decoder-epoch-45-avg-35.onnx",
|
388 |
+
"decoder_model_file_sub_folder": "exp",
|
389 |
+
"joiner_model_file": "joiner-epoch-45-avg-35.int8.onnx",
|
390 |
+
"joiner_model_file_sub_folder": "exp",
|
391 |
+
"tokens_file": "tokens.txt",
|
392 |
+
"tokens_file_sub_folder": "data/lang_char",
|
393 |
+
"loader": "load_sherpa_onnx_offline_recognizer_from_transducer",
|
394 |
+
},
|
395 |
+
],
|
396 |
+
"Tibetan": [
|
397 |
+
{
|
398 |
+
"repo_id": "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless7-2022-12-02",
|
399 |
+
"nn_model_file": "cpu_jit.pt",
|
400 |
+
"nn_model_file_sub_folder": "exp",
|
401 |
+
"tokens_file": "tokens.txt",
|
402 |
+
"tokens_file_sub_folder": "data/lang_bpe_500",
|
403 |
+
"loader": "load_sherpa_offline_recognizer",
|
404 |
},
|
405 |
+
{
|
406 |
+
"repo_id": "syzym/icefall-asr-xbmu-amdo31-pruned-transducer-stateless5-2022-11-29",
|
407 |
+
"nn_model_file": "cpu_jit-epoch-28-avg-23-torch-1.10.0.pt",
|
408 |
+
"nn_model_file_sub_folder": "exp",
|
409 |
+
"tokens_file": "tokens.txt",
|
410 |
+
"tokens_file_sub_folder": "data/lang_bpe_500",
|
411 |
+
"loader": "load_sherpa_offline_recognizer",
|
412 |
+
},
|
413 |
+
],
|
414 |
}
|
415 |
|
416 |
|
|
|
498 |
return recognizer
|
499 |
|
500 |
|
501 |
+
def load_sherpa_onnx_offline_recognizer_from_paraformer(nn_model_file: str,
|
502 |
+
tokens_file: str,
|
503 |
+
sample_rate: int = 16000,
|
504 |
+
decoding_method: str = "greedy_search",
|
505 |
+
feature_dim: int = 80,
|
506 |
+
num_threads: int = 2,
|
507 |
+
):
|
508 |
recognizer = sherpa_onnx.OfflineRecognizer.from_paraformer(
|
509 |
paraformer=nn_model_file,
|
510 |
tokens=tokens_file,
|
|
|
517 |
return recognizer
|
518 |
|
519 |
|
520 |
+
def load_sherpa_onnx_offline_recognizer_from_transducer(encoder_model_file: str,
|
521 |
+
decoder_model_file: str,
|
522 |
+
joiner_model_file: str,
|
523 |
+
tokens_file: str,
|
524 |
+
sample_rate: int = 16000,
|
525 |
+
decoding_method: str = "greedy_search",
|
526 |
+
feature_dim: int = 80,
|
527 |
+
num_threads: int = 2,
|
528 |
+
num_active_paths: int = 2,
|
529 |
+
):
|
530 |
recognizer = sherpa_onnx.OfflineRecognizer.from_transducer(
|
531 |
encoder=encoder_model_file,
|
532 |
decoder=decoder_model_file,
|
|
|
541 |
return recognizer
|
542 |
|
543 |
|
544 |
+
def load_sherpa_onnx_offline_recognizer_from_whisper(encoder_model_file: str,
|
545 |
+
decoder_model_file: str,
|
546 |
+
tokens_file: str,
|
547 |
+
num_threads: int = 2,
|
548 |
+
):
|
549 |
recognizer = sherpa_onnx.OfflineRecognizer.from_whisper(
|
550 |
encoder=encoder_model_file,
|
551 |
decoder=decoder_model_file,
|
|
|
555 |
return recognizer
|
556 |
|
557 |
|
558 |
+
def load_sherpa_onnx_online_recognizer_from_transducer(encoder_model_file: str,
|
559 |
+
decoder_model_file: str,
|
560 |
+
joiner_model_file: str,
|
561 |
+
tokens_file: str,
|
562 |
+
sample_rate: int = 16000,
|
563 |
+
decoding_method: str = "greedy_search",
|
564 |
+
feature_dim: int = 80,
|
565 |
+
num_threads: int = 2,
|
566 |
+
num_active_paths: int = 2,
|
567 |
+
):
|
568 |
+
recognizer = sherpa_onnx.OnlineRecognizer.from_transducer(
|
569 |
encoder=encoder_model_file,
|
570 |
decoder=decoder_model_file,
|
571 |
joiner=joiner_model_file,
|
|
|
579 |
return recognizer
|
580 |
|
581 |
|
582 |
+
def load_sherpa_onnx_online_recognizer_from_paraformer(encoder_model_file: str,
|
583 |
+
decoder_model_file: str,
|
584 |
+
tokens_file: str,
|
585 |
+
sample_rate: int = 16000,
|
586 |
+
decoding_method: str = "greedy_search",
|
587 |
+
feature_dim: int = 80,
|
588 |
+
num_threads: int = 2,
|
589 |
+
):
|
590 |
+
recognizer = sherpa_onnx.OnlineRecognizer.from_paraformer(
|
591 |
+
encoder=encoder_model_file,
|
592 |
+
decoder=decoder_model_file,
|
593 |
+
tokens=tokens_file,
|
594 |
+
num_threads=num_threads,
|
595 |
+
sample_rate=sample_rate,
|
596 |
+
feature_dim=feature_dim,
|
597 |
+
decoding_method=decoding_method,
|
598 |
+
)
|
599 |
+
return recognizer
|
600 |
+
|
601 |
+
|
602 |
def load_recognizer(local_model_dir: Path,
|
603 |
decoding_method: str = "greedy_search",
|
604 |
num_active_paths: int = 4,
|
|
|
639 |
num_active_paths=num_active_paths,
|
640 |
**kwargs_
|
641 |
)
|
642 |
+
elif loader == "load_sherpa_onnx_offline_recognizer_from_paraformer":
|
643 |
+
recognizer = load_sherpa_onnx_offline_recognizer_from_paraformer(
|
644 |
decoding_method=decoding_method,
|
645 |
**kwargs_
|
646 |
)
|
647 |
+
elif loader == "load_sherpa_onnx_offline_recognizer_from_transducer":
|
648 |
+
recognizer = load_sherpa_onnx_offline_recognizer_from_transducer(
|
649 |
decoding_method=decoding_method,
|
650 |
**kwargs_
|
651 |
)
|
652 |
+
elif loader == "load_sherpa_onnx_offline_recognizer_from_whisper":
|
653 |
+
recognizer = load_sherpa_onnx_offline_recognizer_from_whisper(
|
654 |
+
**kwargs_
|
655 |
+
)
|
656 |
+
elif loader == "load_sherpa_onnx_online_recognizer_from_transducer":
|
657 |
+
recognizer = load_sherpa_onnx_online_recognizer_from_transducer(
|
658 |
**kwargs_
|
659 |
)
|
660 |
+
elif loader == "load_sherpa_onnx_online_recognizer_from_paraformer":
|
661 |
+
recognizer = load_sherpa_onnx_online_recognizer_from_paraformer(
|
662 |
**kwargs_
|
663 |
)
|
664 |
else:
|