init

- README.md +71 -27
- pipeline/push_pipeline.py +2 -2

README.md CHANGED
@@ -15,8 +15,8 @@ widget:
 # Kotoba-Whisper-v2.2
 _Kotoba-Whisper-v2.2_ is a Japanese ASR model based on [kotoba-tech/kotoba-whisper-v2.0](https://huggingface.co/kotoba-tech/kotoba-whisper-v2.0), with
 additional postprocessing stacks integrated as [`pipeline`](https://huggingface.co/docs/transformers/en/main_classes/pipelines). The new features include
-(i)
-
+(i) speaker diarization with [diarizers](https://huggingface.co/diarizers-community/speaker-segmentation-fine-tuned-callhome-jpn)
+and (ii) adding punctuation with [punctuators](https://github.com/1-800-BAD-CODE/punctuators/tree/main).
 The pipeline has been developed through the collaboration between [Asahi Ushio](https://asahiushio.com) and [Kotoba Technologies](https://twitter.com/kotoba_tech).
 
 ## Transformers Usage

@@ -30,20 +30,33 @@ pip install "punctuators==0.0.5"
 pip install "pyannote.audio"
 pip install git+https://github.com/huggingface/diarizers.git
 ```
-Also,
-
-
+To load pre-trained diarization models from the Hub, you'll first need to accept the terms-of-use for the following two models:
+1. [pyannote/segmentation-3.0](https://hf.co/pyannote/segmentation-3.0)
+2. [pyannote/speaker-diarization-3.1](https://hf.co/pyannote/speaker-diarization-3.1)
+And subsequently use a Hugging Face authentication token to log in with:
+```
+huggingface-cli login
+```
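The same login can also be done programmatically, which is convenient in notebooks or CI jobs. A minimal sketch, assuming only that `huggingface_hub` is installed (it ships with `transformers`) and that a read-access token is available in an `HF_TOKEN` environment variable (the variable name is an assumption of this sketch):

```python
import os

from huggingface_hub import login

# Programmatic alternative to `huggingface-cli login`. Reading the token
# from an environment variable avoids hard-coding credentials; HF_TOKEN
# is a placeholder name assumed by this sketch.
login(token=os.environ["HF_TOKEN"])
```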
+
+### Transcription with Diarization
+The model can be used with the [`pipeline`](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline).
+
+- Download an audio sample.
+```shell
+wget https://huggingface.co/kotoba-tech/kotoba-whisper-v2.2/resolve/main/sample_audio/sample_diarization_japanese.mp3
+```
+
+- Run the model via pipeline.
 
 ```python
 import torch
 from transformers import pipeline
-from datasets import load_dataset
 
 # config
-model_id = "kotoba-tech/kotoba-whisper-v2.
+model_id = "kotoba-tech/kotoba-whisper-v2.2"
 torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 model_kwargs = {"attn_implementation": "sdpa"} if torch.cuda.is_available() else {}

@@ -58,35 +71,66 @@ pipe = pipeline(
     chunk_length_s=15,
     batch_size=16,
     trust_remote_code=True,
+    punctuator=False,
+    return_unique_speaker=True
 )
 
-# load sample audio
-dataset = load_dataset("japanese-asr/ja_asr.reazonspeech_test", split="test")
-sample = dataset[0]["audio"]
-
 # run inference
-result = pipe(
+result = pipe("sample_diarization_japanese.mp3", generate_kwargs=generate_kwargs)
 print(result)
+>>> {'chunks': [{'speaker': ['SPEAKER_02'],
+                 'text': 'そうですねこれも先ほどがずっと言っている自分の感覚的には大丈夫ですけれども',
+                 'timestamp': (0.0, 5.0)},
+                {'speaker': ['SPEAKER_02'],
+                 'text': '今は屋外の気温',
+                 'timestamp': (5.0, 7.6)},
+                {'speaker': ['SPEAKER_02'],
+                 'text': '昼も夜も上がってますので空気の入れ替えだけでは',
+                 'timestamp': (7.6, 11.72)},
+                {'speaker': ['SPEAKER_02'],
+                 'text': 'かえって人が上がってきます',
+                 'timestamp': (11.72, 13.54)},
+                {'speaker': ['SPEAKER_02'],
+                 'text': 'やっぱり愚直にやっぱりその街の良さをアピールしていくっていう',
+                 'timestamp': (13.54, 17.24)},
+                {'speaker': ['SPEAKER_00'],
+                 'text': 'そういう姿勢が基本にあった上だのこういうPR作戦だと思うんです',
+                 'timestamp': (17.24, 23.84)}],
+    'chunks/SPEAKER_00': [{'speaker': ['SPEAKER_00'],
+                           'text': 'そういう姿勢が基本にあった上だのこういうPR作戦だと思うんです',
+                           'timestamp': (17.24, 23.84)}],
+    'chunks/SPEAKER_02': [{'speaker': ['SPEAKER_02'],
+                           'text': 'そうですねこれも先ほどがずっと言っている自分の感覚的には大丈夫ですけれども',
+                           'timestamp': (0.0, 5.0)},
+                          {'speaker': ['SPEAKER_02'],
+                           'text': '今は屋外の気温',
+                           'timestamp': (5.0, 7.6)},
+                          {'speaker': ['SPEAKER_02'],
+                           'text': '昼も夜も上がってますので空気の入れ替えだけでは',
+                           'timestamp': (7.6, 11.72)},
+                          {'speaker': ['SPEAKER_02'],
+                           'text': 'かえって人が上がってきます',
+                           'timestamp': (11.72, 13.54)},
+                          {'speaker': ['SPEAKER_02'],
+                           'text': 'やっぱり愚直にやっぱりその街の良さをアピールしていくっていう',
+                           'timestamp': (13.54, 17.24)}],
+    'speakers': ['SPEAKER_00', 'SPEAKER_02'],
+    'text': 'そうですねこれも先ほどがずっと言っている自分の感覚的には大丈夫ですけれども今は屋外の気温昼も夜も上がってますので空気の入れ替えだけではかえって人が上がってきますやっぱり愚直にやっぱりその街の良さをアピールしていくっていうそういう姿勢が基本にあった上だのこういうPR作戦だと思うんです',
+    'text/SPEAKER_00': 'そういう姿勢が基本にあった上だのこういうPR作戦だと思うんです',
+    'text/SPEAKER_02': 'そうですねこれも先ほどがずっと言っている自分の感覚的には大丈夫ですけれども今は屋外の気温昼も夜も上がってますので空気の入れ替えだけではかえって人が上がってきますやっぱり愚直にやっぱりその街の良さをアピールしていくっていう'}
 ```
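A minimal sketch of consuming this result, continuing from the example and assuming only the keys visible in the printed output above (`chunks`, `speakers`, and the per-speaker `text/<ID>` views):

```python
# Print a speaker-attributed transcript: each chunk carries a list of
# speakers, the transcribed text, and a (start, end) timestamp in seconds.
for chunk in result["chunks"]:
    start, end = chunk["timestamp"]
    print(f"[{start:6.2f}s-{end:6.2f}s] {chunk['speaker'][0]}: {chunk['text']}")

# Full per-speaker transcripts via the aggregated 'text/<ID>' keys.
for speaker in result["speakers"]:
    print(speaker, result[f"text/{speaker}"])
```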
 
-- To
+- To activate punctuator:
 ```diff
-```
-
-- To deactivate stable-ts:
-```diff
--    stable_ts=True,
-+    stable_ts=False,
+- punctuator=False,
++ punctuator=True,
 ```
 
-- To
+- To include more than one speaker per chunk:
 ```diff
+- return_unique_speaker=True
++ return_unique_speaker=False
 ```
 
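Putting the two switches together, a sketch of the pipeline construction with punctuation enabled and multiple speakers allowed per chunk; the keyword arguments not visible in the hunk (`model`, `torch_dtype`, `device`, `model_kwargs`) are assumed to match the config block in the usage example above:

```python
# Sketch: same pipeline as in the usage example, with both post-processing
# switches flipped. Arguments outside the visible diff are assumptions.
pipe = pipeline(
    model=model_id,
    torch_dtype=torch_dtype,
    device=device,
    model_kwargs=model_kwargs,
    chunk_length_s=15,
    batch_size=16,
    trust_remote_code=True,
    punctuator=True,              # add punctuation to each chunk
    return_unique_speaker=False,  # allow more than one speaker per chunk
)
```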
pipeline/push_pipeline.py CHANGED

@@ -14,8 +14,8 @@ PIPELINE_REGISTRY.register_pipeline(
     tf_model=TFWhisperForConditionalGeneration
 )
 pipe = pipeline(task="kotoba-whisper", model="kotoba-tech/kotoba-whisper-v2.0", chunk_length_s=15, batch_size=16)
-
-
+output = pipe(test_audio)
+pprint(output)
 pipe.push_to_hub(model_alias)
 
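For context, the script presumably reads as below after this change; only the tail is visible in the hunk, so the imports, the custom pipeline class, and the `test_audio` / `model_alias` values are assumptions reconstructed from the visible calls:

```python
# Sketch of pipeline/push_pipeline.py after the change. Names outside the
# visible hunk are assumptions: KotobaWhisperPipeline is taken to be the
# custom class this repo registers, and test_audio / model_alias are
# placeholders. TFWhisperForConditionalGeneration requires TensorFlow.
from pprint import pprint

from transformers import (TFWhisperForConditionalGeneration,
                          WhisperForConditionalGeneration, pipeline)
from transformers.pipelines import PIPELINE_REGISTRY

from kotoba_whisper import KotobaWhisperPipeline  # assumed local module

model_alias = "kotoba-tech/kotoba-whisper-v2.2"              # assumed target repo
test_audio = "sample_audio/sample_diarization_japanese.mp3"  # assumed sample file

PIPELINE_REGISTRY.register_pipeline(
    "kotoba-whisper",
    pipeline_class=KotobaWhisperPipeline,
    pt_model=WhisperForConditionalGeneration,
    tf_model=TFWhisperForConditionalGeneration,
)
pipe = pipeline(task="kotoba-whisper", model="kotoba-tech/kotoba-whisper-v2.0",
                chunk_length_s=15, batch_size=16)
output = pipe(test_audio)  # smoke-test the registered pipeline before pushing
pprint(output)
pipe.push_to_hub(model_alias)
```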