Update README.md
Browse files
README.md
CHANGED
@@ -17,16 +17,16 @@ model-index:
     metrics:
     - type: bleu
       name: ko2en
-      value:
+      value: 7.07
     - type: bleu
       name: ko2en-cot
-      value:
+      value: 9.19
     - type: bleu
       name: en2ko (ko-mecab)
-      value:
+      value: 13.08
     - type: bleu
       name: en2ko-cot (ko-mecab)
-      value:
+      value: 9.35
   - task:
       type: automatic-speech-recognition
     dataset:
@@ -35,7 +35,7 @@ model-index:
     metrics:
     - type: cer
       name: test CER
-      value:
+      value: 7.02
 language:
 - ko
 ---
@@ -47,4 +47,69 @@ model is trained only 174 steps on zeroth train set, and main purpose is to chec
 
 ## Evaluation
 
-ASR on zeroth-test set and fleurs ko <-> en speech translation result
+Results for ASR on the zeroth test set and for ko <-> en speech translation on FLEURS. The evaluation script is [here](https://gist.github.com/seastar105/d1d8983b27611370528e3b194dcc5577#file-evaluate-py); evaluation ran on a single A40.
+
+
+| Model                 | zeroth-test (CER) | fleurs-ko2en (BLEU) | fleurs-ko2en-cot (BLEU) | fleurs-en2ko (BLEU) | fleurs-en2ko-cot (BLEU) |
+|-----------------------|-------------------|---------------------|-------------------------|---------------------|-------------------------|
+| original              | 195.92            | 5.62                | 2.45                    | 6.87                | 4.35                    |
+| finetune (this model) | 7.02              | 7.07                | 9.19                    | 13.08               | 9.35                    |
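+
+A rough sketch of how such numbers can be computed follows; the linked gist is the authoritative script, and the `evaluate`/`sacrebleu` usage here is an assumption. Korean BLEU is tokenized with mecab, matching the `ko-mecab` names in the metadata:
+
+```python
+# Hedged sketch: CER for ASR outputs, mecab-tokenized BLEU for translations.
+# Library choices are assumptions, not necessarily the gist's exact code.
+import evaluate   # pip install evaluate jiwer
+import sacrebleu  # pip install "sacrebleu[ko]"
+
+preds = ["모델 출력 문장"]  # hypothetical example strings
+refs = ["참조 문장"]
+
+# character error rate between predictions and references
+cer = evaluate.load("cer").compute(predictions=preds, references=refs)
+
+# sacrebleu's ko-mecab tokenizer segments Korean before BLEU scoring
+bleu = sacrebleu.corpus_bleu(preds, [refs], tokenize="ko-mecab").score
+print(cer, bleu)
+```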
+
+
+## Example script
+
+```python
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+
+orig_model_path = "microsoft/Phi-4-multimodal-instruct"
+ft_model_path = "seastar105/Phi-4-mm-inst-zeroth-kor"
+# processor and generation config come from the base model, weights from the finetune
+generation_config = GenerationConfig.from_pretrained(orig_model_path, 'generation_config.json')
+processor = AutoProcessor.from_pretrained(orig_model_path, trust_remote_code=True)
+max_new_tokens = 256  # assumed value; adjust as needed
+
+model = AutoModelForCausalLM.from_pretrained(
+    ft_model_path,
+    trust_remote_code=True,
+    torch_dtype='auto',
+    _attn_implementation='flash_attention_2',  # requires flash-attn
+).cuda()
+user_prompt = '<|user|>'
+assistant_prompt = '<|assistant|>'
+prompt_suffix = '<|end|>'
+
+# task prompts are from the technical report; <|audio_1|> marks where the audio clip is attached
+asr_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio clip into text.{prompt_suffix}{assistant_prompt}'
+ast_ko_prompt = f'{user_prompt}<|audio_1|>Translate the audio to Korean.{prompt_suffix}{assistant_prompt}'
+ast_cot_ko_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio to text, and then translate the audio to Korean. Use <sep> as a separator between the original transcript and the translation.{prompt_suffix}{assistant_prompt}'
+ast_en_prompt = f'{user_prompt}<|audio_1|>Translate the audio to English.{prompt_suffix}{assistant_prompt}'
+ast_cot_en_prompt = f'{user_prompt}<|audio_1|>Transcribe the audio to text, and then translate the audio to English. Use <sep> as a separator between the original transcript and the translation.{prompt_suffix}{assistant_prompt}'
+asr_ds = load_dataset("kresnik/zeroth_korean", split="test")
+ast_ds = load_dataset("seastar105/fleurs_ko_en_test", split="train")
+
+# ASR
+item = asr_ds[0]
+audio = (item["audio"]["array"], item["audio"]["sampling_rate"])
+inputs = processor(text=asr_prompt, audios=[audio], return_tensors='pt').to(model.device)
+generate_ids = model.generate(
+    **inputs,
+    max_new_tokens=max_new_tokens,
+    generation_config=generation_config,
+)
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(
+    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)[0]  # "… 아동들이 사랑을 제대로 못 받고 크면 매우 심각한 결과가 초래된다는 결론을 내렸습니다"
+
+# AST, EN -> KO
+item = ast_ds[-1]
+audio = (item["en_audio"]["array"], item["en_audio"]["sampling_rate"])
+inputs = processor(text=ast_ko_prompt, audios=[audio], return_tensors='pt').to(model.device)
+generate_ids = model.generate(
+    **inputs,
+    max_new_tokens=max_new_tokens,
+    generation_config=generation_config,
+)
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(
+    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)[0]  # "가장 쉽게 접근 가능한 식물 영양소 …과 légumes에서 접근 가능한 단백질이었을 것이다 … 하지만 이것들은 … 동물처럼 우리에게 소화하기 어렵습니다만 그것들이 … 있다면요"
+```
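+
+The CoT prompts return the transcript and the translation in a single pass, separated by `<sep>` (per the prompt wording above). A small follow-on sketch, reusing the variables from the script; the split logic is an assumption based on that prompt:
+
+```python
+# AST with CoT: expect "transcript <sep> translation" in one response
+inputs = processor(text=ast_cot_ko_prompt, audios=[audio], return_tensors='pt').to(model.device)
+generate_ids = model.generate(
+    **inputs,
+    max_new_tokens=max_new_tokens,
+    generation_config=generation_config,
+)
+generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+response = processor.batch_decode(
+    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+)[0]
+transcript, _, translation = response.partition("<sep>")
+print(transcript.strip(), translation.strip(), sep="\n")
+```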