Upload folder using huggingface_hub

Files changed:
- README.md (+14, -2)
- chat_template.jinja (+1, -1)
- config.json (+0, -1)
- modeling_midashenglm.py (+57, -77)
README.md CHANGED
@@ -1,17 +1,29 @@
 ---
-license:
+license: apache-2.0
+# TODO which license?
 language:
 - en
 - zh
+# TODO clarify the supported languages
 pipeline_tag: audio-text-to-text
 tags:
 - multimodal
 - audio-language-model
 - audio
+# - audio-captioning
+# - audio-classification
+# - audio-generation
+# - audio-question-answering
+# - audio-understanding
+# - chat
+# - speech-recognition
+# - text-to-speech
+# TODO what capabilities does the model have?
 base_model:
 - mispeech/dasheng-0.6B
 - Qwen/Qwen2.5-Omni-3B
 base_model_relation: finetune
+# TODO check whether this is correct
 ---
 
 # MiDashengLM
@@ -118,4 +130,4 @@ base_model_relation: finetune
 
 ```bibtex
 TODO
-```
+```
chat_template.jinja CHANGED
@@ -1,6 +1,6 @@
 {%- for message in messages -%}
 {%- if loop.first and message["role"] != "system" -%}
-{{- "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" -}}
+{{- "<|im_start|>system\nYou are a helpful language and speech assistant.<|im_end|>\n" -}}
 {%- endif -%}
 {{- "<|im_start|>" -}}
 {{- message["role"] -}}
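The change here only touches the default system prompt that the template injects when a conversation does not start with a system message. A minimal sketch of inspecting the rendered prompt (the repository id and user message are illustrative, not taken from this commit):

```python
from transformers import AutoTokenizer

# Illustrative repo id; substitute the repository this template ships with.
tokenizer = AutoTokenizer.from_pretrained("mispeech/midashenglm", trust_remote_code=True)

# No leading system message, so the template prepends:
# "<|im_start|>system\nYou are a helpful language and speech assistant.<|im_end|>\n"
messages = [{"role": "user", "content": "What sounds are present in this clip?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)
```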
config.json CHANGED
@@ -36,7 +36,6 @@
     "AutoConfig": "configuration_midashenglm.MiDashengLMConfig",
     "AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel"
   },
-  "lora_target_modules": "all-linear",
   "model_type": "midashenglm",
   "subsample_factor": 5,
   "text_config": {
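The unchanged `auto_map` entries in the context lines are what route `AutoConfig` and `AutoModelForCausalLM` to the custom classes in this repository; only the unused `lora_target_modules` key is dropped. A minimal loading sketch under that assumption (the repository id is illustrative):

```python
from transformers import AutoConfig, AutoModelForCausalLM

repo = "mispeech/midashenglm"  # illustrative repo id

# auto_map resolves these calls to configuration_midashenglm.MiDashengLMConfig
# and modeling_midashenglm.MiDashengLMModel shipped inside the repository.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo, trust_remote_code=True)

print(config.model_type)        # "midashenglm"
print(config.subsample_factor)  # 5
```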
modeling_midashenglm.py CHANGED
@@ -474,44 +474,22 @@ class MiDashengLMModel(PreTrainedModel):
 
         return encoder_out
 
-    def _prepare_with_input_ids(
+    def _prepare_inputs_embeds(
         self,
-        input_ids: torch.Tensor,
-        audio_embeddings: Optional[torch.Tensor] = None,
-        audio_token_id: Optional[int] = None,
-    ) -> torch.Tensor:
-        input_embeddings = self.decoder.model.embed_tokens(input_ids)
-        if audio_embeddings is not None:
-            special_mask = input_ids == audio_token_id
-            assert audio_embeddings.shape[1] <= (special_mask.sum(-1)).max(), (
-                "Mask and audio embeddings seem to have different sizes: "
-                f"{audio_embeddings.shape=}, {special_mask=}, {input_ids=}, "
-                f"{audio_embeddings.shape[1]=} vs {(special_mask.sum(-1)).max()=}"
-            )
-            audio_embeddings = audio_embeddings.to(input_embeddings.dtype)
-
-            for i in range(len(special_mask)):
-                mask = special_mask[i]
-                number_of_tokens = mask.sum(-1)
-                input_embeddings[i, mask] = audio_embeddings[i, :number_of_tokens]
-            return input_embeddings
-        else:
-            return input_embeddings
-
-    def forward(
-        self,
-        input_ids: Optional[Tensor] = None,
-        input_values: Optional[Tensor] = None,
-        inputs_embeds: Optional[Tensor] = None,
+        input_ids: Optional[torch.Tensor],
+        input_values: Optional[torch.Tensor],
+        inputs_embeds: Optional[torch.Tensor],
         audio_length: Optional[Iterable[int]] = None,
         audio_token_id: Optional[int] = None,
-        [...]
-    ):
+    ) -> torch.Tensor:
         if input_ids is not None:
             if inputs_embeds is not None:
                 raise ValueError(
                     "Both `inputs_embeds` and `input_ids` are passed. Please pass only one of them."
                 )
+            inputs_embeds = cast(
+                torch.Tensor, self.decoder.model.embed_tokens(input_ids)
+            )
 
         if input_values is not None:
             if audio_token_id is None:
@@ -519,25 +497,31 @@ class MiDashengLMModel(PreTrainedModel):
                     "If `input_values` is provided, `audio_token_id` must also be provided."
                 )
 
-            [...]
+            audio_embeddings = self._forward_audio_encoder(
+                input_values,
+                audio_length=audio_length,
+            ).to(inputs_embeds.dtype)
+
+            audio_mask = (input_ids == audio_token_id).flatten()
+            diff = torch.diff(
+                audio_mask.long(),
+                prepend=torch.zeros(
+                    (1,),
+                    dtype=torch.long,
+                    device=audio_mask.device,
+                ),
             )
-            [...]
+            audio_span_starts = (diff == 1).nonzero()
+            audio_span_ends = (diff == -1).nonzero()
+
+            embeds_view = inputs_embeds.view(-1, inputs_embeds.shape[-1])
+            for span_start, span_end, audio in zip(
+                audio_span_starts,
+                audio_span_ends,
+                audio_embeddings,
+                strict=True,
+            ):
+                embeds_view[span_start:span_end] = audio[: span_end - span_start]
         else:
             if inputs_embeds is None:
                 raise ValueError(
@@ -548,6 +532,24 @@ class MiDashengLMModel(PreTrainedModel):
                     "Cannot pass `input_values` when `inputs_embeds` is provided."
                 )
 
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        input_values: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        audio_length: Optional[Iterable[int]] = None,
+        audio_token_id: Optional[int] = None,
+        **kwargs: Any,
+    ):
+        inputs_embeds = self._prepare_inputs_embeds(
+            input_ids=input_ids,
+            input_values=input_values,
+            inputs_embeds=inputs_embeds,
+            audio_length=audio_length,
+            audio_token_id=audio_token_id,
+        )
         return self.decoder(
             input_ids=None,
             inputs_embeds=inputs_embeds,
@@ -563,35 +565,13 @@ class MiDashengLMModel(PreTrainedModel):
         audio_token_id: Optional[int] = None,
         **kwargs,
     ):
-            [...]
-            if input_values is not None:
-                input_values = input_values.to(self.device)
-                audio_encoder_hidden_states = self._forward_audio_encoder(
-                    input_values, audio_length=audio_length
-                )
-            else:
-                audio_encoder_hidden_states = None
-            inputs_embeds = self._prepare_with_input_ids(
-                input_ids=input_ids,
-                audio_embeddings=audio_encoder_hidden_states,
-                audio_token_id=audio_token_id,
-            )
-        else:
-            if inputs_embeds is None:
-                raise ValueError(
-                    "Either `input_ids` or `inputs_embeds` must be passed."
-                )
-            if input_values is not None:
-                raise ValueError(
-                    "Cannot pass `input_values` when `inputs_embeds` is provided."
-                )
-
+        inputs_embeds = self._prepare_inputs_embeds(
+            input_ids=input_ids,
+            input_values=input_values,
+            inputs_embeds=inputs_embeds,
+            audio_length=audio_length,
+            audio_token_id=audio_token_id,
+        )
         return self.decoder.generate(
             inputs_embeds=inputs_embeds,
             generation_config=kwargs.pop("generation_config", self.generation_config),
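The main change above is that `forward` and `generate` now share a single `_prepare_inputs_embeds` helper, and that audio placeholder tokens are filled span by span: `torch.diff` on the flattened placeholder mask marks where each run of audio tokens starts and ends, and each audio embedding sequence is copied into its run. A self-contained sketch of that indexing trick with toy values (independent of the model code):

```python
import torch

# Toy batch: 2 sequences of length 6, embedding dim 4, audio placeholder id 99.
audio_token_id = 99
input_ids = torch.tensor([
    [1, 99, 99, 99, 2, 3],
    [4, 5, 99, 99, 99, 6],
])
inputs_embeds = torch.zeros(2, 6, 4)
# One audio embedding sequence per placeholder run (here: 3 frames each).
audio_embeddings = torch.arange(2 * 3 * 4, dtype=torch.float32).reshape(2, 3, 4)

# Flatten the placeholder mask; a first difference of the 0/1 mask marks
# 0 -> 1 transitions (run starts) and 1 -> 0 transitions (run ends).
audio_mask = (input_ids == audio_token_id).flatten()
diff = torch.diff(audio_mask.long(), prepend=torch.zeros(1, dtype=torch.long))
span_starts = (diff == 1).nonzero().flatten()
span_ends = (diff == -1).nonzero().flatten()

# Copy each audio sequence into its run of the flattened embedding table.
embeds_view = inputs_embeds.view(-1, inputs_embeds.shape[-1])
for start, end, audio in zip(span_starts, span_ends, audio_embeddings):
    embeds_view[start:end] = audio[: end - start]

print(inputs_embeds[0, 1:4])  # filled from the first audio sequence
print(inputs_embeds[1, 2:5])  # filled from the second audio sequence
```

Note that a run ending exactly at the last flattened position produces no 1 -> 0 transition; the model code pairs starts and ends with `zip(..., strict=True)`, so it assumes every placeholder run is closed before the end of the batch.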