zhoukz committed on
Commit 0939826 · 1 Parent(s): 0d4354c

Upload folder using huggingface_hub
Files changed (4)
  1. README.md +14 -2
  2. chat_template.jinja +1 -1
  3. config.json +0 -1
  4. modeling_midashenglm.py +57 -77
README.md CHANGED
@@ -1,17 +1,29 @@
 ---
-license: other
+license: apache-2.0
+# TODO: which license?
 language:
 - en
 - zh
+# TODO: specify the supported languages
 pipeline_tag: audio-text-to-text
 tags:
 - multimodal
 - audio-language-model
 - audio
+# - audio-captioning
+# - audio-classification
+# - audio-generation
+# - audio-question-answering
+# - audio-understanding
+# - chat
+# - speech-recognition
+# - text-to-speech
+# TODO: which capabilities does the model have?
 base_model:
 - mispeech/dasheng-0.6B
 - Qwen/Qwen2.5-Omni-3B
 base_model_relation: finetune
+# TODO: check whether this is correct
 ---
 
 # MiDashengLM
@@ -118,4 +130,4 @@ base_model_relation: finetune
 
 ```bibtex
 TODO
-```
+```
chat_template.jinja CHANGED
@@ -1,6 +1,6 @@
 {%- for message in messages -%}
 {%- if loop.first and message["role"] != "system" -%}
-{{- "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" -}}
+{{- "<|im_start|>system\nYou are a helpful language and speech assistant.<|im_end|>\n" -}}
 {%- endif -%}
 {{- "<|im_start|>" -}}
 {{- message["role"] -}}
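The only functional change above is the default system prompt ("You are a helpful assistant." becomes "You are a helpful language and speech assistant."), injected when a conversation does not start with a system turn. A minimal sketch (not part of this commit, assuming the jinja2 package) of how the visible fragment renders; the `{%- endfor -%}` is added here only so the fragment parses on its own, since the rest of the template is not shown in this diff:

```python
from jinja2 import Template

# Only the fragment of chat_template.jinja shown in this diff.
fragment = (
    '{%- for message in messages -%}'
    '{%- if loop.first and message["role"] != "system" -%}'
    '{{- "<|im_start|>system\\nYou are a helpful language and speech assistant.<|im_end|>\\n" -}}'
    '{%- endif -%}'
    '{{- "<|im_start|>" -}}'
    '{{- message["role"] -}}'
    '{%- endfor -%}'  # added to close the visible fragment
)

print(Template(fragment).render(messages=[{"role": "user", "content": "hi"}]))
# <|im_start|>system
# You are a helpful language and speech assistant.<|im_end|>
# <|im_start|>user
```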
config.json CHANGED
@@ -36,7 +36,6 @@
     "AutoConfig": "configuration_midashenglm.MiDashengLMConfig",
     "AutoModelForCausalLM": "modeling_midashenglm.MiDashengLMModel"
   },
-  "lora_target_modules": "all-linear",
   "model_type": "midashenglm",
   "subsample_factor": 5,
   "text_config": {
modeling_midashenglm.py CHANGED
@@ -474,44 +474,22 @@ class MiDashengLMModel(PreTrainedModel):
 
         return encoder_out
 
-    def _prepare_with_input_ids(
+    def _prepare_inputs_embeds(
         self,
-        input_ids: torch.Tensor,
-        audio_embeddings: Optional[torch.Tensor],
-        audio_token_id: Optional[int],
-    ) -> torch.Tensor:
-        input_embeddings = self.decoder.model.embed_tokens(input_ids)
-        if audio_embeddings is not None:
-            special_mask = input_ids == audio_token_id
-            assert audio_embeddings.shape[1] <= (special_mask.sum(-1)).max(), (
-                "Mask and audio embeddings seem to have different sizes: "
-                f"{audio_embeddings.shape=}, {special_mask=}, {input_ids=}, "
-                f"{audio_embeddings.shape[1]=} vs {(special_mask.sum(-1)).max()=}"
-            )
-            audio_embeddings = audio_embeddings.to(input_embeddings.dtype)
-
-            for i in range(len(special_mask)):
-                mask = special_mask[i]
-                number_of_tokens = mask.sum(-1)
-                input_embeddings[i, mask] = audio_embeddings[i, :number_of_tokens]
-            return input_embeddings
-        else:
-            return input_embeddings
-
-    def forward(
-        self,
-        input_ids: Optional[Tensor] = None,
-        input_values: Optional[Tensor] = None,
-        inputs_embeds: Optional[Tensor] = None,
+        input_ids: Optional[torch.Tensor],
+        input_values: Optional[torch.Tensor],
+        inputs_embeds: Optional[torch.Tensor],
         audio_length: Optional[Iterable[int]] = None,
         audio_token_id: Optional[int] = None,
-        **kwargs: Any,
-    ):
+    ) -> torch.Tensor:
         if input_ids is not None:
             if inputs_embeds is not None:
                 raise ValueError(
                     "Both `inputs_embeds` and `input_ids` are passed. Please pass only one of them."
                 )
+            inputs_embeds = cast(
+                torch.Tensor, self.decoder.model.embed_tokens(input_ids)
+            )
 
             if input_values is not None:
                 if audio_token_id is None:
@@ -519,25 +497,31 @@
                         "If `input_values` is provided, `audio_token_id` must also be provided."
                     )
 
-                input_values = input_values.to(self.device)
-                audio_encoder_hidden_states = self._forward_audio_encoder(
-                    input_values, audio_length=audio_length
-                )
-            else:
-                batch, _ = input_ids.shape
-                input_values = torch.zeros(
-                    batch,
-                    0,
-                    self.audio_encoder.embed_dim,
-                    device=input_ids.device,
+                audio_embeddings = self._forward_audio_encoder(
+                    input_values,
+                    audio_length=audio_length,
+                ).to(inputs_embeds.dtype)
+
+                audio_mask = (input_ids == audio_token_id).flatten()
+                diff = torch.diff(
+                    audio_mask.long(),
+                    prepend=torch.zeros(
+                        (1,),
+                        dtype=torch.long,
+                        device=audio_mask.device,
+                    ),
                 )
-
-            input_ids = input_ids.to(self.device)
-            inputs_embeds = self._prepare_with_input_ids(
-                input_ids=input_ids,
-                audio_embeddings=audio_encoder_hidden_states,
-                audio_token_id=audio_token_id,
-            )
+                audio_span_starts = (diff == 1).nonzero()
+                audio_span_ends = (diff == -1).nonzero()
+
+                embeds_view = inputs_embeds.view(-1, inputs_embeds.shape[-1])
+                for span_start, span_end, audio in zip(
+                    audio_span_starts,
+                    audio_span_ends,
+                    audio_embeddings,
+                    strict=True,
+                ):
+                    embeds_view[span_start:span_end] = audio[: span_end - span_start]
         else:
             if inputs_embeds is None:
                 raise ValueError(
@@ -548,6 +532,24 @@
                     "Cannot pass `input_values` when `inputs_embeds` is provided."
                 )
 
+        return inputs_embeds
+
+    def forward(
+        self,
+        input_ids: Optional[Tensor] = None,
+        input_values: Optional[Tensor] = None,
+        inputs_embeds: Optional[Tensor] = None,
+        audio_length: Optional[Iterable[int]] = None,
+        audio_token_id: Optional[int] = None,
+        **kwargs: Any,
+    ):
+        inputs_embeds = self._prepare_inputs_embeds(
+            input_ids=input_ids,
+            input_values=input_values,
+            inputs_embeds=inputs_embeds,
+            audio_length=audio_length,
+            audio_token_id=audio_token_id,
+        )
         return self.decoder(
             input_ids=None,
             inputs_embeds=inputs_embeds,
@@ -563,35 +565,13 @@
         audio_token_id: Optional[int] = None,
         **kwargs,
     ):
-        if input_ids is not None:
-            if inputs_embeds is not None:
-                raise ValueError(
-                    "Both `inputs_embeds` and `input_ids` are passed. Please pass only one of them."
-                )
-            input_ids = input_ids.to(self.device)
-
-            if input_values is not None:
-                input_values = input_values.to(self.device)
-                audio_encoder_hidden_states = self._forward_audio_encoder(
-                    input_values, audio_length=audio_length
-                )
-            else:
-                audio_encoder_hidden_states = None
-            inputs_embeds = self._prepare_with_input_ids(
-                input_ids=input_ids,
-                audio_embeddings=audio_encoder_hidden_states,
-                audio_token_id=audio_token_id,
-            )
-        else:
-            if inputs_embeds is None:
-                raise ValueError(
-                    "Either `input_ids` or `inputs_embeds` must be passed."
-                )
-            if input_values is not None:
-                raise ValueError(
-                    "Cannot pass `input_values` when `inputs_embeds` is provided."
-                )
-
+        inputs_embeds = self._prepare_inputs_embeds(
+            input_ids=input_ids,
+            input_values=input_values,
+            inputs_embeds=inputs_embeds,
+            audio_length=audio_length,
+            audio_token_id=audio_token_id,
+        )
         return self.decoder.generate(
             inputs_embeds=inputs_embeds,
             generation_config=kwargs.pop("generation_config", self.generation_config),
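The core of this refactor is the vectorized placeholder filling in `_prepare_inputs_embeds`: instead of per-row boolean-mask assignment, the new code diffs a flattened 0/1 mask of audio placeholder tokens to locate each contiguous span, then copies one audio clip's embeddings into each span. A standalone sketch with made-up values (the `audio_token_id` and embedding tensors are toy inputs; `zip(strict=True)` needs Python 3.10 or newer):

```python
import torch

audio_token_id = 151646  # made-up placeholder id, for illustration only
input_ids = torch.tensor([[5, audio_token_id, audio_token_id, 9, audio_token_id, 7]])

# 0/1 mask of placeholder positions; diff marks span starts (+1) and ends (-1).
audio_mask = (input_ids == audio_token_id).flatten()
diff = torch.diff(
    audio_mask.long(),
    prepend=torch.zeros((1,), dtype=torch.long, device=audio_mask.device),
)
audio_span_starts = (diff == 1).nonzero()  # tensor([[1], [4]])
audio_span_ends = (diff == -1).nonzero()   # tensor([[3], [5]])
# Note: a span running to the very end of input_ids would produce no -1,
# so this demo keeps a non-audio token after each span.

# One (possibly padded) audio clip per span; copy the first span-length frames.
embed_dim = 4
inputs_embeds = torch.zeros(1, input_ids.shape[1], embed_dim)
audio_embeddings = torch.ones(2, 2, embed_dim)

embeds_view = inputs_embeds.view(-1, embed_dim)  # shares storage with inputs_embeds
for span_start, span_end, audio in zip(
    audio_span_starts, audio_span_ends, audio_embeddings, strict=True
):
    embeds_view[span_start:span_end] = audio[: span_end - span_start]

print(inputs_embeds[0, :, 0])  # tensor([0., 1., 1., 0., 1., 0.])
```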