Upload folder using huggingface_hub
- README.md +26 -7
- chat_template.jinja +7 -0
- config.json +6 -9
- generation_config.json +1 -1
- tokenizer_config.json +0 -1
README.md CHANGED

````diff
@@ -13,14 +13,24 @@ This tiny model is for debugging. It is randomly initialized with the config ada
 ### Example usage:
 
 ```python
+import unittest
+
+import torch
+
 import soundfile as sf
 from qwen_omni_utils import process_mm_info
-from transformers import
+from transformers import (
+    Qwen2_5OmniForConditionalGeneration,
+    Qwen2_5OmniPreTrainedModel,
+    Qwen2_5OmniProcessor,
+)
 
 model_id = "yujiepan/qwen2.5-omni-tiny-random"
 # model = Qwen2_5OmniModel.from_pretrained(model_id, torch_dtype="auto", device_map="auto").eval()
 # We recommend enabling flash_attention_2 for better acceleration and memory saving.
-
+
+Qwen2_5OmniPreTrainedModel._init_weights = unittest.mock.Mock()
+model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
     model_id,
     torch_dtype="auto",
     device_map="auto",
@@ -31,14 +41,16 @@ processor = Qwen2_5OmniProcessor.from_pretrained(model_id)
 conversation = [
     {
         "role": "system",
-        "content":
+        "content": [
+            {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
+        ],
     },
     {
         "role": "user",
         "content": [
             {"type": "text", "text": "Hi, can you tell me a joke?"},
-            {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"},
-            {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
+            # {"type": "audio", "audio": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"},
+            # {"type": "video", "video": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-Omni/draw.mp4"},
             {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
         ],
     },
@@ -57,6 +69,7 @@ inputs = inputs.to(model.device).to(model.dtype)
 text_ids, audio = model.generate(
     **inputs, use_audio_in_video=True,
     thinker_max_new_tokens=16, talker_max_new_tokens=16,
+    temperature=0.1,
 )
 
 text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
@@ -71,17 +84,20 @@ sf.write(
 ### Codes to create this repo:
 
 ```python
+import unittest
 from pathlib import Path
 
 import torch
 
+import accelerate
 from huggingface_hub import hf_hub_download
 from transformers import (
     AutoConfig,
     AutoModelForCausalLM,
     AutoTokenizer,
     GenerationConfig,
-
+    Qwen2_5OmniForConditionalGeneration,
+    Qwen2_5OmniPreTrainedModel,
     Qwen2_5OmniProcessor,
     pipeline,
     set_seed,
@@ -166,8 +182,11 @@ for _, info in spk_dict.items():
     info['cond'] = info['cond'][:, :config.token2wav_config.dit_config.enc_emb_dim].clone()
 torch.save(spk_dict, Path(save_folder, "spk_dict.pt"))
 
+# patch for non-affine layernorm
+Qwen2_5OmniPreTrainedModel._init_weights = unittest.mock.Mock()
+
 torch.set_default_dtype(torch.bfloat16)
-model =
+model = Qwen2_5OmniForConditionalGeneration(
     config,
 )
 torch.set_default_dtype(torch.float32)
````
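A note on the `_init_weights` patch this commit adds: the in-diff comment calls it a "patch for non-affine layernorm". Below is a minimal sketch of the pattern in isolation, under the assumption (not stated in the diff) that transformers' generic `_init_weights` would otherwise touch LayerNorms created with `elementwise_affine=False`; the sketch uses `from unittest import mock` because a bare `import unittest` is not guaranteed to load the `unittest.mock` submodule.

```python
# Sketch of the no-op init patch used in this commit (assumptions noted above).
from unittest import mock

from transformers import Qwen2_5OmniPreTrainedModel

# Disable per-module weight initialization so building/loading the randomly
# initialized model does not re-init (and potentially crash on) affine-less norms.
Qwen2_5OmniPreTrainedModel._init_weights = mock.Mock()
```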
chat_template.jinja ADDED

```diff
@@ -0,0 +1,7 @@
+{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
+You are a helpful assistant.<|im_end|>
+{% endif %}<|im_start|>{{ message['role'] }}
+{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
+{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
+{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
+{% endif %}
```
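The template can be sanity-checked without loading any weights. A hedged sketch using this repo's processor (it assumes `apply_chat_template` picks up the standalone template file, as recent transformers releases do):

```python
# Render the Jinja chat template above into a prompt string.
from transformers import Qwen2_5OmniProcessor

processor = Qwen2_5OmniProcessor.from_pretrained("yujiepan/qwen2.5-omni-tiny-random")
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Hi, can you tell me a joke?"},
            {"type": "image", "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"},
        ],
    },
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
print(prompt)
# Expected shape per the template: an injected default system turn, then the user
# turn with the image entry expanded to <|vision_bos|><|IMAGE|><|vision_eos|>.
```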
config.json CHANGED

```diff
@@ -1,13 +1,12 @@
 {
   "architectures": [
-    "
+    "Qwen2_5OmniForConditionalGeneration"
   ],
   "enable_audio_output": true,
   "enable_talker": true,
   "model_type": "qwen2_5_omni",
   "talker_config": {
     "_attn_implementation_autoset": true,
-    "_name_or_path": "Qwen2.5-Omni-7B/talker",
     "architectures": [
       "Qwen2OmniTalkerForConditionalGeneration"
     ],
@@ -61,13 +60,11 @@
   },
   "thinker_config": {
     "_attn_implementation_autoset": true,
-    "_name_or_path": "Qwen2.5-Omni-7B/thinker",
     "architectures": [
       "Qwen2OmniNaViTThinkerForConditionalGeneration"
     ],
     "audio_config": {
       "_attn_implementation_autoset": true,
-      "_name_or_path": "",
       "activation_dropout": 0.0,
       "activation_function": "gelu",
       "add_cross_attention": false,
@@ -99,6 +96,7 @@
         "1": "LABEL_1"
       },
       "init_std": 0.02,
+      "initializer_range": 0.02,
       "is_decoder": false,
       "is_encoder_decoder": false,
       "label2id": {
@@ -153,13 +151,13 @@
     "ignore_index": -100,
     "image_token_index": 151655,
     "init_std": 0.02,
+    "initializer_range": 0.02,
     "model_type": "qwen2_5_omni_thinker",
     "pad_token_id": 151643,
     "position_id_per_seconds": 25,
     "seconds_per_chunk": 2,
     "text_config": {
       "_attn_implementation_autoset": false,
-      "_name_or_path": "",
       "add_cross_attention": false,
       "architectures": null,
       "attention_dropout": 0.0,
@@ -185,6 +183,7 @@
         "1": "LABEL_1"
       },
       "init_std": 0.02,
+      "initializer_range": 0.02,
       "intermediate_size": 32,
       "is_decoder": false,
       "is_encoder_decoder": false,
@@ -251,7 +250,6 @@
     "video_token_index": 151656,
     "vision_config": {
       "_attn_implementation_autoset": true,
-      "_name_or_path": "",
       "add_cross_attention": false,
       "architectures": null,
       "bad_words_ids": null,
@@ -283,6 +281,7 @@
       "in_channels": 3,
       "in_chans": 3,
       "init_std": 0.02,
+      "initializer_range": 0.02,
       "intermediate_size": 32,
       "is_decoder": false,
       "is_encoder_decoder": false,
@@ -340,7 +339,6 @@
     "_attn_implementation_autoset": true,
     "bigvgan_config": {
       "_attn_implementation_autoset": true,
-      "_name_or_path": "",
       "add_cross_attention": false,
       "architectures": null,
       "bad_words_ids": null,
@@ -425,7 +423,6 @@
     },
     "dit_config": {
       "_attn_implementation_autoset": true,
-      "_name_or_path": "",
       "add_cross_attention": false,
       "architectures": null,
       "bad_words_ids": null,
@@ -534,5 +531,5 @@
     "model_type": "qwen2_5_omni_token2wav"
   },
   "torch_dtype": "bfloat16",
-  "transformers_version": "4.
+  "transformers_version": "4.52.0.dev0"
 }
```
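The renamed `architectures` entry and the added `initializer_range` keys can be checked from the config alone; a small sketch (the nested attribute path is an assumption based on the layout above):

```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("yujiepan/qwen2.5-omni-tiny-random")
print(config.architectures)  # ['Qwen2_5OmniForConditionalGeneration'] per this commit
# initializer_range now mirrors init_std in the sub-configs (attribute path assumed):
print(config.thinker_config.text_config.initializer_range)  # 0.02
```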
generation_config.json CHANGED

```diff
@@ -1,4 +1,4 @@
 {
   "_from_model_config": true,
-  "transformers_version": "4.
+  "transformers_version": "4.52.0.dev0"
 }
```
tokenizer_config.json CHANGED

```diff
@@ -197,7 +197,6 @@
   "audio_eos_token": "<|audio_eos|>",
   "audio_token": "<|AUDIO|>",
   "bos_token": null,
-  "chat_template": "{% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|im_end|>",
   "errors": "replace",
```
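The `chat_template` string removed here is the same template added as the standalone `chat_template.jinja` above, so the tokenizer should still resolve it. A hedged check (assumes a transformers version new enough to read the separate template file):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("yujiepan/qwen2.5-omni-tiny-random")
# The template is now sourced from chat_template.jinja, not tokenizer_config.json.
assert tok.chat_template is not None
assert "<|vision_bos|>" in tok.chat_template
```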