wolfofbackstreet commited on
Commit
dbe5cb4
·
verified ·
1 Parent(s): e5d3d3c

Add 4-bit quantized model

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|AUDIO|>": 151646,
5
+ "<|IMAGE|>": 151655,
6
+ "<|VIDEO|>": 151656,
7
+ "<|audio_bos|>": 151647,
8
+ "<|audio_eos|>": 151648,
9
+ "<|box_end|>": 151649,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|quad_end|>": 151651,
19
+ "<|quad_start|>": 151650,
20
+ "<|repo_name|>": 151663,
21
+ "<|vision_bos|>": 151652,
22
+ "<|vision_eos|>": 151653,
23
+ "<|vision_pad|>": 151654
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set audio_count = namespace(value=0) %}{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_bos|><|IMAGE|><|vision_eos|>{% elif content['type'] == 'audio' or 'audio' in content or 'audio_url' in content %}{% set audio_count.value = audio_count.value + 1 %}{% if add_audio_id %}Audio {{ audio_count.value }}: {% endif %}<|audio_bos|><|AUDIO|><|audio_eos|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_bos|><|VIDEO|><|vision_eos|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,567 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_attn_implementation_autoset": true,
3
+ "architectures": [
4
+ "Qwen2_5OmniModel"
5
+ ],
6
+ "enable_audio_output": true,
7
+ "enable_talker": true,
8
+ "model_type": "qwen2_5_omni",
9
+ "talker_config": {
10
+ "_attn_implementation_autoset": true,
11
+ "architectures": [
12
+ "Qwen2OmniTalkerForConditionalGeneration"
13
+ ],
14
+ "attention_dropout": 0.0,
15
+ "audio_end_token_id": 151648,
16
+ "audio_start_token_id": 151647,
17
+ "audio_token_index": 151646,
18
+ "embedding_size": 2048,
19
+ "head_dim": 64,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 896,
22
+ "image_token_index": 151655,
23
+ "init_std": 0.02,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 4864,
26
+ "max_position_embeddings": 32768,
27
+ "max_window_layers": 28,
28
+ "model_type": "qwen2_5_omni_talker",
29
+ "num_attention_heads": 14,
30
+ "num_hidden_layers": 24,
31
+ "num_key_value_heads": 2,
32
+ "position_id_per_seconds": 25,
33
+ "rms_norm_eps": 1e-06,
34
+ "rope_scaling": {
35
+ "mrope_section": [
36
+ 16,
37
+ 16,
38
+ 0
39
+ ],
40
+ "rope_type": "default",
41
+ "type": "default"
42
+ },
43
+ "rope_theta": 1000000.0,
44
+ "seconds_per_chunk": 2,
45
+ "sliding_window": 32768,
46
+ "spatial_merge_size": 2,
47
+ "torch_dtype": "float16",
48
+ "tts_codec_end_token_id": 8294,
49
+ "tts_codec_mask_token_id": 8296,
50
+ "tts_codec_pad_token_id": 8292,
51
+ "tts_codec_start_token_id": 8293,
52
+ "tts_text_end_token_id": 151861,
53
+ "tts_text_pad_token_id": 151859,
54
+ "tts_text_start_token_id": 151860,
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "video_token_index": 151656,
58
+ "vision_end_token_id": 151653,
59
+ "vision_start_token_id": 151652,
60
+ "vocab_size": 8448
61
+ },
62
+ "thinker_config": {
63
+ "_attn_implementation_autoset": true,
64
+ "architectures": [
65
+ "Qwen2OmniNaViTThinkerForConditionalGeneration"
66
+ ],
67
+ "audio_config": {
68
+ "_attn_implementation_autoset": true,
69
+ "activation_dropout": 0.0,
70
+ "activation_function": "gelu",
71
+ "add_cross_attention": false,
72
+ "architectures": null,
73
+ "attention_dropout": 0.0,
74
+ "bad_words_ids": null,
75
+ "begin_suppress_tokens": null,
76
+ "bos_token_id": null,
77
+ "chunk_size_feed_forward": 0,
78
+ "cross_attention_hidden_size": null,
79
+ "d_model": 1280,
80
+ "decoder_start_token_id": null,
81
+ "diversity_penalty": 0.0,
82
+ "do_sample": false,
83
+ "dropout": 0.0,
84
+ "early_stopping": false,
85
+ "encoder_attention_heads": 20,
86
+ "encoder_ffn_dim": 5120,
87
+ "encoder_layerdrop": 0.0,
88
+ "encoder_layers": 32,
89
+ "encoder_no_repeat_ngram_size": 0,
90
+ "eos_token_id": null,
91
+ "exponential_decay_length_penalty": null,
92
+ "finetuning_task": null,
93
+ "forced_bos_token_id": null,
94
+ "forced_eos_token_id": null,
95
+ "id2label": {
96
+ "0": "LABEL_0",
97
+ "1": "LABEL_1"
98
+ },
99
+ "init_std": 0.02,
100
+ "initializer_range": 0.02,
101
+ "is_decoder": false,
102
+ "is_encoder_decoder": false,
103
+ "label2id": {
104
+ "LABEL_0": 0,
105
+ "LABEL_1": 1
106
+ },
107
+ "length_penalty": 1.0,
108
+ "max_length": 20,
109
+ "max_source_positions": 1500,
110
+ "min_length": 0,
111
+ "model_type": "qwen2_5_omni_audio_encoder",
112
+ "n_window": 100,
113
+ "no_repeat_ngram_size": 0,
114
+ "num_beam_groups": 1,
115
+ "num_beams": 1,
116
+ "num_hidden_layers": 32,
117
+ "num_mel_bins": 128,
118
+ "num_return_sequences": 1,
119
+ "output_attentions": false,
120
+ "output_dim": 2048,
121
+ "output_hidden_states": false,
122
+ "output_scores": false,
123
+ "pad_token_id": null,
124
+ "prefix": null,
125
+ "problem_type": null,
126
+ "pruned_heads": {},
127
+ "remove_invalid_values": false,
128
+ "repetition_penalty": 1.0,
129
+ "return_dict": true,
130
+ "return_dict_in_generate": false,
131
+ "scale_embedding": false,
132
+ "sep_token_id": null,
133
+ "suppress_tokens": null,
134
+ "task_specific_params": null,
135
+ "temperature": 1.0,
136
+ "tf_legacy_loss": false,
137
+ "tie_encoder_decoder": false,
138
+ "tie_word_embeddings": true,
139
+ "tokenizer_class": null,
140
+ "top_k": 50,
141
+ "top_p": 1.0,
142
+ "torch_dtype": null,
143
+ "torchscript": false,
144
+ "typical_p": 1.0,
145
+ "use_bfloat16": false
146
+ },
147
+ "audio_end_token_id": 151648,
148
+ "audio_start_token_id": 151647,
149
+ "audio_token_index": 151646,
150
+ "bos_token_id": 151644,
151
+ "eos_token_id": 151645,
152
+ "ignore_index": -100,
153
+ "image_token_index": 151655,
154
+ "init_std": 0.02,
155
+ "initializer_range": 0.02,
156
+ "model_type": "qwen2_5_omni_thinker",
157
+ "pad_token_id": 151643,
158
+ "position_id_per_seconds": 25,
159
+ "seconds_per_chunk": 2,
160
+ "text_config": {
161
+ "_attn_implementation_autoset": false,
162
+ "add_cross_attention": false,
163
+ "architectures": null,
164
+ "attention_dropout": 0.0,
165
+ "bad_words_ids": null,
166
+ "begin_suppress_tokens": null,
167
+ "bos_token_id": null,
168
+ "chunk_size_feed_forward": 0,
169
+ "cross_attention_hidden_size": null,
170
+ "decoder_start_token_id": null,
171
+ "diversity_penalty": 0.0,
172
+ "do_sample": false,
173
+ "early_stopping": false,
174
+ "encoder_no_repeat_ngram_size": 0,
175
+ "eos_token_id": null,
176
+ "exponential_decay_length_penalty": null,
177
+ "finetuning_task": null,
178
+ "forced_bos_token_id": null,
179
+ "forced_eos_token_id": null,
180
+ "hidden_act": "silu",
181
+ "hidden_size": 2048,
182
+ "id2label": {
183
+ "0": "LABEL_0",
184
+ "1": "LABEL_1"
185
+ },
186
+ "init_std": 0.02,
187
+ "initializer_range": 0.02,
188
+ "intermediate_size": 11008,
189
+ "is_decoder": false,
190
+ "is_encoder_decoder": false,
191
+ "label2id": {
192
+ "LABEL_0": 0,
193
+ "LABEL_1": 1
194
+ },
195
+ "length_penalty": 1.0,
196
+ "max_length": 20,
197
+ "max_position_embeddings": 32768,
198
+ "max_window_layers": 70,
199
+ "min_length": 0,
200
+ "model_type": "qwen2_5_omni_text",
201
+ "no_repeat_ngram_size": 0,
202
+ "num_attention_heads": 16,
203
+ "num_beam_groups": 1,
204
+ "num_beams": 1,
205
+ "num_hidden_layers": 36,
206
+ "num_key_value_heads": 2,
207
+ "num_return_sequences": 1,
208
+ "output_attentions": false,
209
+ "output_hidden_states": false,
210
+ "output_scores": false,
211
+ "pad_token_id": null,
212
+ "prefix": null,
213
+ "problem_type": null,
214
+ "pruned_heads": {},
215
+ "remove_invalid_values": false,
216
+ "repetition_penalty": 1.0,
217
+ "return_dict": true,
218
+ "return_dict_in_generate": false,
219
+ "rms_norm_eps": 1e-06,
220
+ "rope_scaling": {
221
+ "mrope_section": [
222
+ 16,
223
+ 24,
224
+ 24
225
+ ],
226
+ "rope_type": "default",
227
+ "type": "default"
228
+ },
229
+ "rope_theta": 1000000.0,
230
+ "sep_token_id": null,
231
+ "sliding_window": 32768,
232
+ "suppress_tokens": null,
233
+ "task_specific_params": null,
234
+ "temperature": 1.0,
235
+ "tf_legacy_loss": false,
236
+ "tie_encoder_decoder": false,
237
+ "tie_word_embeddings": false,
238
+ "tokenizer_class": null,
239
+ "top_k": 50,
240
+ "top_p": 1.0,
241
+ "torch_dtype": null,
242
+ "torchscript": false,
243
+ "typical_p": 1.0,
244
+ "use_bfloat16": false,
245
+ "use_cache": true,
246
+ "use_sliding_window": false,
247
+ "vocab_size": 151936
248
+ },
249
+ "torch_dtype": "float16",
250
+ "user_token_id": 872,
251
+ "video_token_index": 151656,
252
+ "vision_config": {
253
+ "_attn_implementation_autoset": true,
254
+ "add_cross_attention": false,
255
+ "architectures": null,
256
+ "bad_words_ids": null,
257
+ "begin_suppress_tokens": null,
258
+ "bos_token_id": null,
259
+ "chunk_size_feed_forward": 0,
260
+ "cross_attention_hidden_size": null,
261
+ "decoder_start_token_id": null,
262
+ "depth": 32,
263
+ "diversity_penalty": 0.0,
264
+ "do_sample": false,
265
+ "early_stopping": false,
266
+ "embed_dim": 1280,
267
+ "encoder_no_repeat_ngram_size": 0,
268
+ "eos_token_id": null,
269
+ "exponential_decay_length_penalty": null,
270
+ "finetuning_task": null,
271
+ "forced_bos_token_id": null,
272
+ "forced_eos_token_id": null,
273
+ "fullatt_block_indexes": [
274
+ 7,
275
+ 15,
276
+ 23,
277
+ 31
278
+ ],
279
+ "hidden_act": "silu",
280
+ "hidden_size": 1280,
281
+ "id2label": {
282
+ "0": "LABEL_0",
283
+ "1": "LABEL_1"
284
+ },
285
+ "in_channels": 3,
286
+ "in_chans": 3,
287
+ "init_std": 0.02,
288
+ "initializer_range": 0.02,
289
+ "intermediate_size": 3420,
290
+ "is_decoder": false,
291
+ "is_encoder_decoder": false,
292
+ "label2id": {
293
+ "LABEL_0": 0,
294
+ "LABEL_1": 1
295
+ },
296
+ "length_penalty": 1.0,
297
+ "max_length": 20,
298
+ "min_length": 0,
299
+ "model_type": "qwen2_5_omni_vision_encoder",
300
+ "no_repeat_ngram_size": 0,
301
+ "num_beam_groups": 1,
302
+ "num_beams": 1,
303
+ "num_heads": 16,
304
+ "num_return_sequences": 1,
305
+ "out_hidden_size": 2048,
306
+ "output_attentions": false,
307
+ "output_hidden_states": false,
308
+ "output_scores": false,
309
+ "pad_token_id": null,
310
+ "patch_size": 14,
311
+ "prefix": null,
312
+ "problem_type": null,
313
+ "pruned_heads": {},
314
+ "remove_invalid_values": false,
315
+ "repetition_penalty": 1.0,
316
+ "return_dict": true,
317
+ "return_dict_in_generate": false,
318
+ "sep_token_id": null,
319
+ "spatial_merge_size": 2,
320
+ "spatial_patch_size": 14,
321
+ "suppress_tokens": null,
322
+ "task_specific_params": null,
323
+ "temperature": 1.0,
324
+ "temporal_patch_size": 2,
325
+ "tf_legacy_loss": false,
326
+ "tie_encoder_decoder": false,
327
+ "tie_word_embeddings": true,
328
+ "tokenizer_class": null,
329
+ "tokens_per_second": 25,
330
+ "top_k": 50,
331
+ "top_p": 1.0,
332
+ "torch_dtype": null,
333
+ "torchscript": false,
334
+ "typical_p": 1.0,
335
+ "use_bfloat16": false,
336
+ "window_size": 112
337
+ },
338
+ "vision_end_token_id": 151653,
339
+ "vision_start_token_id": 151652,
340
+ "vision_token_id": 151654
341
+ },
342
+ "token2wav_config": {
343
+ "_attn_implementation_autoset": true,
344
+ "bigvgan_config": {
345
+ "_attn_implementation_autoset": true,
346
+ "add_cross_attention": false,
347
+ "architectures": null,
348
+ "bad_words_ids": null,
349
+ "begin_suppress_tokens": null,
350
+ "bos_token_id": null,
351
+ "chunk_size_feed_forward": 0,
352
+ "cross_attention_hidden_size": null,
353
+ "decoder_start_token_id": null,
354
+ "diversity_penalty": 0.0,
355
+ "do_sample": false,
356
+ "early_stopping": false,
357
+ "encoder_no_repeat_ngram_size": 0,
358
+ "eos_token_id": null,
359
+ "exponential_decay_length_penalty": null,
360
+ "finetuning_task": null,
361
+ "forced_bos_token_id": null,
362
+ "forced_eos_token_id": null,
363
+ "id2label": {
364
+ "0": "LABEL_0",
365
+ "1": "LABEL_1"
366
+ },
367
+ "is_decoder": false,
368
+ "is_encoder_decoder": false,
369
+ "label2id": {
370
+ "LABEL_0": 0,
371
+ "LABEL_1": 1
372
+ },
373
+ "length_penalty": 1.0,
374
+ "max_length": 20,
375
+ "mel_dim": 80,
376
+ "min_length": 0,
377
+ "model_type": "qwen2_5_omni_bigvgan",
378
+ "no_repeat_ngram_size": 0,
379
+ "num_beam_groups": 1,
380
+ "num_beams": 1,
381
+ "num_return_sequences": 1,
382
+ "output_attentions": false,
383
+ "output_hidden_states": false,
384
+ "output_scores": false,
385
+ "pad_token_id": null,
386
+ "prefix": null,
387
+ "problem_type": null,
388
+ "pruned_heads": {},
389
+ "remove_invalid_values": false,
390
+ "repetition_penalty": 1.0,
391
+ "resblock_dilation_sizes": [
392
+ [
393
+ 1,
394
+ 3,
395
+ 5
396
+ ],
397
+ [
398
+ 1,
399
+ 3,
400
+ 5
401
+ ],
402
+ [
403
+ 1,
404
+ 3,
405
+ 5
406
+ ]
407
+ ],
408
+ "resblock_kernel_sizes": [
409
+ 3,
410
+ 7,
411
+ 11
412
+ ],
413
+ "return_dict": true,
414
+ "return_dict_in_generate": false,
415
+ "sep_token_id": null,
416
+ "suppress_tokens": null,
417
+ "task_specific_params": null,
418
+ "temperature": 1.0,
419
+ "tf_legacy_loss": false,
420
+ "tie_encoder_decoder": false,
421
+ "tie_word_embeddings": true,
422
+ "tokenizer_class": null,
423
+ "top_k": 50,
424
+ "top_p": 1.0,
425
+ "torch_dtype": null,
426
+ "torchscript": false,
427
+ "typical_p": 1.0,
428
+ "upsample_initial_channel": 1536,
429
+ "upsample_kernel_sizes": [
430
+ 11,
431
+ 7,
432
+ 4,
433
+ 4,
434
+ 4,
435
+ 4
436
+ ],
437
+ "upsample_rates": [
438
+ 5,
439
+ 3,
440
+ 2,
441
+ 2,
442
+ 2,
443
+ 2
444
+ ],
445
+ "use_bfloat16": false,
446
+ "use_bias_at_final": false
447
+ },
448
+ "dit_config": {
449
+ "_attn_implementation_autoset": true,
450
+ "add_cross_attention": false,
451
+ "architectures": null,
452
+ "bad_words_ids": null,
453
+ "begin_suppress_tokens": null,
454
+ "block_size": 24,
455
+ "bos_token_id": null,
456
+ "chunk_size_feed_forward": 0,
457
+ "cross_attention_hidden_size": null,
458
+ "decoder_start_token_id": null,
459
+ "depth": 22,
460
+ "dim": 1024,
461
+ "diversity_penalty": 0.0,
462
+ "do_sample": false,
463
+ "dropout": 0.1,
464
+ "early_stopping": false,
465
+ "emb_dim": 512,
466
+ "enc_attention_channels": 64,
467
+ "enc_channels": [
468
+ 256,
469
+ 256,
470
+ 256,
471
+ 256,
472
+ 768
473
+ ],
474
+ "enc_dilations": [
475
+ 1,
476
+ 2,
477
+ 3,
478
+ 4,
479
+ 1
480
+ ],
481
+ "enc_dim": 128,
482
+ "enc_emb_dim": 192,
483
+ "enc_global_context": true,
484
+ "enc_kernel_sizes": [
485
+ 5,
486
+ 3,
487
+ 3,
488
+ 3,
489
+ 1
490
+ ],
491
+ "enc_lin_neurons": 192,
492
+ "enc_res2net_scale": 2,
493
+ "enc_se_channels": 64,
494
+ "encoder_no_repeat_ngram_size": 0,
495
+ "eos_token_id": null,
496
+ "exponential_decay_length_penalty": null,
497
+ "ff_mult": 2,
498
+ "finetuning_task": null,
499
+ "forced_bos_token_id": null,
500
+ "forced_eos_token_id": null,
501
+ "head_dim": 64,
502
+ "heads": 16,
503
+ "hidden_size": 1024,
504
+ "id2label": {
505
+ "0": "LABEL_0",
506
+ "1": "LABEL_1"
507
+ },
508
+ "is_decoder": false,
509
+ "is_encoder_decoder": false,
510
+ "label2id": {
511
+ "LABEL_0": 0,
512
+ "LABEL_1": 1
513
+ },
514
+ "length_penalty": 1.0,
515
+ "look_ahead_layers": [
516
+ 10
517
+ ],
518
+ "look_backward_layers": [
519
+ 0,
520
+ 20
521
+ ],
522
+ "max_length": 20,
523
+ "max_position_embeddings": 32768,
524
+ "mel_dim": 80,
525
+ "min_length": 0,
526
+ "model_type": "qwen2_5_omni_dit",
527
+ "no_repeat_ngram_size": 0,
528
+ "num_attention_heads": 16,
529
+ "num_beam_groups": 1,
530
+ "num_beams": 1,
531
+ "num_embeds": 8193,
532
+ "num_hidden_layers": 22,
533
+ "num_return_sequences": 1,
534
+ "output_attentions": false,
535
+ "output_hidden_states": false,
536
+ "output_scores": false,
537
+ "pad_token_id": null,
538
+ "prefix": null,
539
+ "problem_type": null,
540
+ "pruned_heads": {},
541
+ "remove_invalid_values": false,
542
+ "repeats": 2,
543
+ "repetition_penalty": 1.0,
544
+ "return_dict": true,
545
+ "return_dict_in_generate": false,
546
+ "rope_theta": 10000.0,
547
+ "sep_token_id": null,
548
+ "suppress_tokens": null,
549
+ "task_specific_params": null,
550
+ "temperature": 1.0,
551
+ "tf_legacy_loss": false,
552
+ "tie_encoder_decoder": false,
553
+ "tie_word_embeddings": true,
554
+ "tokenizer_class": null,
555
+ "top_k": 50,
556
+ "top_p": 1.0,
557
+ "torch_dtype": "float32",
558
+ "torchscript": false,
559
+ "typical_p": 1.0,
560
+ "use_bfloat16": false
561
+ },
562
+ "model_type": "qwen2_5_omni_token2wav",
563
+ "torch_dtype": "float16"
564
+ },
565
+ "torch_dtype": "float16",
566
+ "transformers_version": "4.52.0.dev0"
567
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
openvino_token2wav_bigvgan_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e136207f00108aa77e2dfc8af408e3b7826c86dfcf3c32ec3b1078924aca233
3
+ size 230959656
openvino_token2wav_bigvgan_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
openvino_token2wav_dit_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cee861e1bf6a31dc342838e819be18747a417c0d65added8b9272a31b41c190d
3
+ size 667216186
openvino_token2wav_dit_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 300,
3
+ "dither": 0.0,
4
+ "feature_extractor_type": "WhisperFeatureExtractor",
5
+ "feature_size": 128,
6
+ "hop_length": 160,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_processor_type": "Qwen2VLImageProcessor",
13
+ "image_std": [
14
+ 0.26862954,
15
+ 0.26130258,
16
+ 0.27577711
17
+ ],
18
+ "max_pixels": 12845056,
19
+ "merge_size": 2,
20
+ "min_pixels": 3136,
21
+ "n_fft": 400,
22
+ "n_samples": 4800000,
23
+ "nb_max_frames": 30000,
24
+ "padding_side": "right",
25
+ "padding_value": 0.0,
26
+ "patch_size": 14,
27
+ "processor_class": "Qwen2_5OmniProcessor",
28
+ "return_attention_mask": true,
29
+ "sampling_rate": 16000,
30
+ "temporal_patch_size": 2
31
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|AUDIO|>",
6
+ "<|audio_bos|>",
7
+ "<|audio_eos|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_bos|>",
12
+ "<|vision_eos|>",
13
+ "<|vision_pad|>",
14
+ "<|IMAGE|>",
15
+ "<|VIDEO|>"
16
+ ],
17
+ "audio_bos_token": "<|audio_bos|>",
18
+ "audio_eos_token": "<|audio_eos|>",
19
+ "audio_token": "<|AUDIO|>",
20
+ "eos_token": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "image_token": "<|IMAGE|>",
28
+ "pad_token": {
29
+ "content": "<|endoftext|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "video_token": "<|VIDEO|>",
36
+ "vision_bos_token": "<|vision_bos|>",
37
+ "vision_eos_token": "<|vision_eos|>"
38
+ }
spk_dict.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a05609b28f5d42b7b748f0f07592545c8f1f6885b9ae8fff64baf56e86b2a18
3
+ size 259544
talker/openvino_talker_embedding_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fbd8598119993f4f77dc244e7b0c9dc65fa15f6aee29d405bd51b132fa5a2011
3
+ size 34603012
talker/openvino_talker_embedding_model.xml ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <net name="Model540" version="11">
3
+ <layers>
4
+ <layer id="0" name="input" type="Parameter" version="opset1">
5
+ <data shape="?,?" element_type="i64" />
6
+ <output>
7
+ <port id="0" precision="I64" names="input">
8
+ <dim>-1</dim>
9
+ <dim>-1</dim>
10
+ </port>
11
+ </output>
12
+ </layer>
13
+ <layer id="1" name="self.weight" type="Const" version="opset1">
14
+ <data element_type="f16" shape="8448, 2048" offset="0" size="34603008" />
15
+ <output>
16
+ <port id="0" precision="FP16" names="self.weight">
17
+ <dim>8448</dim>
18
+ <dim>2048</dim>
19
+ </port>
20
+ </output>
21
+ </layer>
22
+ <layer id="2" name="ov_ext::embedding/Convert" type="Convert" version="opset1">
23
+ <data destination_type="f32" />
24
+ <rt_info>
25
+ <attribute name="decompression" version="0" />
26
+ </rt_info>
27
+ <input>
28
+ <port id="0" precision="FP16">
29
+ <dim>8448</dim>
30
+ <dim>2048</dim>
31
+ </port>
32
+ </input>
33
+ <output>
34
+ <port id="1" precision="FP32">
35
+ <dim>8448</dim>
36
+ <dim>2048</dim>
37
+ </port>
38
+ </output>
39
+ </layer>
40
+ <layer id="3" name="ov_ext::embedding/Convert_1" type="Convert" version="opset1">
41
+ <data destination_type="i32" />
42
+ <input>
43
+ <port id="0" precision="I64">
44
+ <dim>-1</dim>
45
+ <dim>-1</dim>
46
+ </port>
47
+ </input>
48
+ <output>
49
+ <port id="1" precision="I32">
50
+ <dim>-1</dim>
51
+ <dim>-1</dim>
52
+ </port>
53
+ </output>
54
+ </layer>
55
+ <layer id="4" name="ov_ext::embedding/Constant" type="Const" version="opset1">
56
+ <data element_type="i32" shape="" offset="34603008" size="4" />
57
+ <output>
58
+ <port id="0" precision="I32" />
59
+ </output>
60
+ </layer>
61
+ <layer id="5" name="ov_ext::embedding/Gather" type="Gather" version="opset8">
62
+ <data batch_dims="0" />
63
+ <input>
64
+ <port id="0" precision="FP32">
65
+ <dim>8448</dim>
66
+ <dim>2048</dim>
67
+ </port>
68
+ <port id="1" precision="I32">
69
+ <dim>-1</dim>
70
+ <dim>-1</dim>
71
+ </port>
72
+ <port id="2" precision="I32" />
73
+ </input>
74
+ <output>
75
+ <port id="3" precision="FP32">
76
+ <dim>-1</dim>
77
+ <dim>-1</dim>
78
+ <dim>2048</dim>
79
+ </port>
80
+ </output>
81
+ </layer>
82
+ <layer id="6" name="Result_1395264" type="Result" version="opset1">
83
+ <input>
84
+ <port id="0" precision="FP32">
85
+ <dim>-1</dim>
86
+ <dim>-1</dim>
87
+ <dim>2048</dim>
88
+ </port>
89
+ </input>
90
+ </layer>
91
+ </layers>
92
+ <edges>
93
+ <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
94
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
95
+ <edge from-layer="2" from-port="1" to-layer="5" to-port="0" />
96
+ <edge from-layer="3" from-port="1" to-layer="5" to-port="1" />
97
+ <edge from-layer="4" from-port="0" to-layer="5" to-port="2" />
98
+ <edge from-layer="5" from-port="3" to-layer="6" to-port="0" />
99
+ </edges>
100
+ <rt_info>
101
+ <Runtime_version value="2025.1.0-18503-6fec06580ab-releases/2025/1" />
102
+ <conversion_parameters>
103
+ <framework value="pytorch" />
104
+ <is_python_object value="True" />
105
+ </conversion_parameters>
106
+ </rt_info>
107
+ </net>
talker/openvino_talker_language_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6c9d314bd535bc242a5c00ff3d3c0e2297c5d0cb5b1d3fa94a982c6a946e540
3
+ size 230992332
talker/openvino_talker_language_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
thinker/openvino_thinker_audio_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:088e30f86b9c0bb9af6f80948c1a7a2542f0daa3406e9f1deab5c9b729819399
3
+ size 1273932968
thinker/openvino_thinker_audio_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
thinker/openvino_thinker_audio_state_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c5cc6e3ab3076b77918d59d4d0c92851a07f8f7b082f58d6eb6765b66220584
3
+ size 5252116
thinker/openvino_thinker_audio_state_model.xml ADDED
@@ -0,0 +1,372 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <net name="Model6" version="11">
3
+ <layers>
4
+ <layer id="0" name="each_audio_states" type="Parameter" version="opset1">
5
+ <data shape="?,?" element_type="f32" />
6
+ <output>
7
+ <port id="0" precision="FP32" names="each_audio_states">
8
+ <dim>-1</dim>
9
+ <dim>-1</dim>
10
+ </port>
11
+ </output>
12
+ </layer>
13
+ <layer id="1" name="aten::transpose/Constant" type="Const" version="opset1">
14
+ <data element_type="i32" shape="2" offset="0" size="8" />
15
+ <output>
16
+ <port id="0" precision="I32">
17
+ <dim>2</dim>
18
+ </port>
19
+ </output>
20
+ </layer>
21
+ <layer id="2" name="aten::transpose/Transpose" type="Transpose" version="opset1">
22
+ <input>
23
+ <port id="0" precision="FP32">
24
+ <dim>-1</dim>
25
+ <dim>-1</dim>
26
+ </port>
27
+ <port id="1" precision="I32">
28
+ <dim>2</dim>
29
+ </port>
30
+ </input>
31
+ <output>
32
+ <port id="2" precision="FP32" names="7">
33
+ <dim>-1</dim>
34
+ <dim>-1</dim>
35
+ </port>
36
+ </output>
37
+ </layer>
38
+ <layer id="3" name="Constant_41972" type="Const" version="opset1">
39
+ <data element_type="i64" shape="1" offset="8" size="8" />
40
+ <output>
41
+ <port id="0" precision="I64">
42
+ <dim>1</dim>
43
+ </port>
44
+ </output>
45
+ </layer>
46
+ <layer id="4" name="__module.avg_pooler/aten::avg_pool1d/Unsqueeze" type="Unsqueeze" version="opset1">
47
+ <input>
48
+ <port id="0" precision="FP32">
49
+ <dim>-1</dim>
50
+ <dim>-1</dim>
51
+ </port>
52
+ <port id="1" precision="I64">
53
+ <dim>1</dim>
54
+ </port>
55
+ </input>
56
+ <output>
57
+ <port id="2" precision="FP32">
58
+ <dim>1</dim>
59
+ <dim>-1</dim>
60
+ <dim>-1</dim>
61
+ </port>
62
+ </output>
63
+ </layer>
64
+ <layer id="5" name="__module.avg_pooler/aten::avg_pool1d/AvgPool" type="AvgPool" version="opset14">
65
+ <data kernel="2" strides="2" pads_begin="0" pads_end="0" exclude-pad="false" auto_pad="explicit" rounding_type="floor" />
66
+ <input>
67
+ <port id="0" precision="FP32">
68
+ <dim>1</dim>
69
+ <dim>-1</dim>
70
+ <dim>-1</dim>
71
+ </port>
72
+ </input>
73
+ <output>
74
+ <port id="1" precision="FP32">
75
+ <dim>1</dim>
76
+ <dim>-1</dim>
77
+ <dim>-1</dim>
78
+ </port>
79
+ </output>
80
+ </layer>
81
+ <layer id="6" name="__module.avg_pooler/aten::avg_pool1d/Squeeze" type="Squeeze" version="opset1">
82
+ <input>
83
+ <port id="0" precision="FP32">
84
+ <dim>1</dim>
85
+ <dim>-1</dim>
86
+ <dim>-1</dim>
87
+ </port>
88
+ <port id="1" precision="I64">
89
+ <dim>1</dim>
90
+ </port>
91
+ </input>
92
+ <output>
93
+ <port id="2" precision="FP32" names="21_1">
94
+ <dim>-1</dim>
95
+ <dim>-1</dim>
96
+ </port>
97
+ </output>
98
+ </layer>
99
+ <layer id="7" name="aten::transpose_/Constant" type="Const" version="opset1">
100
+ <data element_type="i32" shape="2" offset="0" size="8" />
101
+ <output>
102
+ <port id="0" precision="I32">
103
+ <dim>2</dim>
104
+ </port>
105
+ </output>
106
+ </layer>
107
+ <layer id="8" name="aten::transpose_/Transpose" type="Transpose" version="opset1">
108
+ <input>
109
+ <port id="0" precision="FP32">
110
+ <dim>-1</dim>
111
+ <dim>-1</dim>
112
+ </port>
113
+ <port id="1" precision="I32">
114
+ <dim>2</dim>
115
+ </port>
116
+ </input>
117
+ <output>
118
+ <port id="2" precision="FP32" names="21">
119
+ <dim>-1</dim>
120
+ <dim>-1</dim>
121
+ </port>
122
+ </output>
123
+ </layer>
124
+ <layer id="9" name="__module.ln_post/aten::layer_norm/Multiply" type="Const" version="opset1">
125
+ <data element_type="i32" shape="1" offset="16" size="4" />
126
+ <output>
127
+ <port id="0" precision="I32">
128
+ <dim>1</dim>
129
+ </port>
130
+ </output>
131
+ </layer>
132
+ <layer id="10" name="__module.ln_post/aten::layer_norm/MVN" type="MVN" version="opset6">
133
+ <data eps="9.9999997473787516e-06" normalize_variance="true" eps_mode="INSIDE_SQRT" />
134
+ <input>
135
+ <port id="0" precision="FP32">
136
+ <dim>-1</dim>
137
+ <dim>-1</dim>
138
+ </port>
139
+ <port id="1" precision="I32">
140
+ <dim>1</dim>
141
+ </port>
142
+ </input>
143
+ <output>
144
+ <port id="2" precision="FP32">
145
+ <dim>-1</dim>
146
+ <dim>-1</dim>
147
+ </port>
148
+ </output>
149
+ </layer>
150
+ <layer id="11" name="Constant_43914_compressed" type="Const" version="opset1">
151
+ <data element_type="f16" shape="1, 1280" offset="20" size="2560" />
152
+ <output>
153
+ <port id="0" precision="FP16">
154
+ <dim>1</dim>
155
+ <dim>1280</dim>
156
+ </port>
157
+ </output>
158
+ </layer>
159
+ <layer id="12" name="Constant_43914" type="Convert" version="opset1">
160
+ <data destination_type="f32" />
161
+ <rt_info>
162
+ <attribute name="decompression" version="0" />
163
+ </rt_info>
164
+ <input>
165
+ <port id="0" precision="FP16">
166
+ <dim>1</dim>
167
+ <dim>1280</dim>
168
+ </port>
169
+ </input>
170
+ <output>
171
+ <port id="1" precision="FP32">
172
+ <dim>1</dim>
173
+ <dim>1280</dim>
174
+ </port>
175
+ </output>
176
+ </layer>
177
+ <layer id="13" name="__module.ln_post/aten::layer_norm/Multiply_1" type="Multiply" version="opset1">
178
+ <data auto_broadcast="numpy" />
179
+ <input>
180
+ <port id="0" precision="FP32">
181
+ <dim>-1</dim>
182
+ <dim>-1</dim>
183
+ </port>
184
+ <port id="1" precision="FP32">
185
+ <dim>1</dim>
186
+ <dim>1280</dim>
187
+ </port>
188
+ </input>
189
+ <output>
190
+ <port id="2" precision="FP32">
191
+ <dim>-1</dim>
192
+ <dim>1280</dim>
193
+ </port>
194
+ </output>
195
+ </layer>
196
+ <layer id="14" name="Constant_43915_compressed" type="Const" version="opset1">
197
+ <data element_type="f16" shape="1, 1280" offset="2580" size="2560" />
198
+ <output>
199
+ <port id="0" precision="FP16">
200
+ <dim>1</dim>
201
+ <dim>1280</dim>
202
+ </port>
203
+ </output>
204
+ </layer>
205
+ <layer id="15" name="Constant_43915" type="Convert" version="opset1">
206
+ <data destination_type="f32" />
207
+ <rt_info>
208
+ <attribute name="decompression" version="0" />
209
+ </rt_info>
210
+ <input>
211
+ <port id="0" precision="FP16">
212
+ <dim>1</dim>
213
+ <dim>1280</dim>
214
+ </port>
215
+ </input>
216
+ <output>
217
+ <port id="1" precision="FP32">
218
+ <dim>1</dim>
219
+ <dim>1280</dim>
220
+ </port>
221
+ </output>
222
+ </layer>
223
+ <layer id="16" name="__module.ln_post/aten::layer_norm/Add" type="Add" version="opset1">
224
+ <data auto_broadcast="numpy" />
225
+ <input>
226
+ <port id="0" precision="FP32">
227
+ <dim>-1</dim>
228
+ <dim>1280</dim>
229
+ </port>
230
+ <port id="1" precision="FP32">
231
+ <dim>1</dim>
232
+ <dim>1280</dim>
233
+ </port>
234
+ </input>
235
+ <output>
236
+ <port id="2" precision="FP32" names="28">
237
+ <dim>-1</dim>
238
+ <dim>1280</dim>
239
+ </port>
240
+ </output>
241
+ </layer>
242
+ <layer id="17" name="self.proj.weight" type="Const" version="opset1">
243
+ <data element_type="f16" shape="2048, 1280" offset="5140" size="5242880" />
244
+ <output>
245
+ <port id="0" precision="FP16" names="self.proj.weight">
246
+ <dim>2048</dim>
247
+ <dim>1280</dim>
248
+ </port>
249
+ </output>
250
+ </layer>
251
+ <layer id="18" name="__module.proj/ov_ext::linear/Convert" type="Convert" version="opset1">
252
+ <data destination_type="f32" />
253
+ <rt_info>
254
+ <attribute name="decompression" version="0" />
255
+ </rt_info>
256
+ <input>
257
+ <port id="0" precision="FP16">
258
+ <dim>2048</dim>
259
+ <dim>1280</dim>
260
+ </port>
261
+ </input>
262
+ <output>
263
+ <port id="1" precision="FP32">
264
+ <dim>2048</dim>
265
+ <dim>1280</dim>
266
+ </port>
267
+ </output>
268
+ </layer>
269
+ <layer id="19" name="__module.proj/ov_ext::linear/MatMul" type="MatMul" version="opset1">
270
+ <data transpose_a="false" transpose_b="true" />
271
+ <input>
272
+ <port id="0" precision="FP32">
273
+ <dim>-1</dim>
274
+ <dim>1280</dim>
275
+ </port>
276
+ <port id="1" precision="FP32">
277
+ <dim>2048</dim>
278
+ <dim>1280</dim>
279
+ </port>
280
+ </input>
281
+ <output>
282
+ <port id="2" precision="FP32">
283
+ <dim>-1</dim>
284
+ <dim>2048</dim>
285
+ </port>
286
+ </output>
287
+ </layer>
288
+ <layer id="20" name="self.proj.bias" type="Const" version="opset1">
289
+ <data element_type="f16" shape="2048" offset="5248020" size="4096" />
290
+ <output>
291
+ <port id="0" precision="FP16" names="self.proj.bias">
292
+ <dim>2048</dim>
293
+ </port>
294
+ </output>
295
+ </layer>
296
+ <layer id="21" name="__module.proj/ov_ext::linear/Convert_1" type="Convert" version="opset1">
297
+ <data destination_type="f32" />
298
+ <rt_info>
299
+ <attribute name="decompression" version="0" />
300
+ </rt_info>
301
+ <input>
302
+ <port id="0" precision="FP16">
303
+ <dim>2048</dim>
304
+ </port>
305
+ </input>
306
+ <output>
307
+ <port id="1" precision="FP32">
308
+ <dim>2048</dim>
309
+ </port>
310
+ </output>
311
+ </layer>
312
+ <layer id="22" name="__module.proj/ov_ext::linear/Add" type="Add" version="opset1">
313
+ <data auto_broadcast="numpy" />
314
+ <input>
315
+ <port id="0" precision="FP32">
316
+ <dim>-1</dim>
317
+ <dim>2048</dim>
318
+ </port>
319
+ <port id="1" precision="FP32">
320
+ <dim>2048</dim>
321
+ </port>
322
+ </input>
323
+ <output>
324
+ <port id="2" precision="FP32">
325
+ <dim>-1</dim>
326
+ <dim>2048</dim>
327
+ </port>
328
+ </output>
329
+ </layer>
330
+ <layer id="23" name="Result_42042" type="Result" version="opset1">
331
+ <input>
332
+ <port id="0" precision="FP32">
333
+ <dim>-1</dim>
334
+ <dim>2048</dim>
335
+ </port>
336
+ </input>
337
+ </layer>
338
+ </layers>
339
+ <edges>
340
+ <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
341
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="1" />
342
+ <edge from-layer="2" from-port="2" to-layer="4" to-port="0" />
343
+ <edge from-layer="3" from-port="0" to-layer="4" to-port="1" />
344
+ <edge from-layer="3" from-port="0" to-layer="6" to-port="1" />
345
+ <edge from-layer="4" from-port="2" to-layer="5" to-port="0" />
346
+ <edge from-layer="5" from-port="1" to-layer="6" to-port="0" />
347
+ <edge from-layer="6" from-port="2" to-layer="8" to-port="0" />
348
+ <edge from-layer="7" from-port="0" to-layer="8" to-port="1" />
349
+ <edge from-layer="8" from-port="2" to-layer="10" to-port="0" />
350
+ <edge from-layer="9" from-port="0" to-layer="10" to-port="1" />
351
+ <edge from-layer="10" from-port="2" to-layer="13" to-port="0" />
352
+ <edge from-layer="11" from-port="0" to-layer="12" to-port="0" />
353
+ <edge from-layer="12" from-port="1" to-layer="13" to-port="1" />
354
+ <edge from-layer="13" from-port="2" to-layer="16" to-port="0" />
355
+ <edge from-layer="14" from-port="0" to-layer="15" to-port="0" />
356
+ <edge from-layer="15" from-port="1" to-layer="16" to-port="1" />
357
+ <edge from-layer="16" from-port="2" to-layer="19" to-port="0" />
358
+ <edge from-layer="17" from-port="0" to-layer="18" to-port="0" />
359
+ <edge from-layer="18" from-port="1" to-layer="19" to-port="1" />
360
+ <edge from-layer="19" from-port="2" to-layer="22" to-port="0" />
361
+ <edge from-layer="20" from-port="0" to-layer="21" to-port="0" />
362
+ <edge from-layer="21" from-port="1" to-layer="22" to-port="1" />
363
+ <edge from-layer="22" from-port="2" to-layer="23" to-port="0" />
364
+ </edges>
365
+ <rt_info>
366
+ <Runtime_version value="2025.1.0-18503-6fec06580ab-releases/2025/1" />
367
+ <conversion_parameters>
368
+ <framework value="pytorch" />
369
+ <is_python_object value="True" />
370
+ </conversion_parameters>
371
+ </rt_info>
372
+ </net>
thinker/openvino_thinker_embedding_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9594871f4c0615216038a5c1237a975e39af3510fec22a98925d967c59ea305b
3
+ size 622329860
thinker/openvino_thinker_embedding_model.xml ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <net name="Model0" version="11">
3
+ <layers>
4
+ <layer id="0" name="input" type="Parameter" version="opset1">
5
+ <data shape="?,?" element_type="i64" />
6
+ <output>
7
+ <port id="0" precision="I64" names="input">
8
+ <dim>-1</dim>
9
+ <dim>-1</dim>
10
+ </port>
11
+ </output>
12
+ </layer>
13
+ <layer id="1" name="self.weight" type="Const" version="opset1">
14
+ <data element_type="f16" shape="151936, 2048" offset="0" size="622329856" />
15
+ <output>
16
+ <port id="0" precision="FP16" names="self.weight">
17
+ <dim>151936</dim>
18
+ <dim>2048</dim>
19
+ </port>
20
+ </output>
21
+ </layer>
22
+ <layer id="2" name="ov_ext::embedding/Convert" type="Convert" version="opset1">
23
+ <data destination_type="f32" />
24
+ <rt_info>
25
+ <attribute name="decompression" version="0" />
26
+ </rt_info>
27
+ <input>
28
+ <port id="0" precision="FP16">
29
+ <dim>151936</dim>
30
+ <dim>2048</dim>
31
+ </port>
32
+ </input>
33
+ <output>
34
+ <port id="1" precision="FP32">
35
+ <dim>151936</dim>
36
+ <dim>2048</dim>
37
+ </port>
38
+ </output>
39
+ </layer>
40
+ <layer id="3" name="ov_ext::embedding/Convert_1" type="Convert" version="opset1">
41
+ <data destination_type="i32" />
42
+ <input>
43
+ <port id="0" precision="I64">
44
+ <dim>-1</dim>
45
+ <dim>-1</dim>
46
+ </port>
47
+ </input>
48
+ <output>
49
+ <port id="1" precision="I32">
50
+ <dim>-1</dim>
51
+ <dim>-1</dim>
52
+ </port>
53
+ </output>
54
+ </layer>
55
+ <layer id="4" name="ov_ext::embedding/Constant" type="Const" version="opset1">
56
+ <data element_type="i32" shape="" offset="622329856" size="4" />
57
+ <output>
58
+ <port id="0" precision="I32" />
59
+ </output>
60
+ </layer>
61
+ <layer id="5" name="ov_ext::embedding/Gather" type="Gather" version="opset8">
62
+ <data batch_dims="0" />
63
+ <input>
64
+ <port id="0" precision="FP32">
65
+ <dim>151936</dim>
66
+ <dim>2048</dim>
67
+ </port>
68
+ <port id="1" precision="I32">
69
+ <dim>-1</dim>
70
+ <dim>-1</dim>
71
+ </port>
72
+ <port id="2" precision="I32" />
73
+ </input>
74
+ <output>
75
+ <port id="3" precision="FP32">
76
+ <dim>-1</dim>
77
+ <dim>-1</dim>
78
+ <dim>2048</dim>
79
+ </port>
80
+ </output>
81
+ </layer>
82
+ <layer id="6" name="Result_9" type="Result" version="opset1">
83
+ <input>
84
+ <port id="0" precision="FP32">
85
+ <dim>-1</dim>
86
+ <dim>-1</dim>
87
+ <dim>2048</dim>
88
+ </port>
89
+ </input>
90
+ </layer>
91
+ </layers>
92
+ <edges>
93
+ <edge from-layer="0" from-port="0" to-layer="3" to-port="0" />
94
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="0" />
95
+ <edge from-layer="2" from-port="1" to-layer="5" to-port="0" />
96
+ <edge from-layer="3" from-port="1" to-layer="5" to-port="1" />
97
+ <edge from-layer="4" from-port="0" to-layer="5" to-port="2" />
98
+ <edge from-layer="5" from-port="3" to-layer="6" to-port="0" />
99
+ </edges>
100
+ <rt_info>
101
+ <Runtime_version value="2025.1.0-18503-6fec06580ab-releases/2025/1" />
102
+ <conversion_parameters>
103
+ <framework value="pytorch" />
104
+ <is_python_object value="True" />
105
+ </conversion_parameters>
106
+ </rt_info>
107
+ </net>
thinker/openvino_thinker_language_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3b77cdec2255a3a48d8e942ad4883d9c81a35d360e8b1947e673a83db48029c
3
+ size 2022423292
thinker/openvino_thinker_language_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
thinker/openvino_thinker_merger_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edadc824e4b30b0c5c5858fc9d46b8a9914cca4c6f4c5e4a26545953df7307a1
3
+ size 1334358176
thinker/openvino_thinker_merger_model.xml ADDED
The diff for this file is too large to render. See raw diff
 
thinker/openvino_thinker_patcher_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c5c86d6e62974622824e8a25fbfd2330eb2e9e182a7a524314a14e8215a2f022
3
+ size 3010616
thinker/openvino_thinker_patcher_model.xml ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0"?>
2
+ <net name="Model9" version="11">
3
+ <layers>
4
+ <layer id="0" name="hidden_states" type="Parameter" version="opset1">
5
+ <data shape="?,?" element_type="f32" />
6
+ <output>
7
+ <port id="0" precision="FP32" names="hidden_states">
8
+ <dim>-1</dim>
9
+ <dim>-1</dim>
10
+ </port>
11
+ </output>
12
+ </layer>
13
+ <layer id="1" name="Constant_44084" type="Const" version="opset1">
14
+ <data element_type="i64" shape="5" offset="0" size="40" />
15
+ <rt_info>
16
+ <attribute name="precise" version="0" />
17
+ </rt_info>
18
+ <output>
19
+ <port id="0" precision="I64" names="8">
20
+ <dim>5</dim>
21
+ </port>
22
+ </output>
23
+ </layer>
24
+ <layer id="2" name="aten::view/Reshape" type="Reshape" version="opset1">
25
+ <data special_zero="false" />
26
+ <input>
27
+ <port id="0" precision="FP32">
28
+ <dim>-1</dim>
29
+ <dim>-1</dim>
30
+ </port>
31
+ <port id="1" precision="I64">
32
+ <dim>5</dim>
33
+ </port>
34
+ </input>
35
+ <output>
36
+ <port id="2" precision="FP32" names="14,9,hidden_states_1">
37
+ <dim>-1</dim>
38
+ <dim>3</dim>
39
+ <dim>2</dim>
40
+ <dim>14</dim>
41
+ <dim>14</dim>
42
+ </port>
43
+ </output>
44
+ </layer>
45
+ <layer id="3" name="self.proj.weight_compressed" type="Const" version="opset1">
46
+ <data element_type="f16" shape="1280, 3, 2, 14, 14" offset="40" size="3010560" />
47
+ <output>
48
+ <port id="0" precision="FP16" names="self.proj.weight">
49
+ <dim>1280</dim>
50
+ <dim>3</dim>
51
+ <dim>2</dim>
52
+ <dim>14</dim>
53
+ <dim>14</dim>
54
+ </port>
55
+ </output>
56
+ </layer>
57
+ <layer id="4" name="self.proj.weight" type="Convert" version="opset1">
58
+ <data destination_type="f32" />
59
+ <rt_info>
60
+ <attribute name="decompression" version="0" />
61
+ </rt_info>
62
+ <input>
63
+ <port id="0" precision="FP16">
64
+ <dim>1280</dim>
65
+ <dim>3</dim>
66
+ <dim>2</dim>
67
+ <dim>14</dim>
68
+ <dim>14</dim>
69
+ </port>
70
+ </input>
71
+ <output>
72
+ <port id="1" precision="FP32">
73
+ <dim>1280</dim>
74
+ <dim>3</dim>
75
+ <dim>2</dim>
76
+ <dim>14</dim>
77
+ <dim>14</dim>
78
+ </port>
79
+ </output>
80
+ </layer>
81
+ <layer id="5" name="__module.proj/aten::_convolution/Convolution" type="Convolution" version="opset1">
82
+ <data strides="2, 14, 14" dilations="1, 1, 1" pads_begin="0, 0, 0" pads_end="0, 0, 0" auto_pad="explicit" />
83
+ <input>
84
+ <port id="0" precision="FP32">
85
+ <dim>-1</dim>
86
+ <dim>3</dim>
87
+ <dim>2</dim>
88
+ <dim>14</dim>
89
+ <dim>14</dim>
90
+ </port>
91
+ <port id="1" precision="FP32">
92
+ <dim>1280</dim>
93
+ <dim>3</dim>
94
+ <dim>2</dim>
95
+ <dim>14</dim>
96
+ <dim>14</dim>
97
+ </port>
98
+ </input>
99
+ <output>
100
+ <port id="2" precision="FP32" names="32">
101
+ <dim>-1</dim>
102
+ <dim>1280</dim>
103
+ <dim>1</dim>
104
+ <dim>1</dim>
105
+ <dim>1</dim>
106
+ </port>
107
+ </output>
108
+ </layer>
109
+ <layer id="6" name="Constant_44131" type="Const" version="opset1">
110
+ <data element_type="i64" shape="2" offset="3010600" size="16" />
111
+ <rt_info>
112
+ <attribute name="precise" version="0" />
113
+ </rt_info>
114
+ <output>
115
+ <port id="0" precision="I64" names="18">
116
+ <dim>2</dim>
117
+ </port>
118
+ </output>
119
+ </layer>
120
+ <layer id="7" name="aten::view/Reshape_1" type="Reshape" version="opset1">
121
+ <data special_zero="false" />
122
+ <input>
123
+ <port id="0" precision="FP32">
124
+ <dim>-1</dim>
125
+ <dim>1280</dim>
126
+ <dim>1</dim>
127
+ <dim>1</dim>
128
+ <dim>1</dim>
129
+ </port>
130
+ <port id="1" precision="I64">
131
+ <dim>2</dim>
132
+ </port>
133
+ </input>
134
+ <output>
135
+ <port id="2" precision="FP32">
136
+ <dim>-1</dim>
137
+ <dim>1280</dim>
138
+ </port>
139
+ </output>
140
+ </layer>
141
+ <layer id="8" name="Result_44133" type="Result" version="opset1">
142
+ <input>
143
+ <port id="0" precision="FP32">
144
+ <dim>-1</dim>
145
+ <dim>1280</dim>
146
+ </port>
147
+ </input>
148
+ </layer>
149
+ </layers>
150
+ <edges>
151
+ <edge from-layer="0" from-port="0" to-layer="2" to-port="0" />
152
+ <edge from-layer="1" from-port="0" to-layer="2" to-port="1" />
153
+ <edge from-layer="2" from-port="2" to-layer="5" to-port="0" />
154
+ <edge from-layer="3" from-port="0" to-layer="4" to-port="0" />
155
+ <edge from-layer="4" from-port="1" to-layer="5" to-port="1" />
156
+ <edge from-layer="5" from-port="2" to-layer="7" to-port="0" />
157
+ <edge from-layer="6" from-port="0" to-layer="7" to-port="1" />
158
+ <edge from-layer="7" from-port="2" to-layer="8" to-port="0" />
159
+ </edges>
160
+ <rt_info>
161
+ <Runtime_version value="2025.1.0-18503-6fec06580ab-releases/2025/1" />
162
+ <conversion_parameters>
163
+ <framework value="pytorch" />
164
+ <is_python_object value="True" />
165
+ </conversion_parameters>
166
+ </rt_info>
167
+ </net>
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8441917e39ae0244e06d704b95b3124795cec478e297f9afac39ba670d7e9d99
3
+ size 11421870
tokenizer_config.json ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "added_tokens_decoder": {
4
+ "151643": {
5
+ "content": "<|endoftext|>",
6
+ "lstrip": false,
7
+ "normalized": false,
8
+ "rstrip": false,
9
+ "single_word": false,
10
+ "special": true
11
+ },
12
+ "151644": {
13
+ "content": "<|im_start|>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false,
18
+ "special": true
19
+ },
20
+ "151645": {
21
+ "content": "<|im_end|>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false,
26
+ "special": true
27
+ },
28
+ "151646": {
29
+ "content": "<|AUDIO|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false,
34
+ "special": true
35
+ },
36
+ "151647": {
37
+ "content": "<|audio_bos|>",
38
+ "lstrip": false,
39
+ "normalized": false,
40
+ "rstrip": false,
41
+ "single_word": false,
42
+ "special": true
43
+ },
44
+ "151648": {
45
+ "content": "<|audio_eos|>",
46
+ "lstrip": false,
47
+ "normalized": false,
48
+ "rstrip": false,
49
+ "single_word": false,
50
+ "special": true
51
+ },
52
+ "151649": {
53
+ "content": "<|box_end|>",
54
+ "lstrip": false,
55
+ "normalized": false,
56
+ "rstrip": false,
57
+ "single_word": false,
58
+ "special": true
59
+ },
60
+ "151650": {
61
+ "content": "<|quad_start|>",
62
+ "lstrip": false,
63
+ "normalized": false,
64
+ "rstrip": false,
65
+ "single_word": false,
66
+ "special": true
67
+ },
68
+ "151651": {
69
+ "content": "<|quad_end|>",
70
+ "lstrip": false,
71
+ "normalized": false,
72
+ "rstrip": false,
73
+ "single_word": false,
74
+ "special": true
75
+ },
76
+ "151652": {
77
+ "content": "<|vision_bos|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "151653": {
85
+ "content": "<|vision_eos|>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "151654": {
93
+ "content": "<|vision_pad|>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "151655": {
101
+ "content": "<|IMAGE|>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "151656": {
109
+ "content": "<|VIDEO|>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "151657": {
117
+ "content": "<tool_call>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": false
123
+ },
124
+ "151658": {
125
+ "content": "</tool_call>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": false
131
+ },
132
+ "151659": {
133
+ "content": "<|fim_prefix|>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": false
139
+ },
140
+ "151660": {
141
+ "content": "<|fim_middle|>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": false
147
+ },
148
+ "151661": {
149
+ "content": "<|fim_suffix|>",
150
+ "lstrip": false,
151
+ "normalized": false,
152
+ "rstrip": false,
153
+ "single_word": false,
154
+ "special": false
155
+ },
156
+ "151662": {
157
+ "content": "<|fim_pad|>",
158
+ "lstrip": false,
159
+ "normalized": false,
160
+ "rstrip": false,
161
+ "single_word": false,
162
+ "special": false
163
+ },
164
+ "151663": {
165
+ "content": "<|repo_name|>",
166
+ "lstrip": false,
167
+ "normalized": false,
168
+ "rstrip": false,
169
+ "single_word": false,
170
+ "special": false
171
+ },
172
+ "151664": {
173
+ "content": "<|file_sep|>",
174
+ "lstrip": false,
175
+ "normalized": false,
176
+ "rstrip": false,
177
+ "single_word": false,
178
+ "special": false
179
+ }
180
+ },
181
+ "additional_special_tokens": [
182
+ "<|im_start|>",
183
+ "<|im_end|>",
184
+ "<|AUDIO|>",
185
+ "<|audio_bos|>",
186
+ "<|audio_eos|>",
187
+ "<|box_end|>",
188
+ "<|quad_start|>",
189
+ "<|quad_end|>",
190
+ "<|vision_bos|>",
191
+ "<|vision_eos|>",
192
+ "<|vision_pad|>",
193
+ "<|IMAGE|>",
194
+ "<|VIDEO|>"
195
+ ],
196
+ "audio_bos_token": "<|audio_bos|>",
197
+ "audio_eos_token": "<|audio_eos|>",
198
+ "audio_token": "<|AUDIO|>",
199
+ "bos_token": null,
200
+ "clean_up_tokenization_spaces": false,
201
+ "eos_token": "<|im_end|>",
202
+ "errors": "replace",
203
+ "extra_special_tokens": {
204
+ "audio_bos_token": "<|audio_bos|>",
205
+ "audio_eos_token": "<|audio_eos|>",
206
+ "audio_token": "<|AUDIO|>",
207
+ "image_token": "<|IMAGE|>",
208
+ "video_token": "<|VIDEO|>",
209
+ "vision_bos_token": "<|vision_bos|>",
210
+ "vision_eos_token": "<|vision_eos|>"
211
+ },
212
+ "image_token": "<|IMAGE|>",
213
+ "model_max_length": 32768,
214
+ "pad_token": "<|endoftext|>",
215
+ "processor_class": "Qwen2_5OmniProcessor",
216
+ "split_special_tokens": false,
217
+ "tokenizer_class": "Qwen2Tokenizer",
218
+ "unk_token": null,
219
+ "video_token": "<|VIDEO|>",
220
+ "vision_bos_token": "<|vision_bos|>",
221
+ "vision_eos_token": "<|vision_eos|>"
222
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff