Parker committed
Add quantized VibeVoice 7B models (4-bit and 8-bit)

Files changed:
- .gitattributes +3 -33
- 4bit/README.md +35 -0
- 4bit/config.json +132 -0
- 4bit/generation_config.json +4 -0
- 4bit/load_quantized_4bit.py +60 -0
- 4bit/model-00001-of-00002.safetensors +3 -0
- 4bit/model-00002-of-00002.safetensors +3 -0
- 4bit/model.safetensors.index.json +0 -0
- 4bit/preprocessor_config.json +12 -0
- 4bit/quantization_config.json +20 -0
- 4bit/test_output.wav +3 -0
- 8bit/README.md +23 -0
- 8bit/config.json +132 -0
- 8bit/generation_config.json +4 -0
- 8bit/load_quantized_8bit.py +60 -0
- 8bit/model-00001-of-00003.safetensors +3 -0
- 8bit/model-00002-of-00003.safetensors +3 -0
- 8bit/model-00003-of-00003.safetensors +3 -0
- 8bit/model.safetensors.index.json +0 -0
- 8bit/preprocessor_config.json +12 -0
- 8bit/quantization_config.json +20 -0
- README.md +26 -80
.gitattributes
CHANGED
@@ -1,35 +1,5 @@
-*.
-*.arrow filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.
-*.
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.wav filter=lfs diff=lfs merge=lfs -text
4bit/README.md
ADDED
# VibeVoice 7B - 4-bit Quantized

Optimized for RTX 3060/4060 and similar 12GB VRAM GPUs.

## Specifications
- Quantization: 4-bit (nf4)
- Model size: 6.2 GB
- VRAM usage: ~8 GB
- Quality: Very good (minimal degradation)

## Usage

```python
import torch

from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    "Dannidee/VibeVoice7b-low-vram/4bit",
    device_map='cuda',
    torch_dtype=torch.bfloat16,
)
processor = VibeVoiceProcessor.from_pretrained("Dannidee/VibeVoice7b-low-vram/4bit")

# Generate speech
text = "Speaker 1: Hello! Speaker 2: Hi there!"
inputs = processor(
    text=[text],
    voice_samples=[["voice1.wav", "voice2.wav"]],
    padding=True,
    return_tensors="pt",
)

outputs = model.generate(**inputs)
processor.save_audio(outputs.speech_outputs[0], "output.wav")
```
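The snippet above loads the 4bit/ subfolder by a path-style repo id. If a particular transformers version does not accept that form, one possible workaround (not part of this commit; it assumes the huggingface_hub package is installed) is to fetch just the 4bit/ folder and load it from disk:

```python
import torch
from huggingface_hub import snapshot_download
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

# Download only the 4bit/ subfolder of the repository into the local HF cache.
local_root = snapshot_download(
    repo_id="Dannidee/VibeVoice7b-low-vram",
    allow_patterns=["4bit/*"],
)
model_path = f"{local_root}/4bit"

model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    model_path,
    device_map="cuda",
    torch_dtype=torch.bfloat16,
)
processor = VibeVoiceProcessor.from_pretrained(model_path)
```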
4bit/config.json
ADDED
{
  "acoustic_vae_dim": 64,
  "acoustic_tokenizer_config": {
    "causal": true,
    "channels": 1,
    "conv_bias": true,
    "conv_norm": "none",
    "corpus_normalize": 0.0,
    "decoder_depths": null,
    "decoder_n_filters": 32,
    "decoder_ratios": [8, 5, 5, 4, 2, 2],
    "disable_last_norm": true,
    "encoder_depths": "3-3-3-3-3-3-8",
    "encoder_n_filters": 32,
    "encoder_ratios": [8, 5, 5, 4, 2, 2],
    "fix_std": 0.5,
    "layer_scale_init_value": 1e-06,
    "layernorm": "RMSNorm",
    "layernorm_elementwise_affine": true,
    "layernorm_eps": 1e-05,
    "mixer_layer": "depthwise_conv",
    "model_type": "vibevoice_acoustic_tokenizer",
    "pad_mode": "constant",
    "std_dist_type": "gaussian",
    "vae_dim": 64,
    "weight_init_value": 0.01
  },
  "architectures": [
    "VibeVoiceForConditionalGeneration"
  ],
  "decoder_config": {
    "attention_dropout": 0.0,
    "hidden_act": "silu",
    "hidden_size": 3584,
    "initializer_range": 0.02,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "max_window_layers": 28,
    "model_type": "qwen2",
    "num_attention_heads": 28,
    "num_hidden_layers": 28,
    "num_key_value_heads": 4,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "rope_theta": 1000000.0,
    "sliding_window": null,
    "torch_dtype": "bfloat16",
    "use_cache": true,
    "use_mrope": false,
    "use_sliding_window": false,
    "vocab_size": 152064
  },
  "diffusion_head_config": {
    "ddpm_batch_mul": 4,
    "ddpm_beta_schedule": "cosine",
    "ddpm_num_inference_steps": 20,
    "ddpm_num_steps": 1000,
    "diffusion_type": "ddpm",
    "head_ffn_ratio": 3.0,
    "head_layers": 4,
    "hidden_size": 3584,
    "latent_size": 64,
    "model_type": "vibevoice_diffusion_head",
    "prediction_type": "v_prediction",
    "rms_norm_eps": 1e-05,
    "speech_vae_dim": 64
  },
  "model_type": "vibevoice",
  "semantic_tokenizer_config": {
    "causal": true,
    "channels": 1,
    "conv_bias": true,
    "conv_norm": "none",
    "corpus_normalize": 0.0,
    "disable_last_norm": true,
    "encoder_depths": "3-3-3-3-3-3-8",
    "encoder_n_filters": 32,
    "encoder_ratios": [8, 5, 5, 4, 2, 2],
    "fix_std": 0,
    "layer_scale_init_value": 1e-06,
    "layernorm": "RMSNorm",
    "layernorm_elementwise_affine": true,
    "layernorm_eps": 1e-05,
    "mixer_layer": "depthwise_conv",
    "model_type": "vibevoice_semantic_tokenizer",
    "pad_mode": "constant",
    "std_dist_type": "none",
    "vae_dim": 128,
    "weight_init_value": 0.01
  },
  "semantic_vae_dim": 128,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "quantization_config": {
    "quant_method": "bitsandbytes",
    "_load_in_8bit": false,
    "_load_in_4bit": true,
    "llm_int8_threshold": 6.0,
    "llm_int8_skip_modules": null,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "load_in_4bit": true,
    "load_in_8bit": false
  },
  "_quantization_method": "bitsandbytes"
}
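The bitsandbytes settings live under the quantization_config key of this config.json, so they can be inspected before loading any weights. A minimal sketch using only the standard library, assuming the file has been downloaded locally as 4bit/config.json:

```python
import json

with open("4bit/config.json") as f:  # local copy of the file above
    cfg = json.load(f)

q = cfg["quantization_config"]
print(q["quant_method"])               # bitsandbytes
print(q["bnb_4bit_quant_type"])        # nf4
print(q["bnb_4bit_use_double_quant"])  # True
print(q["bnb_4bit_compute_dtype"])     # bfloat16
```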
4bit/generation_config.json
ADDED
{
  "_from_model_config": true,
  "transformers_version": "4.51.3"
}
4bit/load_quantized_4bit.py
ADDED
#!/usr/bin/env python
"""
Load and use the 4-bit quantized VibeVoice model
"""

import torch
from transformers import BitsAndBytesConfig
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

def load_quantized_model(model_path="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit"):
    """Load the pre-quantized VibeVoice model"""

    print("Loading 4-bit quantized VibeVoice model...")

    # The model is already quantized, but we need to specify the config
    # to ensure proper loading of quantized weights
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4'
    )

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Load model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )

    model.eval()

    print("✅ Model loaded successfully!")
    print(f"💾 Memory usage: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

    return model, processor

# Example usage
if __name__ == "__main__":
    model, processor = load_quantized_model()

    # Generate audio
    text = "Speaker 1: Hello! Speaker 2: Hi there!"
    inputs = processor(
        text=[text],
        voice_samples=[["path/to/voice1.wav", "path/to/voice2.wav"]],
        padding=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model.generate(**inputs)

    # Save audio
    processor.save_audio(outputs.speech_outputs[0], "output.wav")
4bit/model-00001-of-00002.safetensors
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:bb5b251fc215bcd5ee9d74bc4383aece74ab8a4e38d402017bcf349c20cc02a6
size 4949701350
4bit/model-00002-of-00002.safetensors
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:40b3e0fe570de93c87842f6bca65a4c4012e41920a6a5342b967206db781b603
size 1676476837
4bit/model.safetensors.index.json
ADDED
The diff for this file is too large to render.
4bit/preprocessor_config.json
ADDED
{
  "processor_class": "VibeVoiceProcessor",
  "speech_tok_compress_ratio": 3200,
  "db_normalize": true,
  "audio_processor": {
    "feature_extractor_type": "VibeVoiceTokenizerProcessor",
    "sampling_rate": 24000,
    "normalize_audio": true,
    "target_dB_FS": -25,
    "eps": 1e-06
  }
}
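The audio_processor block above declares 24 kHz input normalized toward -25 dBFS. Whether VibeVoiceProcessor resamples arbitrary-rate voice clips on its own is not stated in this commit, so resampling reference clips up front is a safe precaution; a sketch assuming librosa and soundfile are installed and my_voice.wav is a placeholder file:

```python
import librosa
import soundfile as sf

# Resample a reference clip to the 24 kHz rate declared in preprocessor_config.json.
audio, sr = librosa.load("my_voice.wav", sr=24000, mono=True)
sf.write("voice1_24k.wav", audio, 24000)
# Pass "voice1_24k.wav" inside voice_samples=[["voice1_24k.wav", ...]] when calling the processor.
```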
4bit/quantization_config.json
ADDED
{
  "quantization_config": {
    "quant_method": "bitsandbytes",
    "_load_in_8bit": false,
    "_load_in_4bit": true,
    "llm_int8_threshold": 6.0,
    "llm_int8_skip_modules": null,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "load_in_4bit": true,
    "load_in_8bit": false
  },
  "quantization_method": "bitsandbytes",
  "bits": 4,
  "quant_type": "nf4"
}
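These stored fields map directly onto a BitsAndBytesConfig, which is how the bundled load_quantized_4bit.py configures loading. A sketch that rebuilds the config object from this file instead of hard-coding the values (assumes a local copy at 4bit/quantization_config.json):

```python
import json
import torch
from transformers import BitsAndBytesConfig

with open("4bit/quantization_config.json") as f:
    stored = json.load(f)["quantization_config"]

bnb_config = BitsAndBytesConfig(
    load_in_4bit=stored["load_in_4bit"],
    bnb_4bit_quant_type=stored["bnb_4bit_quant_type"],             # "nf4"
    bnb_4bit_use_double_quant=stored["bnb_4bit_use_double_quant"],
    bnb_4bit_compute_dtype=torch.bfloat16,                         # stored as the string "bfloat16"
)
# bnb_config can then be passed as quantization_config= to from_pretrained().
```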
4bit/test_output.wav
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:950196552c3c539a6169f5db79fbaf61106945adcdc102e9739c5ec7dc02d83c
size 236844
8bit/README.md
ADDED
# VibeVoice 7B - 8-bit Quantized

Better quality with moderate VRAM requirements.

## Specifications
- Quantization: 8-bit (int8)
- Model size: 9.9 GB
- VRAM usage: ~12 GB
- Quality: Excellent (minimal degradation)

## Usage

```python
import torch

from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    "Dannidee/VibeVoice7b-low-vram/8bit",
    device_map='cuda',
    torch_dtype=torch.bfloat16,
)
processor = VibeVoiceProcessor.from_pretrained("Dannidee/VibeVoice7b-low-vram/8bit")
```
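The 8-bit README stops after loading; generation works the same way as in the 4-bit README earlier in this commit. A short continuation, with voice1.wav and voice2.wav as placeholder reference clips:

```python
# Continuing from the model and processor loaded above.
text = "Speaker 1: Hello! Speaker 2: Hi there!"
inputs = processor(
    text=[text],
    voice_samples=[["voice1.wav", "voice2.wav"]],  # placeholder reference clips
    padding=True,
    return_tensors="pt",
)

outputs = model.generate(**inputs)
processor.save_audio(outputs.speech_outputs[0], "output.wav")
```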
8bit/config.json
ADDED
{
  "acoustic_vae_dim": 64,
  "acoustic_tokenizer_config": {
    "causal": true,
    "channels": 1,
    "conv_bias": true,
    "conv_norm": "none",
    "corpus_normalize": 0.0,
    "decoder_depths": null,
    "decoder_n_filters": 32,
    "decoder_ratios": [8, 5, 5, 4, 2, 2],
    "disable_last_norm": true,
    "encoder_depths": "3-3-3-3-3-3-8",
    "encoder_n_filters": 32,
    "encoder_ratios": [8, 5, 5, 4, 2, 2],
    "fix_std": 0.5,
    "layer_scale_init_value": 1e-06,
    "layernorm": "RMSNorm",
    "layernorm_elementwise_affine": true,
    "layernorm_eps": 1e-05,
    "mixer_layer": "depthwise_conv",
    "model_type": "vibevoice_acoustic_tokenizer",
    "pad_mode": "constant",
    "std_dist_type": "gaussian",
    "vae_dim": 64,
    "weight_init_value": 0.01
  },
  "architectures": [
    "VibeVoiceForConditionalGeneration"
  ],
  "decoder_config": {
    "attention_dropout": 0.0,
    "hidden_act": "silu",
    "hidden_size": 3584,
    "initializer_range": 0.02,
    "intermediate_size": 18944,
    "max_position_embeddings": 32768,
    "max_window_layers": 28,
    "model_type": "qwen2",
    "num_attention_heads": 28,
    "num_hidden_layers": 28,
    "num_key_value_heads": 4,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "rope_theta": 1000000.0,
    "sliding_window": null,
    "torch_dtype": "bfloat16",
    "use_cache": true,
    "use_mrope": false,
    "use_sliding_window": false,
    "vocab_size": 152064
  },
  "diffusion_head_config": {
    "ddpm_batch_mul": 4,
    "ddpm_beta_schedule": "cosine",
    "ddpm_num_inference_steps": 20,
    "ddpm_num_steps": 1000,
    "diffusion_type": "ddpm",
    "head_ffn_ratio": 3.0,
    "head_layers": 4,
    "hidden_size": 3584,
    "latent_size": 64,
    "model_type": "vibevoice_diffusion_head",
    "prediction_type": "v_prediction",
    "rms_norm_eps": 1e-05,
    "speech_vae_dim": 64
  },
  "model_type": "vibevoice",
  "semantic_tokenizer_config": {
    "causal": true,
    "channels": 1,
    "conv_bias": true,
    "conv_norm": "none",
    "corpus_normalize": 0.0,
    "disable_last_norm": true,
    "encoder_depths": "3-3-3-3-3-3-8",
    "encoder_n_filters": 32,
    "encoder_ratios": [8, 5, 5, 4, 2, 2],
    "fix_std": 0,
    "layer_scale_init_value": 1e-06,
    "layernorm": "RMSNorm",
    "layernorm_elementwise_affine": true,
    "layernorm_eps": 1e-05,
    "mixer_layer": "depthwise_conv",
    "model_type": "vibevoice_semantic_tokenizer",
    "pad_mode": "constant",
    "std_dist_type": "none",
    "vae_dim": 128,
    "weight_init_value": 0.01
  },
  "semantic_vae_dim": 128,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.51.3",
  "quantization_config": {
    "quant_method": "bitsandbytes",
    "_load_in_8bit": true,
    "_load_in_4bit": false,
    "llm_int8_threshold": 6.0,
    "llm_int8_skip_modules": null,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "load_in_4bit": false,
    "load_in_8bit": true
  },
  "_quantization_method": "bitsandbytes"
}
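Apart from the embedded quantization_config block, this 8-bit config matches the 4-bit one. A quick sketch that prints only the fields that differ between the two variants (assumes both config files are downloaded locally):

```python
import json

with open("4bit/config.json") as f:
    q4 = json.load(f)["quantization_config"]
with open("8bit/config.json") as f:
    q8 = json.load(f)["quantization_config"]

# Show only keys whose values differ between the 4-bit and 8-bit variants.
for key in sorted(set(q4) | set(q8)):
    if q4.get(key) != q8.get(key):
        print(f"{key}: 4bit={q4.get(key)!r}  8bit={q8.get(key)!r}")
```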
8bit/generation_config.json
ADDED
{
  "_from_model_config": true,
  "transformers_version": "4.51.3"
}
8bit/load_quantized_8bit.py
ADDED
#!/usr/bin/env python
"""
Load and use the 8-bit quantized VibeVoice model
"""

import torch
from transformers import BitsAndBytesConfig
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

def load_quantized_model(model_path="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-8bit"):
    """Load the pre-quantized VibeVoice model"""

    print("Loading 8-bit quantized VibeVoice model...")

    # The model is already quantized, but we need to specify the config
    # to ensure proper loading of quantized weights
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        bnb_8bit_compute_dtype=torch.bfloat16,
    )

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Load model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )

    model.eval()

    print("✅ Model loaded successfully!")
    print(f"💾 Memory usage: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

    return model, processor

# Example usage
if __name__ == "__main__":
    model, processor = load_quantized_model()

    # Generate audio
    text = "Speaker 1: Hello! Speaker 2: Hi there!"
    inputs = processor(
        text=[text],
        voice_samples=[["path/to/voice1.wav", "path/to/voice2.wav"]],
        padding=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = model.generate(**inputs)

    # Save audio
    processor.save_audio(outputs.speech_outputs[0], "output.wav")
8bit/model-00001-of-00003.safetensors
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:68f98075dac463766219e6e61ff5fe9ab969f8fea621a65906f1d6793f2eaf72
size 4987685394
8bit/model-00002-of-00003.safetensors
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:48940fb59366de226af5df46020f022d4d651f4563f190142c175b5bf733e9c7
size 4489976774
8bit/model-00003-of-00003.safetensors
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:d83c0514c0c9d2675cb4d51ee56b12515ea45770ce35acc5ab0ec4bc7d1bef73
size 1089994880
8bit/model.safetensors.index.json
ADDED
The diff for this file is too large to render.
8bit/preprocessor_config.json
ADDED
{
  "processor_class": "VibeVoiceProcessor",
  "speech_tok_compress_ratio": 3200,
  "db_normalize": true,
  "audio_processor": {
    "feature_extractor_type": "VibeVoiceTokenizerProcessor",
    "sampling_rate": 24000,
    "normalize_audio": true,
    "target_dB_FS": -25,
    "eps": 1e-06
  }
}
8bit/quantization_config.json
ADDED
{
  "quantization_config": {
    "quant_method": "bitsandbytes",
    "_load_in_8bit": true,
    "_load_in_4bit": false,
    "llm_int8_threshold": 6.0,
    "llm_int8_skip_modules": null,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "bnb_4bit_quant_type": "fp4",
    "bnb_4bit_use_double_quant": false,
    "bnb_4bit_compute_dtype": "float32",
    "bnb_4bit_quant_storage": "uint8",
    "load_in_4bit": false,
    "load_in_8bit": true
  },
  "quantization_method": "bitsandbytes",
  "bits": 8,
  "quant_type": "nf4"
}
README.md
CHANGED
@@ -1,97 +1,43 @@
-
+---
+license: other
+language:
+- en
+tags:
+- text-to-speech
+- speech-synthesis
+- quantized
+- low-vram
+- vibevoice
+---
+
+# VibeVoice 7B - Low VRAM Quantized Models

-
+Pre-quantized versions of VibeVoice 7B for low VRAM GPUs.

-
-|---------------|------|--------------|---------|
-| Original (fp16/bf16) | 18GB | ~18GB VRAM | Best |
-| 8-bit Quantized | 9.9GB | ~10.6GB VRAM | Excellent |
-| 4-bit Quantized (nf4) | 6.2GB | ~6.6GB VRAM | Very Good |
+## Available Versions

-
+- **4bit/** - 4-bit quantized model (~8GB VRAM needed)
+- **8bit/** - 8-bit quantized model (~12GB VRAM needed)

-
+## Usage

 ```python
 from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
 from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

-#
-model_path = "/path/to/VibeVoice-Large-4bit"
-processor = VibeVoiceProcessor.from_pretrained(model_path)
+# For 4-bit model
 model = VibeVoiceForConditionalGenerationInference.from_pretrained(
-
+    "Dannidee/VibeVoice7b-low-vram/4bit",
     device_map='cuda',
     torch_dtype=torch.bfloat16,
 )
+processor = VibeVoiceProcessor.from_pretrained("Dannidee/VibeVoice7b-low-vram/4bit")
 ```

-
-
-```python
-# Same code, just point to 8-bit model
-model_path = "/path/to/VibeVoice-Large-8bit"
-# ... rest is the same
-```
-
-## Creating Your Own Quantized Models
-
-Use the provided script to quantize models:
-
-```bash
-# 4-bit quantization (nf4)
-python quantize_and_save_vibevoice.py \
-    --model_path /path/to/original/model \
-    --output_dir /path/to/output/4bit \
-    --bits 4 \
-    --test
-
-# 8-bit quantization
-python quantize_and_save_vibevoice.py \
-    --model_path /path/to/original/model \
-    --output_dir /path/to/output/8bit \
-    --bits 8 \
-    --test
-```
-
-## Benefits
+## VRAM Requirements

-
-
-
-4. **Quality preserved** - nf4 quantization maintains excellent output quality
+- **4-bit**: ~8 GB total VRAM
+- **8-bit**: ~12 GB total VRAM
+- **Original**: ~19 GB total VRAM

-
-
-To share quantized models:
-
-1. Upload the entire quantized model directory (e.g., `VibeVoice-Large-4bit/`)
-2. Include the `quantization_config.json` file (automatically created)
-3. Users can load directly without any quantization setup
-
-## Performance Notes
-
-- 4-bit (nf4): Best for memory-constrained systems, minimal quality loss
-- 8-bit: Better quality than 4-bit, still significant memory savings
-- Both versions maintain the same generation speed as the original
-- Flash Attention 2 is supported in all quantized versions
-
-## Troubleshooting
-
-If loading fails:
-1. Ensure you have `bitsandbytes` installed: `pip install bitsandbytes`
-2. Make sure you're on a CUDA-capable GPU
-3. Check that all model files are present in the directory
-
-## Files Created
-
-Each quantized model directory contains:
-- `model.safetensors.*` - Quantized model weights
-- `config.json` - Model configuration with quantization settings
-- `quantization_config.json` - Specific quantization parameters
-- `processor/` - Audio processor files
-- `load_quantized_Xbit.py` - Example loading script
-
----
-license: mit
----
+See individual model folders for detailed information.
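The VRAM figures in the updated README (roughly 8 GB for 4-bit, 12 GB for 8-bit, 19 GB for the original weights) can be checked against the local GPU before choosing a folder; a rough sketch assuming a single CUDA device:

```python
import torch

# Total memory of GPU 0, in GiB.
total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3

if total_gib >= 19:
    choice = "original (bf16) weights"
elif total_gib >= 12:
    choice = "8bit/"
else:
    choice = "4bit/"
print(f"{total_gib:.1f} GiB detected -> try the {choice} model")
```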