DevParker committed on
Commit cf06904 · verified · 1 Parent(s): b6d2f8e

Delete 8bit

8bit/QUANTIZATION_README.md DELETED
@@ -1,95 +0,0 @@
1
- # VibeVoice Quantization Guide
2
-
3
- Successfully quantized the VibeVoice 7B model to both 4-bit and 8-bit versions using bitsandbytes!
4
-
5
- ## Model Sizes
6
-
7
- | Model Version | Size | Memory Usage | Quality |
8
- |---------------|------|--------------|---------|
9
- | Original (fp16/bf16) | 18GB | ~18GB VRAM | Best |
10
- | 8-bit Quantized | 9.9GB | ~10.6GB VRAM | Excellent |
11
- | 4-bit Quantized (nf4) | 6.2GB | ~6.6GB VRAM | Very Good |
12
-
13
- ## How to Use Pre-Quantized Models
14
-
15
- ### 1. Loading 4-bit Model
16
-
17
- ```python
18
- import torch
- from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
19
- from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
20
-
21
- # Load pre-quantized 4-bit model
22
- model_path = "/path/to/VibeVoice-Large-4bit"
23
- processor = VibeVoiceProcessor.from_pretrained(model_path)
24
- model = VibeVoiceForConditionalGenerationInference.from_pretrained(
25
- model_path,
26
- device_map='cuda',
27
- torch_dtype=torch.bfloat16,
28
- )
29
- ```
30
-
31
- ### 2. Loading 8-bit Model
32
-
33
- ```python
34
- # Same code, just point to 8-bit model
35
- model_path = "/path/to/VibeVoice-Large-8bit"
36
- # ... rest is the same
37
- ```
38
-
39
- ## Creating Your Own Quantized Models
40
-
41
- Use the provided script to quantize models:
42
-
43
- ```bash
44
- # 4-bit quantization (nf4)
45
- python quantize_and_save_vibevoice.py \
46
- --model_path /path/to/original/model \
47
- --output_dir /path/to/output/4bit \
48
- --bits 4 \
49
- --test
50
-
51
- # 8-bit quantization
52
- python quantize_and_save_vibevoice.py \
53
- --model_path /path/to/original/model \
54
- --output_dir /path/to/output/8bit \
55
- --bits 8 \
56
- --test
57
- ```
58
-
59
- ## Benefits
60
-
61
- 1. **Pre-quantized models load faster** - No on-the-fly quantization needed
62
- 2. **Lower VRAM requirements** - 4-bit uses only ~6.6GB vs 18GB
63
- 3. **Shareable** - Upload the quantized folder to share with others
64
- 4. **Quality preserved** - nf4 quantization maintains excellent output quality
65
-
66
- ## Distribution
67
-
68
- To share quantized models (an upload sketch follows this list):
69
-
70
- 1. Upload the entire quantized model directory (e.g., `VibeVoice-Large-4bit/`)
71
- 2. Include the `quantization_config.json` file (automatically created)
72
- 3. Users can load directly without any quantization setup
73
-
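A minimal upload sketch, assuming the `huggingface_hub` client is installed; the repository id and local folder path below are placeholders, not the actual published repo:

```python
# Sketch only: upload a quantized model directory to the Hugging Face Hub.
# The repo id and folder path are placeholders.
from huggingface_hub import HfApi

api = HfApi()
api.create_repo("your-username/VibeVoice-Large-4bit", exist_ok=True)
api.upload_folder(
    folder_path="/path/to/VibeVoice-Large-4bit",
    repo_id="your-username/VibeVoice-Large-4bit",
)
```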
74
- ## Performance Notes
75
-
76
- - 4-bit (nf4): Best for memory-constrained systems, minimal quality loss
77
- - 8-bit: Better quality than 4-bit, still significant memory savings
78
- - Both versions maintain the same generation speed as the original
79
- - Flash Attention 2 is supported in all quantized versions (see the loading sketch below)
80
-
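A loading sketch with Flash Attention 2 enabled; this assumes `flash-attn` is installed and that the model class accepts the standard transformers `attn_implementation` argument:

```python
# Sketch: load the 4-bit model with Flash Attention 2 enabled
# (assumes flash-attn is installed and the standard transformers argument applies here).
import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference

model = VibeVoiceForConditionalGenerationInference.from_pretrained(
    "/path/to/VibeVoice-Large-4bit",
    device_map="cuda",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
```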
81
- ## Troubleshooting
82
-
83
- If loading fails, work through these checks (a quick preflight sketch follows this list):
84
- 1. Ensure you have `bitsandbytes` installed: `pip install bitsandbytes`
85
- 2. Make sure you're on a CUDA-capable GPU
86
- 3. Check that all model files are present in the directory
87
-
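A quick preflight sketch covering the three checks above; the model directory path is a placeholder:

```python
# Sketch: verify bitsandbytes, CUDA, and model files before loading (path is a placeholder).
import importlib.util
from pathlib import Path

import torch

model_dir = Path("/path/to/VibeVoice-Large-4bit")
assert importlib.util.find_spec("bitsandbytes") is not None, "pip install bitsandbytes"
assert torch.cuda.is_available(), "A CUDA-capable GPU is required"
assert (model_dir / "config.json").exists() and list(model_dir.glob("*.safetensors*")), \
    "Model files are missing from the directory"
print("Preflight checks passed")
```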
88
- ## Files Created
89
-
90
- Each quantized model directory contains:
91
- - `model.safetensors.*` - Quantized model weights
92
- - `config.json` - Model configuration with quantization settings
93
- - `quantization_config.json` - Specific quantization parameters
94
- - `processor/` - Audio processor files
95
- - `load_quantized_Xbit.py` - Example loading script
 
8bit/README.md DELETED
@@ -1,23 +0,0 @@
1
- # VibeVoice 7B - 8-bit Quantized
2
-
3
- Better quality than the 4-bit version, with moderate VRAM requirements.
4
-
5
- ## Specifications
6
- - Quantization: 8-bit (int8)
7
- - Model size: 9.9 GB
8
- - VRAM usage: ~12 GB
9
- - Quality: Excellent (minimal degradation)
10
-
11
- ## Usage
12
-
13
- ```python
14
- import torch
- from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
15
- from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
16
-
17
- model = VibeVoiceForConditionalGenerationInference.from_pretrained(
18
- "Dannidee/VibeVoice7b-low-vram/8bit",
19
- device_map='cuda',
20
- torch_dtype=torch.bfloat16,
21
- )
22
- processor = VibeVoiceProcessor.from_pretrained("Dannidee/VibeVoice7b-low-vram/8bit")
23
- ```
 
8bit/config.json DELETED
@@ -1,132 +0,0 @@
1
- {
2
- "acoustic_vae_dim": 64,
3
- "acoustic_tokenizer_config": {
4
- "causal": true,
5
- "channels": 1,
6
- "conv_bias": true,
7
- "conv_norm": "none",
8
- "corpus_normalize": 0.0,
9
- "decoder_depths": null,
10
- "decoder_n_filters": 32,
11
- "decoder_ratios": [
12
- 8,
13
- 5,
14
- 5,
15
- 4,
16
- 2,
17
- 2
18
- ],
19
- "disable_last_norm": true,
20
- "encoder_depths": "3-3-3-3-3-3-8",
21
- "encoder_n_filters": 32,
22
- "encoder_ratios": [
23
- 8,
24
- 5,
25
- 5,
26
- 4,
27
- 2,
28
- 2
29
- ],
30
- "fix_std": 0.5,
31
- "layer_scale_init_value": 1e-06,
32
- "layernorm": "RMSNorm",
33
- "layernorm_elementwise_affine": true,
34
- "layernorm_eps": 1e-05,
35
- "mixer_layer": "depthwise_conv",
36
- "model_type": "vibevoice_acoustic_tokenizer",
37
- "pad_mode": "constant",
38
- "std_dist_type": "gaussian",
39
- "vae_dim": 64,
40
- "weight_init_value": 0.01
41
- },
42
- "architectures": [
43
- "VibeVoiceForConditionalGeneration"
44
- ],
45
- "decoder_config": {
46
- "attention_dropout": 0.0,
47
- "hidden_act": "silu",
48
- "hidden_size": 3584,
49
- "initializer_range": 0.02,
50
- "intermediate_size": 18944,
51
- "max_position_embeddings": 32768,
52
- "max_window_layers": 28,
53
- "model_type": "qwen2",
54
- "num_attention_heads": 28,
55
- "num_hidden_layers": 28,
56
- "num_key_value_heads": 4,
57
- "rms_norm_eps": 1e-06,
58
- "rope_scaling": null,
59
- "rope_theta": 1000000.0,
60
- "sliding_window": null,
61
- "torch_dtype": "bfloat16",
62
- "use_cache": true,
63
- "use_mrope": false,
64
- "use_sliding_window": false,
65
- "vocab_size": 152064
66
- },
67
- "diffusion_head_config": {
68
- "ddpm_batch_mul": 4,
69
- "ddpm_beta_schedule": "cosine",
70
- "ddpm_num_inference_steps": 20,
71
- "ddpm_num_steps": 1000,
72
- "diffusion_type": "ddpm",
73
- "head_ffn_ratio": 3.0,
74
- "head_layers": 4,
75
- "hidden_size": 3584,
76
- "latent_size": 64,
77
- "model_type": "vibevoice_diffusion_head",
78
- "prediction_type": "v_prediction",
79
- "rms_norm_eps": 1e-05,
80
- "speech_vae_dim": 64
81
- },
82
- "model_type": "vibevoice",
83
- "semantic_tokenizer_config": {
84
- "causal": true,
85
- "channels": 1,
86
- "conv_bias": true,
87
- "conv_norm": "none",
88
- "corpus_normalize": 0.0,
89
- "disable_last_norm": true,
90
- "encoder_depths": "3-3-3-3-3-3-8",
91
- "encoder_n_filters": 32,
92
- "encoder_ratios": [
93
- 8,
94
- 5,
95
- 5,
96
- 4,
97
- 2,
98
- 2
99
- ],
100
- "fix_std": 0,
101
- "layer_scale_init_value": 1e-06,
102
- "layernorm": "RMSNorm",
103
- "layernorm_elementwise_affine": true,
104
- "layernorm_eps": 1e-05,
105
- "mixer_layer": "depthwise_conv",
106
- "model_type": "vibevoice_semantic_tokenizer",
107
- "pad_mode": "constant",
108
- "std_dist_type": "none",
109
- "vae_dim": 128,
110
- "weight_init_value": 0.01
111
- },
112
- "semantic_vae_dim": 128,
113
- "tie_word_embeddings": false,
114
- "torch_dtype": "bfloat16",
115
- "transformers_version": "4.51.3",
116
- "quantization_config": {
117
- "quant_method": "bitsandbytes",
118
- "_load_in_8bit": true,
119
- "_load_in_4bit": false,
120
- "llm_int8_threshold": 6.0,
121
- "llm_int8_skip_modules": null,
122
- "llm_int8_enable_fp32_cpu_offload": false,
123
- "llm_int8_has_fp16_weight": false,
124
- "bnb_4bit_quant_type": "fp4",
125
- "bnb_4bit_use_double_quant": false,
126
- "bnb_4bit_compute_dtype": "float32",
127
- "bnb_4bit_quant_storage": "uint8",
128
- "load_in_4bit": false,
129
- "load_in_8bit": true
130
- },
131
- "_quantization_method": "bitsandbytes"
132
- }
 
8bit/generation_config.json DELETED
@@ -1,4 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "transformers_version": "4.51.3"
4
- }
 
8bit/load_quantized_8bit.py DELETED
@@ -1,60 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Load and use the 8-bit quantized VibeVoice model
4
- """
5
-
6
- import torch
7
- from transformers import BitsAndBytesConfig
8
- from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
9
- from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
10
-
11
- def load_quantized_model(model_path="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-8bit"):
12
- """Load the pre-quantized VibeVoice model"""
13
-
14
- print("Loading 8-bit quantized VibeVoice model...")
15
-
16
- # The model is already quantized, but we need to specify the config
17
- # to ensure proper loading of quantized weights
18
- bnb_config = BitsAndBytesConfig(
19
- load_in_8bit=True,
20
- bnb_8bit_compute_dtype=torch.bfloat16,
- )
24
-
25
- # Load processor
26
- processor = VibeVoiceProcessor.from_pretrained(model_path)
27
-
28
- # Load model
29
- model = VibeVoiceForConditionalGenerationInference.from_pretrained(
30
- model_path,
31
- quantization_config=bnb_config,
32
- device_map='cuda',
33
- torch_dtype=torch.bfloat16,
34
- )
35
-
36
- model.eval()
37
-
38
- print("✅ Model loaded successfully!")
39
- print(f"💾 Memory usage: {torch.cuda.memory_allocated() / 1e9:.1f} GB")
40
-
41
- return model, processor
42
-
43
- # Example usage
44
- if __name__ == "__main__":
45
- model, processor = load_quantized_model()
46
-
47
- # Generate audio
48
- text = "Speaker 1: Hello! Speaker 2: Hi there!"
49
- inputs = processor(
50
- text=[text],
51
- voice_samples=[["path/to/voice1.wav", "path/to/voice2.wav"]],
52
- padding=True,
53
- return_tensors="pt",
54
- )
55
-
56
- with torch.no_grad():
57
- outputs = model.generate(**inputs)
58
-
59
- # Save audio
60
- processor.save_audio(outputs.speech_outputs[0], "output.wav")
 
8bit/minimal_memory_output.wav DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3cf133304229512e369c0b4db51c7d8ebbab43dd8c7945b5bf8e9b727185893
3
- size 313644
 
8bit/model-00001-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:68f98075dac463766219e6e61ff5fe9ab969f8fea621a65906f1d6793f2eaf72
3
- size 4987685394
 
8bit/model-00002-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:48940fb59366de226af5df46020f022d4d651f4563f190142c175b5bf733e9c7
3
- size 4489976774
 
8bit/model-00003-of-00003.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d83c0514c0c9d2675cb4d51ee56b12515ea45770ce35acc5ab0ec4bc7d1bef73
3
- size 1089994880
 
8bit/model.safetensors.index.json DELETED
The diff for this file is too large to render.
 
8bit/preprocessor_config.json DELETED
@@ -1,12 +0,0 @@
1
- {
2
- "processor_class": "VibeVoiceProcessor",
3
- "speech_tok_compress_ratio": 3200,
4
- "db_normalize": true,
5
- "audio_processor": {
6
- "feature_extractor_type": "VibeVoiceTokenizerProcessor",
7
- "sampling_rate": 24000,
8
- "normalize_audio": true,
9
- "target_dB_FS": -25,
10
- "eps": 1e-06
11
- }
12
- }
 
8bit/quantization_config.json DELETED
@@ -1,20 +0,0 @@
1
- {
2
- "quantization_config": {
3
- "quant_method": "bitsandbytes",
4
- "_load_in_8bit": true,
5
- "_load_in_4bit": false,
6
- "llm_int8_threshold": 6.0,
7
- "llm_int8_skip_modules": null,
8
- "llm_int8_enable_fp32_cpu_offload": false,
9
- "llm_int8_has_fp16_weight": false,
10
- "bnb_4bit_quant_type": "fp4",
11
- "bnb_4bit_use_double_quant": false,
12
- "bnb_4bit_compute_dtype": "float32",
13
- "bnb_4bit_quant_storage": "uint8",
14
- "load_in_4bit": false,
15
- "load_in_8bit": true
16
- },
17
- "quantization_method": "bitsandbytes",
18
- "bits": 8,
19
- "quant_type": "nf4"
20
- }
 
8bit/quantize_and_save_vibevoice.py DELETED
@@ -1,330 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Quantize and save VibeVoice model using bitsandbytes
4
- Creates a pre-quantized model that can be shared and loaded directly
5
- """
6
-
7
- import os
8
- import json
9
- import shutil
10
- import torch
11
- from pathlib import Path
12
- from transformers import BitsAndBytesConfig
13
- from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
14
- from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
15
- from transformers.utils import logging
16
- from safetensors.torch import save_file
17
-
18
- logging.set_verbosity_info()
19
-
20
- def quantize_and_save_model(
21
- model_path: str,
22
- output_dir: str,
23
- bits: int = 4,
24
- quant_type: str = "nf4"
25
- ):
26
- """Quantize VibeVoice model and save it for distribution"""
27
-
28
- print(f"\n{'='*70}")
29
- print(f"VIBEVOICE QUANTIZATION - {bits}-bit ({quant_type})")
30
- print(f"{'='*70}")
31
- print(f"Source: {model_path}")
32
- print(f"Output: {output_dir}")
33
- print(f"{'='*70}\n")
34
-
35
- # Create output directory
36
- output_path = Path(output_dir)
37
- output_path.mkdir(parents=True, exist_ok=True)
38
-
39
- # Configure quantization
40
- if bits == 4:
41
- bnb_config = BitsAndBytesConfig(
42
- load_in_4bit=True,
43
- bnb_4bit_compute_dtype=torch.bfloat16,
44
- bnb_4bit_use_double_quant=True,
45
- bnb_4bit_quant_type=quant_type
46
- )
47
- elif bits == 8:
48
- bnb_config = BitsAndBytesConfig(
49
- load_in_8bit=True,
50
- bnb_8bit_compute_dtype=torch.bfloat16,
51
- )
52
- else:
53
- raise ValueError(f"Unsupported bit width: {bits}")
54
-
55
- print("🔧 Loading and quantizing model...")
56
-
57
- # Load the model with quantization
58
- model = VibeVoiceForConditionalGenerationInference.from_pretrained(
59
- model_path,
60
- quantization_config=bnb_config,
61
- device_map='cuda',
62
- torch_dtype=torch.bfloat16,
63
- )
64
-
65
- # Get memory usage
66
- memory_gb = torch.cuda.memory_allocated() / 1e9
67
- print(f"💾 Quantized model memory usage: {memory_gb:.1f} GB")
68
-
69
- # Save the quantized model
70
- print("\n📦 Saving quantized model...")
71
-
72
- # Method 1: Try using save_pretrained with quantization info
73
- try:
74
- # Save model with quantization config
75
- model.save_pretrained(
76
- output_path,
77
- safe_serialization=True,
78
- max_shard_size="5GB"
79
- )
80
-
81
- # Save the quantization config separately
82
- quant_config_dict = {
83
- "quantization_config": bnb_config.to_dict(),
84
- "quantization_method": "bitsandbytes",
85
- "bits": bits,
86
- "quant_type": quant_type
87
- }
88
-
89
- with open(output_path / "quantization_config.json", 'w') as f:
90
- json.dump(quant_config_dict, f, indent=2)
91
-
92
- print("✅ Model saved with integrated quantization")
93
-
94
- except Exception as e:
95
- print(f"⚠️ Standard save failed: {e}")
96
- print("Trying alternative save method...")
97
-
98
- # Method 2: Save state dict with quantized weights
99
- save_quantized_state_dict(model, output_path, bnb_config)
100
-
101
- # Copy processor files
102
- print("\n📋 Copying processor files...")
103
- processor = VibeVoiceProcessor.from_pretrained(model_path)
104
- processor.save_pretrained(output_path)
105
-
106
- # Copy additional config files
107
- for file in ["config.json", "generation_config.json"]:
108
- src = Path(model_path) / file
109
- if src.exists():
110
- shutil.copy2(src, output_path / file)
111
-
112
- # Update config to indicate quantization
113
- config_path = output_path / "config.json"
114
- if config_path.exists():
115
- with open(config_path, 'r') as f:
116
- config = json.load(f)
117
-
118
- config["quantization_config"] = bnb_config.to_dict()
119
- config["_quantization_method"] = "bitsandbytes"
120
-
121
- with open(config_path, 'w') as f:
122
- json.dump(config, f, indent=2)
123
-
124
- print(f"\n✅ Quantized model saved to: {output_path}")
125
-
126
- # Create loading script
127
- create_loading_script(output_path, bits, quant_type)
128
-
129
- return output_path
130
-
131
- def save_quantized_state_dict(model, output_path, bnb_config):
132
- """Alternative method to save quantized weights"""
133
- print("\n🔧 Saving quantized state dict...")
134
-
135
- # Get the state dict
136
- state_dict = model.state_dict()
137
-
138
- # Separate quantized and non-quantized parameters
139
- quantized_state = {}
140
- metadata = {
141
- "quantized_modules": [],
142
- "quantization_config": bnb_config.to_dict()
143
- }
144
-
145
- for name, param in state_dict.items():
146
- # Check if this is a quantized parameter
147
- if hasattr(param, 'quant_state'):
148
- # Store quantization state
149
- metadata["quantized_modules"].append(name)
150
- quantized_state[name] = param.data
151
- else:
152
- # Regular parameter
153
- quantized_state[name] = param
154
-
155
- # Save using safetensors
156
- save_file(quantized_state, output_path / "model.safetensors", metadata=metadata)
157
-
158
- # Save metadata
159
- with open(output_path / "quantization_metadata.json", 'w') as f:
160
- json.dump(metadata, f, indent=2)
161
-
162
- def create_loading_script(output_path, bits, quant_type):
163
- """Create a script to load the quantized model"""
164
-
165
- script_content = f'''#!/usr/bin/env python
166
- """
167
- Load and use the {bits}-bit quantized VibeVoice model
168
- """
169
-
170
- import torch
171
- from transformers import BitsAndBytesConfig
172
- from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
173
- from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
174
-
175
- def load_quantized_model(model_path="{output_path}"):
176
- """Load the pre-quantized VibeVoice model"""
177
-
178
- print("Loading {bits}-bit quantized VibeVoice model...")
179
-
180
- # The model is already quantized, but we need to specify the config
181
- # to ensure proper loading of quantized weights
182
- bnb_config = BitsAndBytesConfig(
183
- load_in_{bits}bit=True,
184
- bnb_{bits}bit_compute_dtype=torch.bfloat16,
185
- {"bnb_4bit_use_double_quant=True," if bits == 4 else ""}
186
- {"bnb_4bit_quant_type='" + quant_type + "'" if bits == 4 else ""}
187
- )
188
-
189
- # Load processor
190
- processor = VibeVoiceProcessor.from_pretrained(model_path)
191
-
192
- # Load model
193
- model = VibeVoiceForConditionalGenerationInference.from_pretrained(
194
- model_path,
195
- quantization_config=bnb_config,
196
- device_map='cuda',
197
- torch_dtype=torch.bfloat16,
198
- )
199
-
200
- model.eval()
201
-
202
- print("✅ Model loaded successfully!")
203
- print(f"💾 Memory usage: {{torch.cuda.memory_allocated() / 1e9:.1f}} GB")
204
-
205
- return model, processor
206
-
207
- # Example usage
208
- if __name__ == "__main__":
209
- model, processor = load_quantized_model()
210
-
211
- # Generate audio
212
- text = "Speaker 1: Hello! Speaker 2: Hi there!"
213
- inputs = processor(
214
- text=[text],
215
- voice_samples=[["path/to/voice1.wav", "path/to/voice2.wav"]],
216
- padding=True,
217
- return_tensors="pt",
218
- )
219
-
220
- with torch.no_grad():
221
- outputs = model.generate(**inputs)
222
-
223
- # Save audio
224
- processor.save_audio(outputs.speech_outputs[0], "output.wav")
225
- '''
226
-
227
- script_path = output_path / f"load_quantized_{bits}bit.py"
228
- with open(script_path, 'w') as f:
229
- f.write(script_content)
230
-
231
- print(f"📝 Created loading script: {script_path}")
232
-
233
- def test_quantized_model(model_path):
234
- """Test loading and generating with the quantized model"""
235
- print(f"\n🧪 Testing quantized model from: {model_path}")
236
-
237
- try:
238
- # Load the quantized model
239
- processor = VibeVoiceProcessor.from_pretrained(model_path)
240
-
241
- # Load with auto-detection of quantization
242
- model = VibeVoiceForConditionalGenerationInference.from_pretrained(
243
- model_path,
244
- device_map='cuda',
245
- torch_dtype=torch.bfloat16,
246
- )
247
-
248
- print("✅ Model loaded successfully!")
249
-
250
- # Quick generation test
251
- test_text = "Speaker 1: Testing quantized model. Speaker 2: It works!"
252
- print(f"\n🎤 Testing generation with: '{test_text}'")
253
-
254
- # Use demo voices
255
- voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"
256
- speaker_voices = [
257
- os.path.join(voices_dir, "en-Alice_woman.wav"),
258
- os.path.join(voices_dir, "en-Carter_man.wav")
259
- ]
260
-
261
- inputs = processor(
262
- text=[test_text],
263
- voice_samples=[speaker_voices],
264
- padding=True,
265
- return_tensors="pt",
266
- return_attention_mask=True,
267
- )
268
-
269
- with torch.no_grad():
270
- outputs = model.generate(
271
- **inputs,
272
- max_new_tokens=None,
273
- cfg_scale=1.3,
274
- tokenizer=processor.tokenizer,
275
- generation_config={'do_sample': False},
276
- )
277
-
278
- print("✅ Generation successful!")
279
-
280
- # Save test output
281
- output_path = Path(model_path) / "test_output.wav"
282
- processor.save_audio(outputs.speech_outputs[0], output_path=str(output_path))
283
- print(f"🔊 Test audio saved to: {output_path}")
284
-
285
- return True
286
-
287
- except Exception as e:
288
- print(f"❌ Test failed: {e}")
289
- return False
290
-
291
- def main():
292
- import argparse
293
- parser = argparse.ArgumentParser(description="Quantize and save VibeVoice model")
294
- parser.add_argument("--model_path", default="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-pt",
295
- help="Path to the original model")
296
- parser.add_argument("--output_dir", default="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit",
297
- help="Output directory for quantized model")
298
- parser.add_argument("--bits", type=int, default=4, choices=[4, 8],
299
- help="Quantization bits (4 or 8)")
300
- parser.add_argument("--quant_type", default="nf4", choices=["nf4", "fp4"],
301
- help="4-bit quantization type")
302
- parser.add_argument("--test", action="store_true",
303
- help="Test the quantized model after saving")
304
-
305
- args = parser.parse_args()
306
-
307
- # Update output dir based on bits
308
- if str(args.bits) not in args.output_dir:
309
- args.output_dir = args.output_dir.replace("4bit", f"{args.bits}bit")
310
-
311
- # Quantize and save
312
- output_path = quantize_and_save_model(
313
- args.model_path,
314
- args.output_dir,
315
- args.bits,
316
- args.quant_type
317
- )
318
-
319
- # Test if requested
320
- if args.test:
321
- test_quantized_model(output_path)
322
-
323
- print(f"\n🎉 Done! Quantized model ready for distribution at: {output_path}")
324
- print(f"\n📦 To share this model:")
325
- print(f"1. Upload the entire '{output_path}' directory")
326
- print(f"2. Users can load it with the provided script or directly with transformers")
327
- print(f"3. The model will load in {args.bits}-bit without additional quantization")
328
-
329
- if __name__ == "__main__":
330
- main()
 
8bit/test_accurate_vram.py DELETED
@@ -1,207 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Accurate VRAM measurement for VibeVoice models
4
- Shows the difference between allocated vs reserved memory
5
- """
6
-
7
- import os
8
- import gc
9
- import torch
10
- import subprocess
11
- import time
12
- from pathlib import Path
13
- from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
14
- from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
15
-
16
- def get_gpu_memory_info():
17
- """Get detailed GPU memory information"""
18
- if not torch.cuda.is_available():
19
- return {}
20
-
21
- # PyTorch memory stats
22
- allocated = torch.cuda.memory_allocated() / 1e9
23
- reserved = torch.cuda.memory_reserved() / 1e9
24
-
25
- # Get nvidia-smi info
26
- try:
27
- result = subprocess.run([
28
- 'nvidia-smi',
29
- '--query-gpu=memory.used,memory.total',
30
- '--format=csv,nounits,noheader'
31
- ], capture_output=True, text=True)
32
-
33
- if result.returncode == 0:
34
- used, total = map(int, result.stdout.strip().split(','))
35
- nvidia_used_gb = used / 1024 # Convert MB to GB
36
- nvidia_total_gb = total / 1024
37
- else:
38
- nvidia_used_gb = 0
39
- nvidia_total_gb = 0
40
- except:
41
- nvidia_used_gb = 0
42
- nvidia_total_gb = 0
43
-
44
- return {
45
- 'allocated': allocated,
46
- 'reserved': reserved,
47
- 'nvidia_smi': nvidia_used_gb,
48
- 'nvidia_total': nvidia_total_gb
49
- }
50
-
51
- def print_memory_report(label, before, after):
52
- """Print detailed memory usage report"""
53
- print(f"\n{label}:")
54
- print(f" PyTorch Allocated: {before['allocated']:.2f} GB → {after['allocated']:.2f} GB "
55
- f"(+{after['allocated'] - before['allocated']:.2f} GB)")
56
- print(f" PyTorch Reserved: {before['reserved']:.2f} GB → {after['reserved']:.2f} GB "
57
- f"(+{after['reserved'] - before['reserved']:.2f} GB)")
58
- print(f" nvidia-smi Total: {before['nvidia_smi']:.2f} GB → {after['nvidia_smi']:.2f} GB "
59
- f"(+{after['nvidia_smi'] - before['nvidia_smi']:.2f} GB)")
60
- print(f" Memory Overhead: {after['reserved'] - after['allocated']:.2f} GB (PyTorch cache)")
61
-
62
- def clear_gpu_memory():
63
- """Aggressively clear GPU memory"""
64
- gc.collect()
65
- if torch.cuda.is_available():
66
- torch.cuda.empty_cache()
67
- torch.cuda.synchronize()
68
- # Force memory pool cleanup
69
- torch.cuda.reset_peak_memory_stats()
70
-
71
- def test_model_memory(model_path, model_name):
72
- """Test model with detailed memory tracking"""
73
- print(f"\n{'='*70}")
74
- print(f"Testing {model_name}")
75
- print(f"{'='*70}")
76
-
77
- # Clear memory and get baseline
78
- clear_gpu_memory()
79
- time.sleep(2) # Let memory settle
80
-
81
- baseline = get_gpu_memory_info()
82
- print(f"\nBaseline GPU Memory:")
83
- print(f" PyTorch Allocated: {baseline['allocated']:.2f} GB")
84
- print(f" PyTorch Reserved: {baseline['reserved']:.2f} GB")
85
- print(f" nvidia-smi Shows: {baseline['nvidia_smi']:.2f} GB / {baseline['nvidia_total']:.2f} GB")
86
-
87
- # Load model
88
- print(f"\nLoading {model_name}...")
89
- load_start = time.time()
90
-
91
- processor = VibeVoiceProcessor.from_pretrained(model_path)
92
- model = VibeVoiceForConditionalGenerationInference.from_pretrained(
93
- model_path,
94
- device_map='cuda',
95
- torch_dtype=torch.bfloat16,
96
- )
97
- model.eval()
98
-
99
- load_time = time.time() - load_start
100
-
101
- # Get memory after loading
102
- loaded = get_gpu_memory_info()
103
- print_memory_report("After Model Loading", baseline, loaded)
104
-
105
- # Test generation to see peak usage
106
- print(f"\nTesting generation...")
107
- test_text = "Speaker 1: Testing memory usage. Speaker 2: Let's see the results!"
108
- voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"
109
- speaker_voices = [
110
- os.path.join(voices_dir, "en-Alice_woman.wav"),
111
- os.path.join(voices_dir, "en-Carter_man.wav")
112
- ]
113
-
114
- inputs = processor(
115
- text=[test_text],
116
- voice_samples=[speaker_voices],
117
- padding=True,
118
- return_tensors="pt",
119
- return_attention_mask=True,
120
- )
121
-
122
- # Monitor during generation
123
- pre_gen = get_gpu_memory_info()
124
-
125
- with torch.no_grad():
126
- outputs = model.generate(
127
- **inputs,
128
- max_new_tokens=None,
129
- cfg_scale=1.3,
130
- tokenizer=processor.tokenizer,
131
- generation_config={'do_sample': False},
132
- )
133
-
134
- post_gen = get_gpu_memory_info()
135
- print_memory_report("During Generation", pre_gen, post_gen)
136
-
137
- # Peak memory stats
138
- if torch.cuda.is_available():
139
- peak_memory = torch.cuda.max_memory_allocated() / 1e9
140
- peak_reserved = torch.cuda.max_memory_reserved() / 1e9
141
- print(f"\nPeak Memory Usage:")
142
- print(f" Peak Allocated: {peak_memory:.2f} GB")
143
- print(f" Peak Reserved: {peak_reserved:.2f} GB")
144
-
145
- # Clean up
146
- del model
147
- del processor
148
- clear_gpu_memory()
149
-
150
- return {
151
- 'name': model_name,
152
- 'allocated': loaded['allocated'] - baseline['allocated'],
153
- 'reserved': loaded['reserved'] - baseline['reserved'],
154
- 'nvidia_smi': loaded['nvidia_smi'] - baseline['nvidia_smi'],
155
- 'peak_allocated': peak_memory,
156
- 'peak_reserved': peak_reserved
157
- }
158
-
159
- def main():
160
- print("="*70)
161
- print("ACCURATE VRAM MEASUREMENT FOR VIBEVOICE")
162
- print("="*70)
163
- print("\nNote: PyTorch reserves extra memory for efficiency.")
164
- print("nvidia-smi shows total reserved memory, not just allocated.")
165
-
166
- models = [
167
- {
168
- "path": "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-pt",
169
- "name": "16-bit Original"
170
- },
171
- {
172
- "path": "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit",
173
- "name": "4-bit Quantized"
174
- }
175
- ]
176
-
177
- results = []
178
- for model_info in models:
179
- try:
180
- result = test_model_memory(model_info["path"], model_info["name"])
181
- results.append(result)
182
- time.sleep(5)
183
- except Exception as e:
184
- print(f"Error testing {model_info['name']}: {e}")
185
-
186
- # Summary
187
- print("\n" + "="*70)
188
- print("MEMORY USAGE SUMMARY")
189
- print("="*70)
190
- print(f"\n{'Model':<20} {'Allocated':<12} {'Reserved':<12} {'nvidia-smi':<12} {'Peak':<12}")
191
- print("-"*70)
192
-
193
- for r in results:
194
- print(f"{r['name']:<20} "
195
- f"{r['allocated']:<12.2f} "
196
- f"{r['reserved']:<12.2f} "
197
- f"{r['nvidia_smi']:<12.2f} "
198
- f"{r['peak_allocated']:<12.2f}")
199
-
200
- print("\n💡 Key Insights:")
201
- print("- 'Allocated' = Actual model weights in memory")
202
- print("- 'Reserved' = Total GPU memory reserved by PyTorch (includes cache)")
203
- print("- 'nvidia-smi' = What nvidia-smi reports (includes all overhead)")
204
- print("- The difference is PyTorch's memory pool for efficiency")
205
-
206
- if __name__ == "__main__":
207
- main()
 
8bit/use_quantized_model.py DELETED
@@ -1,70 +0,0 @@
1
- #!/usr/bin/env python
2
- """
3
- Simple example of using the pre-quantized VibeVoice model
4
- No need for on-the-fly quantization - loads much faster!
5
- """
6
-
7
- import os
8
- import torch
9
- from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
10
- from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
11
-
12
- def main():
13
- # Path to the pre-quantized model
14
- model_path = "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit"
15
-
16
- print("Loading pre-quantized VibeVoice 4-bit model...")
17
-
18
- # Load processor
19
- processor = VibeVoiceProcessor.from_pretrained(model_path)
20
-
21
- # Load the pre-quantized model
22
- # The quantization config is already saved in the model
23
- model = VibeVoiceForConditionalGenerationInference.from_pretrained(
24
- model_path,
25
- device_map='cuda',
26
- torch_dtype=torch.bfloat16,
27
- )
28
- model.eval()
29
-
30
- # Check memory usage
31
- memory_gb = torch.cuda.memory_allocated() / 1e9
32
- print(f"✅ Model loaded! Memory usage: {memory_gb:.1f} GB")
33
-
34
- # Example generation
35
- text = "Speaker 1: Welcome to our podcast! Speaker 2: Thanks for having me!"
36
-
37
- # Voice samples (using demo voices)
38
- voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"
39
- speaker_voices = [
40
- os.path.join(voices_dir, "en-Alice_woman.wav"),
41
- os.path.join(voices_dir, "en-Carter_man.wav")
42
- ]
43
-
44
- # Process inputs
45
- inputs = processor(
46
- text=[text],
47
- voice_samples=[speaker_voices],
48
- padding=True,
49
- return_tensors="pt",
50
- return_attention_mask=True,
51
- )
52
-
53
- # Generate
54
- print(f"\nGenerating: '{text}'")
55
- with torch.no_grad():
56
- outputs = model.generate(
57
- **inputs,
58
- max_new_tokens=None,
59
- cfg_scale=1.3,
60
- tokenizer=processor.tokenizer,
61
- generation_config={'do_sample': False},
62
- )
63
-
64
- # Save output
65
- output_path = "quantized_output.wav"
66
- processor.save_audio(outputs.speech_outputs[0], output_path=output_path)
67
- print(f"✅ Audio saved to: {output_path}")
68
-
69
- if __name__ == "__main__":
70
- main()