#!/usr/bin/env python
"""Load and use the 4-bit quantized VibeVoice model."""

import torch
from transformers import BitsAndBytesConfig

from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor


def load_quantized_model(model_path="/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit"):
    """Load the pre-quantized VibeVoice model and its processor."""
    print("Loading 4-bit quantized VibeVoice model...")

    # The checkpoint is already quantized, but we still pass the config so the
    # 4-bit weights load correctly (NF4 with double quantization, bfloat16 compute).
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Load model
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        quantization_config=bnb_config,
        device_map="cuda",
        torch_dtype=torch.bfloat16,
    )
    model.eval()

    print("✅ Model loaded successfully!")
    print(f"💾 Memory usage: {torch.cuda.memory_allocated() / 1e9:.1f} GB")

    return model, processor


# Example usage
if __name__ == "__main__":
    model, processor = load_quantized_model()

    # Prepare a two-speaker script with one reference voice sample per speaker
    text = "Speaker 1: Hello! Speaker 2: Hi there!"
    inputs = processor(
        text=[text],
        voice_samples=[["path/to/voice1.wav", "path/to/voice2.wav"]],
        padding=True,
        return_tensors="pt",
    )

    # Move input tensors to the model's device so generation does not fail on a
    # CPU/GPU mismatch; non-tensor entries are passed through unchanged.
    inputs = {k: v.to(model.device) if hasattr(v, "to") else v for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model.generate(**inputs)

    # Save the generated audio
    processor.save_audio(outputs.speech_outputs[0], "output.wav")
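
    # Optional sanity check (a minimal sketch, not part of the original workflow):
    # confirm the linear layers were actually loaded as 4-bit bitsandbytes modules
    # and report the model's memory footprint. Assumes bitsandbytes is installed,
    # which BitsAndBytesConfig above already requires.
    import bitsandbytes as bnb

    n_4bit = sum(isinstance(m, bnb.nn.Linear4bit) for m in model.modules())
    print(f"4-bit linear layers: {n_4bit}")
    print(f"Model footprint: {model.get_memory_footprint() / 1e9:.1f} GB")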