"""
Load VibeVoice 4-bit in ~7 GB of VRAM by minimizing PyTorch's memory pool
overhead.
"""

import gc
import os

# PYTORCH_CUDA_ALLOC_CONF is read when the CUDA caching allocator initializes,
# so it must be in the environment before torch touches CUDA; setting it ahead
# of the torch import guarantees that. max_split_size_mb:128 keeps large free
# blocks from being carved into fragments too small to reuse, and
# expandable_segments:True lets segments grow in place instead of reserving
# new fixed-size ones.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True'

import torch

from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor

# Cap this process at 75% of total VRAM so an oversized allocation fails fast
# instead of claiming the whole card.
torch.cuda.set_per_process_memory_fraction(0.75)
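
# Optional sanity check, a minimal sketch: recent PyTorch builds expose the
# active allocator backend; the hasattr guard keeps older versions working.
if hasattr(torch.cuda, 'get_allocator_backend'):
    print(f"CUDA allocator backend: {torch.cuda.get_allocator_backend()}")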
|
|
|
def get_memory_stats():
    """Return detailed GPU memory statistics, in GB."""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e9  # tensors actually in use
        reserved = torch.cuda.memory_reserved() / 1e9    # pool PyTorch holds from the driver
        free, total = (x / 1e9 for x in torch.cuda.mem_get_info())
        return {
            'allocated': allocated,
            'reserved': reserved,
            'free': free,
            'total': total,
            'used': total - free,  # what the system (e.g. nvidia-smi) reports
        }
    return {}
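
# Illustrative (hypothetical numbers): on an idle 24 GB card this might return
# {'allocated': 0.0, 'reserved': 0.0, 'free': 23.5, 'total': 24.0, 'used': 0.5};
# 'used' can exceed 'reserved' because the CUDA context itself occupies memory.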
|
|
|
def load_model_minimal(model_path):
    """Load the model with minimal memory overhead."""
    print("Loading 4-bit model with minimal overhead...")

    # Start from a clean slate so the baseline numbers below are meaningful.
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    stats = get_memory_stats()
    print("\nInitial state:")
    print(f"  GPU total: {stats['total']:.2f} GB")
    print(f"  GPU used: {stats['used']:.2f} GB")
    print(f"  GPU free: {stats['free']:.2f} GB")

    processor = VibeVoiceProcessor.from_pretrained(model_path)

    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,  # stream weights in rather than building a full CPU copy first
    )

    # Inference only: freezing the weights keeps autograd from ever allocating
    # gradient buffers.
    model.eval()
    model.requires_grad_(False)

    gc.collect()
    torch.cuda.empty_cache()

    stats = get_memory_stats()
    print("\nAfter loading:")
    print(f"  Allocated: {stats['allocated']:.2f} GB (actual model)")
    print(f"  Reserved: {stats['reserved']:.2f} GB (PyTorch total)")
    print(f"  Overhead: {stats['reserved'] - stats['allocated']:.2f} GB")
    print(f"  System reports: {stats['used']:.2f} GB used")

    return model, processor
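
# If the checkpoint were not already stored quantized, the usual route would
# be to quantize at load time with bitsandbytes; a sketch, assuming
# bitsandbytes and a transformers version with BitsAndBytesConfig:
#
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(
#       load_in_4bit=True,
#       bnb_4bit_quant_type="nf4",
#       bnb_4bit_compute_dtype=torch.bfloat16,
#   )
#   model = VibeVoiceForConditionalGenerationInference.from_pretrained(
#       model_path, quantization_config=bnb_config, device_map="cuda",
#   )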
|
|
|
def generate_minimal(model, processor, text, speaker_voices):
    """Generate audio with minimal memory overhead."""
    inputs = processor(
        text=[text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )
    # The processor returns CPU tensors; move them to the GPU where the model
    # lives (a no-op for anything already there; non-tensors pass through).
    inputs = {k: v.to('cuda') if torch.is_tensor(v) else v for k, v in inputs.items()}

    with torch.no_grad():
        torch.cuda.empty_cache()
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
            generation_config={
                'do_sample': False,  # greedy decoding avoids sampling buffers
                'use_cache': True,   # KV cache trades memory for speed
            },
        )

    # Drop input tensors promptly so their memory can be reused.
    del inputs
    gc.collect()

    return outputs
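
# A stricter alternative to torch.no_grad() above is torch.inference_mode(),
# which also skips view/version-counter bookkeeping; wrapping the generate()
# call in it may shave a little more memory, assuming nothing downstream needs
# autograd metadata on the outputs.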
|
|
|
def try_memory_reduction_tricks():
    """Apply additional tricks to shrink PyTorch's memory pool."""
    print("\n🔧 Applying memory reduction tricks...")

    # Only present on some PyTorch builds, hence the guard.
    if hasattr(torch.cuda, 'set_allocator_settings'):
        torch.cuda.set_allocator_settings(backend='native')

    # Flush pending kernels, then release cached blocks back to the driver.
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

    reserved_before = torch.cuda.memory_reserved()

    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()

    reserved_after = torch.cuda.memory_reserved()
    if reserved_before > reserved_after:
        print(f"  ✓ Freed {(reserved_before - reserved_after) / 1e9:.2f} GB")
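
# The stable way to tune the caching allocator is the PYTORCH_CUDA_ALLOC_CONF
# variable set at the top of this file; it also accepts a
# garbage_collection_threshold option (e.g. 'garbage_collection_threshold:0.8')
# that makes the allocator reclaim unused cached blocks before growing the pool.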
|
|
|
def main():
    model_path = "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit"
    voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"

    print("=" * 60)
    print("VIBEVOICE 4-BIT - 7GB TARGET MODE")
    print("=" * 60)

    try_memory_reduction_tricks()

    model, processor = load_model_minimal(model_path)

    try_memory_reduction_tricks()

    test_text = "Speaker 1: Testing minimal memory. Speaker 2: Hope it works!"
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav"),
    ]

    print("\n🎤 Generating audio...")
    outputs = generate_minimal(model, processor, test_text, speaker_voices)

    stats = get_memory_stats()
    print("\nFinal memory usage:")
    print(f"  Allocated: {stats['allocated']:.2f} GB")
    print(f"  Reserved: {stats['reserved']:.2f} GB")
    print(f"  Total used: {stats['used']:.2f} GB")

    output_path = "7gb_target_output.wav"
    processor.save_audio(outputs.speech_outputs[0], output_path=output_path)
    print(f"\n✅ Audio saved to: {output_path}")

    print("\n📊 Analysis:")
    overhead = stats['reserved'] - stats['allocated']
    print(f"The {overhead:.2f} GB overhead comes from:")
    print("- PyTorch memory pool fragmentation")
    print("- CUDA kernel workspace")
    print("- Temporary buffers for operations")
    print("\n💡 The model itself is 6.6 GB, but PyTorch needs workspace!")

    print("\n🚀 To truly get to 7GB total, you could:")
    print("1. Use bnb 3-bit quantization (experimental)")
    print("2. Prune some model layers")
    print("3. Use a custom CUDA allocator")
    print("4. Compile with torch.compile() for memory efficiency")


if __name__ == "__main__":
    main()