#!/usr/bin/env python
"""
Load VibeVoice 4-bit in ~7GB VRAM
Minimize PyTorch's memory pool overhead
"""
import os
import gc
import torch
from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
# CRITICAL: Set these BEFORE any CUDA operations
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128,expandable_segments:True'
# Reduce memory fraction to force PyTorch to be more conservative
torch.cuda.set_per_process_memory_fraction(0.75) # This limits reserved memory
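# Note on the allocator config above: 'max_split_size_mb:128' stops the caching allocator
# from splitting blocks larger than 128 MB, and 'expandable_segments:True' lets segments
# grow in place instead of reserving new ones. Both mainly shrink the gap between
# "reserved" and "allocated" memory caused by fragmentation, and both must be in place
# before the first CUDA allocation.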
def get_memory_stats():
    """Get detailed memory statistics"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        free = torch.cuda.mem_get_info()[0] / 1e9
        total = torch.cuda.mem_get_info()[1] / 1e9
        return {
            'allocated': allocated,
            'reserved': reserved,
            'free': free,
            'total': total,
            'used': total - free,
        }
    return {}
def load_model_minimal(model_path):
    """Load model with absolute minimal memory overhead"""
    print("Loading 4-bit model with minimal overhead...")

    # Start clean
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # Report initial state
    stats = get_memory_stats()
    print("\nInitial state:")
    print(f"  GPU total: {stats['total']:.2f} GB")
    print(f"  GPU used:  {stats['used']:.2f} GB")
    print(f"  GPU free:  {stats['free']:.2f} GB")

    # Load processor
    processor = VibeVoiceProcessor.from_pretrained(model_path)

    # Load model - let it use the default device map
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
        low_cpu_mem_usage=True,
    )

    # Immediately set to eval and disable gradients
    model.eval()
    model.requires_grad_(False)

    # Force cleanup
    gc.collect()
    torch.cuda.empty_cache()

    # Report after loading
    stats = get_memory_stats()
    print("\nAfter loading:")
    print(f"  Allocated: {stats['allocated']:.2f} GB (actual model)")
    print(f"  Reserved:  {stats['reserved']:.2f} GB (PyTorch total)")
    print(f"  Overhead:  {stats['reserved'] - stats['allocated']:.2f} GB")
    print(f"  System reports: {stats['used']:.2f} GB used")

    return model, processor
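
# Optional sketch, never called below (assumptions: the checkpoint also exists in full
# precision, and VibeVoiceForConditionalGenerationInference.from_pretrained forwards
# quantization_config to transformers/bitsandbytes like a standard transformers model;
# not verified here). It shows what quantizing to 4-bit NF4 at load time would look like:
def load_model_quantized_on_the_fly(model_path):
    """Hypothetical helper: quantize full-precision weights to NF4 while loading."""
    from transformers import BitsAndBytesConfig

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,  # quantize the quantization constants for a small extra saving
    )
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        quantization_config=bnb_config,
        low_cpu_mem_usage=True,
    )
    model.eval()
    model.requires_grad_(False)
    return model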
def generate_minimal(model, processor, text, speaker_voices):
    """Generate with minimal memory overhead"""
    # Process inputs
    inputs = processor(
        text=[text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )
    # Generate without gradients; the KV cache stays enabled (see note below)
    with torch.no_grad():
        # Release cached blocks first to reduce fragmentation
        torch.cuda.empty_cache()
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,
            cfg_scale=1.3,
            tokenizer=processor.tokenizer,
            generation_config={
                'do_sample': False,
                'use_cache': True,  # keeping the cache is faster and avoids recomputing attention each step
            },
        )

    # Cleanup
    del inputs
    gc.collect()

    return outputs
def try_memory_reduction_tricks():
    """Additional tricks to reduce memory"""
    print("\n🔧 Applying memory reduction tricks...")

    # 1. Switch to the native allocator backend if this PyTorch build exposes the setting
    if hasattr(torch.cuda, 'set_allocator_settings'):
        torch.cuda.set_allocator_settings(backend='native')

    # 2. Force synchronization and cleanup
    torch.cuda.synchronize()
    torch.cuda.empty_cache()

    # 3. Try to release unused cached blocks and report how much was returned
    reserved_before = torch.cuda.memory_reserved()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.empty_cache()
    reserved_after = torch.cuda.memory_reserved()

    if reserved_before > reserved_after:
        print(f"  ✓ Freed {(reserved_before - reserved_after) / 1e9:.2f} GB")
def main():
    # Paths
    model_path = "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit"
    voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"

    print("=" * 60)
    print("VIBEVOICE 4-BIT - 7GB TARGET MODE")
    print("=" * 60)

    # Apply tricks before loading
    try_memory_reduction_tricks()

    # Load model
    model, processor = load_model_minimal(model_path)

    # Try to compact memory after loading
    try_memory_reduction_tricks()

    # Test generation
    test_text = "Speaker 1: Testing minimal memory. Speaker 2: Hope it works!"
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav"),
    ]

    print("\n🎤 Generating audio...")
    outputs = generate_minimal(model, processor, test_text, speaker_voices)

    # Final stats
    stats = get_memory_stats()
    print("\nFinal memory usage:")
    print(f"  Allocated: {stats['allocated']:.2f} GB")
    print(f"  Reserved:  {stats['reserved']:.2f} GB")
    print(f"  Total used: {stats['used']:.2f} GB")

    # Save output
    output_path = "7gb_target_output.wav"
    processor.save_audio(outputs.speech_outputs[0], output_path=output_path)
    print(f"\n✅ Audio saved to: {output_path}")

    # Analysis
    print("\n📊 Analysis:")
    overhead = stats['reserved'] - stats['allocated']
    print(f"The {overhead:.2f} GB overhead comes from:")
    print("- PyTorch memory pool fragmentation")
    print("- CUDA kernel workspace")
    print("- Temporary buffers for operations")
    print("\n💡 The model IS 6.6GB, but PyTorch needs workspace!")

    # Extreme options
    print("\n🚀 To truly get to 7GB total, you could:")
    print("1. Use bnb 3-bit quantization (experimental)")
    print("2. Prune some model layers")
    print("3. Use a custom CUDA allocator")
    print("4. Compile with torch.compile() for memory efficiency")
if __name__ == "__main__":
    main()