"""
Simple example of using the pre-quantized VibeVoice model

No need for on-the-fly quantization - loads much faster!
"""

import os

import torch

from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor


def main():
    # Path to the locally saved, pre-quantized 4-bit checkpoint
    model_path = "/home/deveraux/Desktop/vibevoice/VibeVoice-Large-4bit"

    print("Loading pre-quantized VibeVoice 4-bit model...")

    # The processor handles text tokenization and voice-sample preprocessing
    processor = VibeVoiceProcessor.from_pretrained(model_path)
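
    # Optional sanity check (my addition, not in the original script): the model
    # below is loaded with device_map='cuda', so fail early with a clear message
    # if no CUDA device is visible.
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA GPU required: the model is loaded with device_map='cuda'")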

    # Weights are already quantized, so loading skips the quantization pass;
    # bfloat16 is the dtype used for the remaining non-quantized tensors
    model = VibeVoiceForConditionalGenerationInference.from_pretrained(
        model_path,
        device_map='cuda',
        torch_dtype=torch.bfloat16,
    )
    model.eval()

    # Report actual GPU memory consumed by the loaded weights
    memory_gb = torch.cuda.memory_allocated() / 1e9
    print(f"✅ Model loaded! Memory usage: {memory_gb:.1f} GB")

    # Multi-speaker script: each "Speaker N:" turn corresponds to one voice sample
    text = "Speaker 1: Welcome to our podcast! Speaker 2: Thanks for having me!"

    # Reference voice recordings, one per speaker (order matters)
    voices_dir = "/home/deveraux/Desktop/vibevoice/VibeVoice-main/demo/voices"
    speaker_voices = [
        os.path.join(voices_dir, "en-Alice_woman.wav"),
        os.path.join(voices_dir, "en-Carter_man.wav"),
    ]
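
    # Optional guard (my addition, not in the original script): check that the
    # reference recordings exist before preprocessing, so a wrong voices_dir
    # fails here with a readable message instead of inside the audio loader.
    for voice in speaker_voices:
        if not os.path.exists(voice):
            raise FileNotFoundError(f"Missing voice sample: {voice}")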

    # Batch of one: text and voice_samples are parallel lists, one entry per batch item
    inputs = processor(
        text=[text],
        voice_samples=[speaker_voices],
        padding=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    print(f"\nGenerating: '{text}'")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=None,  # no explicit cap on generated tokens
            cfg_scale=1.3,  # classifier-free guidance scale
            tokenizer=processor.tokenizer,
            generation_config={'do_sample': False},  # deterministic (greedy) decoding
        )

    # Save the first (and only) generated waveform
    output_path = "quantized_output.wav"
    processor.save_audio(outputs.speech_outputs[0], output_path=output_path)
    print(f"✅ Audio saved to: {output_path}")


if __name__ == "__main__":
    main()