Spaces:
Sleeping
Sleeping
| from boson_multimodal.serve.serve_engine import HiggsAudioServeEngine, HiggsAudioResponse | |
| from boson_multimodal.data_types import ChatMLSample, Message, AudioContent | |
| import torch | |
| import torchaudio | |
| import time | |
| import click | |
# Identifiers handed to HiggsAudioServeEngine for the generation model and
# its audio tokenizer.
MODEL_PATH = "bosonai/higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "bosonai/higgs-audio-v2-tokenizer"

# System prompt: a generation instruction followed by a scene description
# wrapped in <|scene_desc_start|> / <|scene_desc_end|> markers.
system_prompt = (
    "Generate audio following instruction.\n\n<|scene_desc_start|>\nAudio is recorded from a quiet room.\n<|scene_desc_end|>"
)

# Conversation sent to the engine: the system prompt, then the user text
# for which audio is generated.
messages = [
    Message(
        role="system",
        content=system_prompt,
    ),
    Message(
        role="user",
        content="The sun rises in the east and sets in the west. This simple fact has been observed by humans for thousands of years.",
    ),
]

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

serve_engine = HiggsAudioServeEngine(MODEL_PATH, AUDIO_TOKENIZER_PATH, device=device)

output: HiggsAudioResponse = serve_engine.generate(
    chat_ml_sample=ChatMLSample(messages=messages),
    max_new_tokens=1024,
    temperature=0.3,
    top_p=0.95,
    top_k=50,
    stop_strings=["<|end_of_text|>", "<|eot_id|>"],
)

# torch.from_numpy() rejects None with an opaque TypeError, so fail with an
# explicit message if the response carries no audio.
# NOTE(review): assumes HiggsAudioResponse.audio may be None when nothing was
# generated -- confirm against the serve engine.
if output.audio is None:
    raise RuntimeError("Generation finished without producing any audio; nothing to save.")

# Indexing with [None, :] prepends a leading axis before saving.
# NOTE(review): presumably output.audio is a 1-D waveform, giving the
# (channels, samples) layout torchaudio.save expects -- confirm.
waveform = torch.from_numpy(output.audio)[None, :]
torchaudio.save("output.wav", waveform, output.sampling_rate)