update
gpt-omni committed · commit 411819d · parent 9b186d7
app.py CHANGED
@@ -30,7 +30,7 @@ import soundfile as sf
 from litgpt.model import GPT, Config
 from lightning.fabric.utilities.load import _lazy_load as lazy_load
 from utils.snac_utils import layershift, reconscruct_snac, reconstruct_tensors, get_time_str
-from utils.snac_utils import get_snac
+from utils.snac_utils import get_snac
 import whisper
 from tqdm import tqdm
 from huggingface_hub import snapshot_download
@@ -80,19 +80,19 @@ if not os.path.exists(ckpt_dir):
 snacmodel = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to(device)
 whispermodel = whisper.load_model("small").to(device)
 text_tokenizer = Tokenizer(ckpt_dir)
-fabric = L.Fabric(devices=1, strategy="auto")
+# fabric = L.Fabric(devices=1, strategy="auto")
 config = Config.from_file(ckpt_dir + "/model_config.yaml")
 config.post_adapter = False
 
 model = GPT(config, device=device)
 
-# model = fabric.setup(model)
 state_dict = lazy_load(ckpt_dir + "/lit_model.pth")
 model.load_state_dict(state_dict, strict=True)
 model = model.to(device)
 model.eval()
 
 
+@spaces.GPU
 def get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device):
     with torch.no_grad():
         mel = mel.unsqueeze(0).to(device)
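The two changes in this hunk go together: on a ZeroGPU Space no GPU is attached at import time, so the persistent Lightning Fabric setup is commented out and GPU-touching functions are instead wrapped in @spaces.GPU, which borrows a GPU only for the duration of each decorated call. A minimal sketch of that pattern, assuming the huggingface spaces package is installed; the names below are illustrative, not from app.py:

import spaces
import torch

net = torch.nn.Linear(4, 4)  # placeholder module standing in for the GPT built above

@spaces.GPU  # ZeroGPU attaches a GPU only while this call is running
def forward_on_gpu(x: torch.Tensor) -> torch.Tensor:
    m = net.to("cuda")            # move weights to the device inside the decorated call
    return m(x.to("cuda")).cpu()  # return on CPU so callers stay device-agnostic

print(forward_on_gpu(torch.randn(2, 4)))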
@@ -128,6 +128,7 @@ def get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device):
     return torch.stack([audio_feature, audio_feature]), stacked_inputids
 
 
+@spaces.GPU
 def next_token_batch(
     model: GPT,
     audio_features: torch.tensor,
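Context for this decorator: get_input_ids_whisper_ATBatch returns the whisper features stacked twice, and run_AT_batch_stream later sets a KV cache with batch_size=2, so next_token_batch appears to decode two parallel streams (audio tokens and text tokens) from the same input in one batched forward pass. A toy illustration of the stacking, with made-up shapes:

import torch

feat = torch.randn(50, 768)           # one audio feature sequence (illustrative shape)
batched = torch.stack([feat, feat])   # duplicate into a batch of two
assert batched.shape == (2, 50, 768)  # row 0 and row 1 can then decode different streams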
@@ -162,9 +163,19 @@ def load_audio(path):
     mel = whisper.log_mel_spectrogram(audio)
     return mel, int(duration_ms / 20) + 1
 
+
+@spaces.GPU
+def generate_audio_data(snac_tokens, snacmodel, device=None):
+    audio = reconstruct_tensors(snac_tokens, device)
+    with torch.inference_mode():
+        audio_hat = snacmodel.decode(audio)
+    audio_data = audio_hat.cpu().numpy().astype(np.float64) * 32768.0
+    audio_data = audio_data.astype(np.int16)
+    audio_data = audio_data.tobytes()
+    return audio_data
+
 
 # @torch.inference_mode()
-@spaces.GPU
 def run_AT_batch_stream(
     audio_path,
     stream_stride=4,
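The new generate_audio_data decodes SNAC tokens back to a waveform on the GPU, then converts the float samples to raw 16-bit PCM bytes by scaling by 32768. A standalone sketch of that conversion step (not from app.py); the clip guards the +1.0 edge case, where a straight multiply-and-cast can overflow int16:

import numpy as np

def float_to_pcm16(wave: np.ndarray) -> bytes:
    # wave holds float samples in [-1.0, 1.0]; int16 spans [-32768, 32767]
    scaled = np.clip(wave, -1.0, 1.0) * 32767.0
    return scaled.astype(np.int16).tobytes()

pcm = float_to_pcm16(np.sin(np.linspace(0.0, 2.0 * np.pi, 24000)))
print(len(pcm))  # 48000 bytes: two bytes per sample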
@@ -178,11 +189,10 @@ def run_AT_batch_stream(
 
     assert os.path.exists(audio_path), f"audio file {audio_path} not found"
 
-    # with self.fabric.init_tensor():
     model.set_kv_cache(batch_size=2)
 
     mel, leng = load_audio(audio_path)
-    audio_feature, input_ids = get_input_ids_whisper_ATBatch(mel, leng,
+    audio_feature, input_ids = get_input_ids_whisper_ATBatch(mel, leng, whispermodel, device)
     T = input_ids[0].size(1)
     device = input_ids[0].device
 
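model.set_kv_cache(batch_size=2) preallocates litgpt's per-layer key/value buffers so each decoding step appends one position instead of recomputing attention over the whole prefix. A toy cache in that spirit, purely illustrative rather than litgpt's implementation:

import torch

class ToyKVCache:
    def __init__(self, batch: int, heads: int, max_len: int, dim: int):
        self.k = torch.zeros(batch, heads, max_len, dim)  # preallocated key buffer
        self.v = torch.zeros(batch, heads, max_len, dim)  # preallocated value buffer
        self.pos = 0  # next free slot

    def append(self, k_new: torch.Tensor, v_new: torch.Tensor):
        # k_new, v_new: (batch, heads, 1, dim) for the current step
        self.k[:, :, self.pos : self.pos + 1] = k_new
        self.v[:, :, self.pos : self.pos + 1] = v_new
        self.pos += 1
        return self.k[:, :, : self.pos], self.v[:, :, : self.pos]

cache = ToyKVCache(batch=2, heads=8, max_len=512, dim=64)
k, v = cache.append(torch.randn(2, 8, 1, 64), torch.randn(2, 8, 1, 64))
print(k.shape)  # torch.Size([2, 8, 1, 64]) after one step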