VoiceCraft_gradio

Running

App Files Community

jason-on-salt-a40 committed on Apr 20, 2024

Commit

b1f4e2f

1 Parent(s): 579d79b

fix space error. fix encodec download path

Browse files

Files changed (1) hide show

app.py +9 -6

app.py CHANGED Viewed

@@ -63,7 +63,7 @@ class WhisperModel:
     def transcribe(self, audio_path):
         return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
-@spaces.GPU(duration=120)
 class WhisperxModel:
     def __init__(self, model_name, align_model: WhisperxAlignModel):
         from whisperx import load_model
@@ -100,7 +100,7 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
     encodec_fn = f"{MODELS_PATH}/encodec_4cb2048_giga.th"
     if not os.path.exists(encodec_fn):
-        os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th")
     voicecraft_model = {
         "config": config,
@@ -114,9 +114,11 @@ def load_models(whisper_backend_name, whisper_model_name, alignment_model_name,
 def get_transcribe_state(segments):
     words_info = [word_info for segment in segments for word_info in segment["words"]]
     return {
         "segments": segments,
-        "transcript": " ".join([segment["text"] for segment in segments]),
         "words_info": words_info,
         "transcript_with_start_time": " ".join([f"{word['start']} {word['word']}" for word in words_info]),
         "transcript_with_end_time": " ".join([f"{word['word']} {word['end']}" for word in words_info]),
@@ -140,7 +142,7 @@ def transcribe(seed, audio_path):
         state
     ]
 def align_segments(transcript, audio_path):
     from aeneas.executetask import ExecuteTask
     from aeneas.task import Task
@@ -363,7 +365,7 @@ If disabled, you should write the target transcript yourself:</br>
  - In Edit mode write full prompt</br>
 """
-demo_original_transcript = " But when I had approached so near to them, the common object, which the sense deceives, lost not by distance any of its marks."
 demo_text = {
     "TTS": {
@@ -603,6 +605,7 @@ if __name__ == "__main__":
     parser.add_argument("--models-path", default="./pretrained_models", help="Path to voicecraft models directory")
     parser.add_argument("--port", default=7860, type=int, help="App port")
     parser.add_argument("--share", action="store_true", help="Launch with public url")
     os.environ["USER"] = os.getenv("USER", "user")
     args = parser.parse_args()
@@ -611,4 +614,4 @@ if __name__ == "__main__":
     MODELS_PATH = args.models_path
     app = get_app()
-    app.queue().launch(share=args.share, server_port=args.port)

     def transcribe(self, audio_path):
         return self.model.transcribe(audio_path, suppress_tokens=self.supress_tokens, word_timestamps=True)["segments"]
 class WhisperxModel:
     def __init__(self, model_name, align_model: WhisperxAlignModel):
         from whisperx import load_model
     encodec_fn = f"{MODELS_PATH}/encodec_4cb2048_giga.th"
     if not os.path.exists(encodec_fn):
+        os.system(f"wget https://huggingface.co/pyp1/VoiceCraft/resolve/main/encodec_4cb2048_giga.th -O " + encodec_fn)
     voicecraft_model = {
         "config": config,
 def get_transcribe_state(segments):
     words_info = [word_info for segment in segments for word_info in segment["words"]]
+    transcript = " ".join([segment["text"] for segment in segments])
+    transcript = transcript[1:] if transcript[0] == " " else transcript
     return {
         "segments": segments,
+        "transcript": transcript,
         "words_info": words_info,
         "transcript_with_start_time": " ".join([f"{word['start']} {word['word']}" for word in words_info]),
         "transcript_with_end_time": " ".join([f"{word['word']} {word['end']}" for word in words_info]),
         state
     ]
+@spaces.GPU(duration=60)
 def align_segments(transcript, audio_path):
     from aeneas.executetask import ExecuteTask
     from aeneas.task import Task
  - In Edit mode write full prompt</br>
 """
+demo_original_transcript = "But when I had approached so near to them, the common object, which the sense deceives, lost not by distance any of its marks."
 demo_text = {
     "TTS": {
     parser.add_argument("--models-path", default="./pretrained_models", help="Path to voicecraft models directory")
     parser.add_argument("--port", default=7860, type=int, help="App port")
     parser.add_argument("--share", action="store_true", help="Launch with public url")
+    parser.add_argument("--server_name", default="127.0.0.1", type=str, help="Server name for launching the app. 127.0.0.1 for localhost; 0.0.0.0 to allow access from other machines in the local network. Might also give access to external users depends on the firewall settings.")
     os.environ["USER"] = os.getenv("USER", "user")
     args = parser.parse_args()
     MODELS_PATH = args.models_path
     app = get_app()
+    app.queue().launch(share=args.share, server_name=args.server_name, server_port=args.port)