seawolf2357 committed on
Commit 65cdab5 · verified · 1 Parent(s): 2d4dca5

Update app.py

Files changed (1)
  1. app.py +142 -183
app.py CHANGED
@@ -1,194 +1,153 @@
- # Copyright (c) 2023 Amphion.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
-
  import gradio as gr
- import os
- import inference
-
- SUPPORTED_TARGET_SINGERS = {
-     "Adele": "vocalist_l1_Adele",
-     "Beyonce": "vocalist_l1_Beyonce",
-     "Bruno Mars": "vocalist_l1_BrunoMars",
-     "John Mayer": "vocalist_l1_JohnMayer",
-     "Michael Jackson": "vocalist_l1_MichaelJackson",
-     "Taylor Swift": "vocalist_l1_TaylorSwift",
-     "Jacky Cheung 张学友": "vocalist_l1_张学友",
-     "Jian Li 李健": "vocalist_l1_李健",
-     "Feng Wang 汪峰": "vocalist_l1_汪峰",
-     "Faye Wong 王菲": "vocalist_l1_王菲",
-     "Yijie Shi 石倚洁": "vocalist_l1_石倚洁",
-     "Tsai Chin 蔡琴": "vocalist_l1_蔡琴",
-     "Ying Na 那英": "vocalist_l1_那英",
-     "Eason Chan 陈奕迅": "vocalist_l1_陈奕迅",
-     "David Tao 陶喆": "vocalist_l1_陶喆",
  }
-
-
- def svc_inference(
-     source_audio_path,
-     target_singer,
-     key_shift_mode="Auto Shift",
-     key_shift_num=0,
-     diffusion_steps=1000,
- ):
-     #### Prepare source audio file ####
-     print("source_audio_path: {}".format(source_audio_path))
-     audio_file = source_audio_path.split("/")[-1]
-     audio_name = audio_file.split(".")[0]
-     source_audio_dir = source_audio_path.replace(audio_file, "")
-
-     ### Target Singer ###
-     target_singer = SUPPORTED_TARGET_SINGERS[target_singer]
-
-     ### Inference ###
-     if key_shift_mode == "Auto Shift":
-         key_shift = "autoshift"
-     else:
-         key_shift = key_shift_num
-
-     args_list = ["--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json"]
-     args_list += ["--acoustics_dir", "ckpts/svc/vocalist_l1_contentvec+whisper"]
-     args_list += ["--vocoder_dir", "pretrained/bigvgan"]
-     args_list += ["--target_singer", target_singer]
-     args_list += ["--trans_key", str(key_shift)]
-     args_list += ["--diffusion_inference_steps", str(diffusion_steps)]
-     args_list += ["--source", source_audio_dir]
-     args_list += ["--output_dir", "result"]
-     args_list += ["--log_level", "debug"]
-
-     os.environ["WORK_DIR"] = "./"
-     inference.main(args_list)
-
-     ### Display ###
-     result_file = os.path.join(
-         "result/{}/{}_{}.wav".format(audio_name, audio_name, target_singer)
-     )
-     return result_file
-
-
- with gr.Blocks() as demo:
-     gr.Markdown(
-         """
-         # Amphion Singing Voice Conversion: *DiffWaveNetSVC*
-
-         [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
-
-         This demo provides an Amphion [DiffWaveNetSVC](https://github.com/open-mmlab/Amphion/tree/main/egs/svc/MultipleContentsSVC) pretrained model for you to play. The training data has been detailed [here](https://huggingface.co/amphion/singing_voice_conversion).
-         """
-     )
-
-     gr.Markdown(
-         """
-         ## Source Audio
-         **Hint**: We recommend using dry vocals (e.g., studio recordings or source-separated voices from music) as the input for this demo. At the bottom of this page, we provide some examples for your reference.
-         """
-     )
-     source_audio_input = gr.Audio(
-         sources=["upload", "microphone"],
-         label="Source Audio",
-         type="filepath",
      )
-
-     with gr.Row():
-         with gr.Column():
-             config_target_singer = gr.Radio(
-                 choices=list(SUPPORTED_TARGET_SINGERS.keys()),
-                 label="Target Singer",
-                 value="Jian Li 李健",
              )
-             config_keyshift_choice = gr.Radio(
-                 choices=["Auto Shift", "Key Shift"],
-                 value="Auto Shift",
-                 label="Pitch Shift Control",
-                 info='If you want to control the specific pitch shift value, you need to choose "Key Shift"',
              )
-
-         # gr.Markdown("## Conversion Configurations")
-         with gr.Column():
-             config_keyshift_value = gr.Slider(
-                 -6,
-                 6,
-                 value=0,
-                 step=1,
-                 label="Key Shift Values",
-                 info='How many semitones you want to transpose. This parameter will work only if you choose "Key Shift"',
              )
-             config_diff_infer_steps = gr.Slider(
-                 1,
-                 1000,
-                 value=1000,
                  step=1,
-                 label="Diffusion Inference Steps",
-                 info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
-             )
-     btn = gr.ClearButton(
-         components=[
-             config_target_singer,
-             config_keyshift_choice,
-             config_keyshift_value,
-             config_diff_infer_steps,
-         ]
      )
-     btn = gr.Button(value="Submit", variant="primary")
-
-     gr.Markdown("## Conversion Result")
-     demo_outputs = gr.Audio(label="Conversion Result")
-
-     btn.click(
-         fn=svc_inference,
-         inputs=[
-             source_audio_input,
-             config_target_singer,
-             config_keyshift_choice,
-             config_keyshift_value,
-             config_diff_infer_steps,
-         ],
-         outputs=demo_outputs,
-     )
-
-     gr.Markdown("## Examples")
      gr.Examples(
-         examples=[
-             [
-                 "examples/chinese_female_recordings.wav",
-                 "John Mayer",
-                 "Auto Shift",
-                 1000,
-                 "examples/output/chinese_female_recordings_vocalist_l1_JohnMayer.wav",
-             ],
-             [
-                 "examples/chinese_male_seperated.wav",
-                 "Taylor Swift",
-                 "Auto Shift",
-                 1000,
-                 "examples/output/chinese_male_seperated_vocalist_l1_TaylorSwift.wav",
-             ],
-             [
-                 "examples/english_female_seperated.wav",
-                 "Feng Wang 汪峰",
-                 "Auto Shift",
-                 1000,
-                 "examples/output/english_female_seperated_vocalist_l1_汪峰.wav",
-             ],
-             [
-                 "examples/english_male_recordings.wav",
-                 "Yijie Shi 石倚洁",
-                 "Auto Shift",
-                 1000,
-                 "examples/output/english_male_recordings_vocalist_l1_石倚洁.wav",
-             ],
-         ],
-         inputs=[
-             source_audio_input,
-             config_target_singer,
-             config_keyshift_choice,
-             config_diff_infer_steps,
-             demo_outputs,
-         ],
      )

-
- if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
+ import torch
+ from diffusers import StableDiffusionXLPipeline, AutoencoderKL, KDPM2AncestralDiscreteScheduler
+ from huggingface_hub import hf_hub_download
+ import spaces
+ from PIL import Image
+ import requests
+ from translatepy import Translator
+
+ translator = Translator()
+
+ # Constants
+ model = "Corcelio/mobius"
+ vae_model = "madebyollin/sdxl-vae-fp16-fix"
+
+ CSS = """
+ .gradio-container {
+     max-width: 690px !important;
  }
+ footer {
+     visibility: hidden;
+ }
+ """
+
+ JS = """function () {
+     gradioURL = window.location.href
+     if (!gradioURL.endsWith('?__theme=dark')) {
+         window.location.replace(gradioURL + '?__theme=dark');
+     }
+ }"""
+
+ # Load VAE component
+ vae = AutoencoderKL.from_pretrained(
+     vae_model,
+     torch_dtype=torch.float16
+ )
+
+ # Ensure model and scheduler are initialized in GPU-enabled function
+ if torch.cuda.is_available():
+     pipe = StableDiffusionXLPipeline.from_pretrained(model, vae=vae, torch_dtype=torch.float16).to("cuda")
+
+     pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+
+
+ # Function
+ @spaces.GPU()
+ def generate_image(
+     prompt,
+     negative="low quality",
+     width=1024,
+     height=1024,
+     scale=1.5,
+     steps=30,
+     clip=3):
+
+     prompt = str(translator.translate(prompt, 'English'))
+
+     print(f'prompt:{prompt}')
+
+     image = pipe(
+         prompt,
+         negative_prompt=negative,
+         width=width,
+         height=height,
+         guidance_scale=scale,
+         num_inference_steps=steps,
+         clip_skip=clip,
      )
+     return image.images[0]
+
+
+ examples = [
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ํ•œ๊ตญ ์—ฌ์ž ๋ชจ๋ธ, 'ํ•œ๊ตญ ์—ฌ์ž๊ฐ€์ˆ˜ ์•„์ด์œ  ๋‹ฎ์€ ์–ผ๊ตด', ๊ฒ€์€์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๊ฐ€์ˆ˜ ์œ ๋‹ˆํผ, ๋ฐฐ๊ฒฝ ํฐ์ƒ‰, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ „์‹  ๋…ธ์ถœ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ์˜๊ตญ ์—ฌ์ž ๋ชจ๋ธ, '์— ๋งˆ์™“์Šจ ๋‹ฎ์€ ์–ผ๊ตด', ๊ธˆ๋ฐœ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, ์ด๋ธŒ๋‹ ๋“œ๋ ˆ์Šค, ๋ฐฐ๊ฒฝ ์‹œ์ƒ์‹, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ „์‹  ๋…ธ์ถœ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ํ•œ๊ตญ ์—ฌ์ž ๋ชจ๋ธ, 'ํ•œ๊ตญ ์—ฌ์ž ์•„์ด๋Œ ๋‹ฎ์€ ์–ผ๊ตด', ๊ฒ€์€์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, ๋น„ํ‚ค๋‹ˆ ์ˆ˜์˜๋ณต, ๋ฐฐ๊ฒฝ ์ˆ˜์˜์žฅ, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ „์‹  ๋…ธ์ถœ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 23์„ธ ์ค‘๊ตญ๊ตญ ์—ฌ์ž ๋ชจ๋ธ, ๊ฐˆ์ƒ‰ ๊ธด ์ƒ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ๋ฐฐ๊ฒฝ ์ŠคํŠœ๋””์˜ค, ์ง„์ง€ํ•œ ํ‘œ์ •, ์˜คํ”ผ์Šค ์œ ๋‹ˆํผ, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 18์„ธ ์ผ๋ณธ ์—ฌ์ž ๋ชจ๋ธ, ๊ฒ€์€์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, ์Šค๋งˆ์ผ ํ‘œ์ •, ๊ต๋ณต ์œ ๋‹ˆํผ, ๋ฐฐ๊ฒฝ ํ•™๊ต ๊ต์‹ค, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ๋ธŒ๋ผ์งˆ ์—ฌ์ž ๋ชจ๋ธ, ๊ฒ€์€์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๊ฐ„ํ˜ธ์‚ฌ ์œ ๋‹ˆํผ, ๋ฐฐ๊ฒฝ ํฐ์ƒ‰, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ์Šค์›จ๋ด ์—ฌ์ž ๋ชจ๋ธ, ๊ธˆ๋ฐœ ๊ธด ์ƒ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๋น„ํ‚ค๋‹ˆ ์ˆ˜์˜๋ณต, ๋ฐฐ๊ฒฝ ํ•ด๋ณ€๊ฐ€, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 18์„ธ ๋Ÿฌ์‹œ์•„ ์—ฌ์ž ๋ชจ๋ธ, ๊ธˆ๋ฐœ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๋น„ํ‚ค๋‹ˆ ์ˆ˜์˜๋ณต, ๋ฐฐ๊ฒฝ ์ˆ˜์˜์žฅ, ์—„์ˆ™ํ•œ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ํ”„๋ž‘์Šค ์—ฌ์ž ๋ชจ๋ธ, ๊ฐˆ์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๋น„์ฆˆ๋‹ˆ์Šค ์ •์žฅ, ๋ฐฐ๊ฒฝ ์‚ฌ๋ฌด์‹ค, ํฌ๊ฒŒ ์›ƒ๋Š” ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 16์„ธ ์šฐํฌ๋ผ์ด๋‚˜ ์—ฌ์ž ๋ชจ๋ธ, ๊ฐˆ์ƒ‰ ๊ธด ์ƒ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ์˜คํ”ผ์Šค ์œ ๋‹ˆํผ, ์„น์Šค ํฌ์ฆˆ, ๋ฐฐ๊ฒฝ ํ˜ธํ…”, ํ–‰๋ณตํ•œ ํ‘œ์ •, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„"
+ ]
+
+
+ # Gradio Interface
+
+ with gr.Blocks(css=CSS, js=JS, theme="soft") as demo:
+     gr.HTML("<h1><center>나만의 모델 캐릭터 생성</center></h1>")
+     with gr.Group():
+         with gr.Row():
+             prompt = gr.Textbox(label='Enter Your Prompt', value="best quality, HD, aesthetic", scale=6)
+             submit = gr.Button(scale=1, variant='primary')
+     img = gr.Image(label='Generated Image')
+     with gr.Accordion("Advanced Options", open=False):
+         with gr.Row():
+             negative = gr.Textbox(label="Negative prompt", value="low quality, low quality, (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, (mutated hands and fingers:1.4), disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation, (NSFW:1.25)")
+         with gr.Row():
+             width = gr.Slider(
+                 label="Width",
+                 minimum=512,
+                 maximum=1280,
+                 step=8,
+                 value=1024,
              )
+             height = gr.Slider(
+                 label="Height",
+                 minimum=512,
+                 maximum=1280,
+                 step=8,
+                 value=1024,
              )
+         with gr.Row():
+             scale = gr.Slider(
+                 label="Guidance",
+                 minimum=3.5,
+                 maximum=7,
+                 step=0.1,
+                 value=7,
              )
+             steps = gr.Slider(
+                 label="Steps",
+                 minimum=1,
+                 maximum=50,
                  step=1,
+                 value=50,
              )
+             clip = gr.Slider(
+                 label="Clip Skip",
+                 minimum=1,
+                 maximum=10,
+                 step=1,
+                 value=3,
+             )
      gr.Examples(
+         examples=examples,
+         inputs=prompt,
+         outputs=img,
+         fn=generate_image,
+         cache_examples="lazy",
      )

+     prompt.submit(fn=generate_image,
+                   inputs=[prompt, negative, width, height, scale, steps, clip],
+                   outputs=img,
+                   )
+     submit.click(fn=generate_image,
+                  inputs=[prompt, negative, width, height, scale, steps, clip],
+                  outputs=img,
+                  )
+
+ #demo.queue().launch()
+ demo.queue().launch(auth=("gini", "pick"))
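For reference, the core of the new app.py is a single diffusers text-to-image call. Below is a minimal standalone sketch of that call, assuming the same "Corcelio/mobius" checkpoint, fp16 VAE, and KDPM2 ancestral scheduler that appear in the diff; the prompt string is only an illustrative placeholder, and the snippet is not part of the commit itself.

import torch
from diffusers import StableDiffusionXLPipeline, AutoencoderKL, KDPM2AncestralDiscreteScheduler

# Load the fp16-safe SDXL VAE and the Mobius checkpoint, then swap in the
# KDPM2 ancestral scheduler, mirroring the setup in the new app.py.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
pipe = StableDiffusionXLPipeline.from_pretrained(
    "Corcelio/mobius", vae=vae, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)

# One generation with the same knobs the Gradio sliders expose
# (resolution, guidance scale, step count, clip skip).
image = pipe(
    "studio portrait photo, best quality, HD, aesthetic",  # placeholder prompt
    negative_prompt="low quality",
    width=1024,
    height=1024,
    guidance_scale=7.0,
    num_inference_steps=50,
    clip_skip=3,
).images[0]
image.save("result.png")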