seawolf2357 committed on
Commit 65cdab5 · verified · 1 Parent(s): 2d4dca5

Update app.py

Files changed (1)
  1. app.py +142 -183
app.py CHANGED
@@ -1,194 +1,153 @@
- # Copyright (c) 2023 Amphion.
- #
- # This source code is licensed under the MIT license found in the
- # LICENSE file in the root directory of this source tree.
-
  import gradio as gr
- import os
- import inference
-
- SUPPORTED_TARGET_SINGERS = {
-     "Adele": "vocalist_l1_Adele",
-     "Beyonce": "vocalist_l1_Beyonce",
-     "Bruno Mars": "vocalist_l1_BrunoMars",
-     "John Mayer": "vocalist_l1_JohnMayer",
-     "Michael Jackson": "vocalist_l1_MichaelJackson",
-     "Taylor Swift": "vocalist_l1_TaylorSwift",
-     "Jacky Cheung 张学友": "vocalist_l1_张学友",
-     "Jian Li 李健": "vocalist_l1_李健",
-     "Feng Wang 汪峰": "vocalist_l1_汪峰",
-     "Faye Wong 王菲": "vocalist_l1_王菲",
-     "Yijie Shi 石倚洁": "vocalist_l1_石倚洁",
-     "Tsai Chin 蔡琴": "vocalist_l1_蔡琴",
-     "Ying Na 那英": "vocalist_l1_那英",
-     "Eason Chan 陈奕迅": "vocalist_l1_陈奕迅",
-     "David Tao 陶喆": "vocalist_l1_陶喆",
  }
-
-
- def svc_inference(
-     source_audio_path,
-     target_singer,
-     key_shift_mode="Auto Shift",
-     key_shift_num=0,
-     diffusion_steps=1000,
- ):
-     #### Prepare source audio file ####
-     print("source_audio_path: {}".format(source_audio_path))
-     audio_file = source_audio_path.split("/")[-1]
-     audio_name = audio_file.split(".")[0]
-     source_audio_dir = source_audio_path.replace(audio_file, "")
-
-     ### Target Singer ###
-     target_singer = SUPPORTED_TARGET_SINGERS[target_singer]
-
-     ### Inference ###
-     if key_shift_mode == "Auto Shift":
-         key_shift = "autoshift"
-     else:
-         key_shift = key_shift_num
-
-     args_list = ["--config", "ckpts/svc/vocalist_l1_contentvec+whisper/args.json"]
-     args_list += ["--acoustics_dir", "ckpts/svc/vocalist_l1_contentvec+whisper"]
-     args_list += ["--vocoder_dir", "pretrained/bigvgan"]
-     args_list += ["--target_singer", target_singer]
-     args_list += ["--trans_key", str(key_shift)]
-     args_list += ["--diffusion_inference_steps", str(diffusion_steps)]
-     args_list += ["--source", source_audio_dir]
-     args_list += ["--output_dir", "result"]
-     args_list += ["--log_level", "debug"]
-
-     os.environ["WORK_DIR"] = "./"
-     inference.main(args_list)
-
-     ### Display ###
-     result_file = os.path.join(
-         "result/{}/{}_{}.wav".format(audio_name, audio_name, target_singer)
-     )
-     return result_file
-
-
- with gr.Blocks() as demo:
-     gr.Markdown(
-         """
-         # Amphion Singing Voice Conversion: *DiffWaveNetSVC*
-
-         [![arXiv](https://img.shields.io/badge/arXiv-Paper-<COLOR>.svg)](https://arxiv.org/abs/2310.11160)
-
-         This demo provides an Amphion [DiffWaveNetSVC](https://github.com/open-mmlab/Amphion/tree/main/egs/svc/MultipleContentsSVC) pretrained model for you to play. The training data has been detailed [here](https://huggingface.co/amphion/singing_voice_conversion).
-         """
-     )
-
-     gr.Markdown(
-         """
-         ## Source Audio
-         **Hint**: We recommend using dry vocals (e.g., studio recordings or source-separated voices from music) as the input for this demo. At the bottom of this page, we provide some examples for your reference.
-         """
-     )
-     source_audio_input = gr.Audio(
-         sources=["upload", "microphone"],
-         label="Source Audio",
-         type="filepath",
      )
-
-     with gr.Row():
-         with gr.Column():
-             config_target_singer = gr.Radio(
-                 choices=list(SUPPORTED_TARGET_SINGERS.keys()),
-                 label="Target Singer",
-                 value="Jian Li 李健",
              )
-             config_keyshift_choice = gr.Radio(
-                 choices=["Auto Shift", "Key Shift"],
-                 value="Auto Shift",
-                 label="Pitch Shift Control",
-                 info='If you want to control the specific pitch shift value, you need to choose "Key Shift"',
              )
-
-         # gr.Markdown("## Conversion Configurations")
-         with gr.Column():
-             config_keyshift_value = gr.Slider(
-                 -6,
-                 6,
-                 value=0,
-                 step=1,
-                 label="Key Shift Values",
-                 info='How many semitones you want to transpose. This parameter will work only if you choose "Key Shift"',
              )
-             config_diff_infer_steps = gr.Slider(
-                 1,
-                 1000,
-                 value=1000,
                  step=1,
-                 label="Diffusion Inference Steps",
-                 info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
-             )
-     btn = gr.ClearButton(
-         components=[
-             config_target_singer,
-             config_keyshift_choice,
-             config_keyshift_value,
-             config_diff_infer_steps,
-         ]
      )
-     btn = gr.Button(value="Submit", variant="primary")
-
-     gr.Markdown("## Conversion Result")
-     demo_outputs = gr.Audio(label="Conversion Result")
-
-     btn.click(
-         fn=svc_inference,
-         inputs=[
-             source_audio_input,
-             config_target_singer,
-             config_keyshift_choice,
-             config_keyshift_value,
-             config_diff_infer_steps,
-         ],
-         outputs=demo_outputs,
-     )
-
-     gr.Markdown("## Examples")
      gr.Examples(
-         examples=[
-             [
-                 "examples/chinese_female_recordings.wav",
-                 "John Mayer",
-                 "Auto Shift",
-                 1000,
-                 "examples/output/chinese_female_recordings_vocalist_l1_JohnMayer.wav",
-             ],
-             [
-                 "examples/chinese_male_seperated.wav",
-                 "Taylor Swift",
-                 "Auto Shift",
-                 1000,
-                 "examples/output/chinese_male_seperated_vocalist_l1_TaylorSwift.wav",
-             ],
-             [
-                 "examples/english_female_seperated.wav",
-                 "Feng Wang 汪峰",
-                 "Auto Shift",
-                 1000,
-                 "examples/output/english_female_seperated_vocalist_l1_汪峰.wav",
-             ],
-             [
-                 "examples/english_male_recordings.wav",
-                 "Yijie Shi 石倚洁",
-                 "Auto Shift",
-                 1000,
-                 "examples/output/english_male_recordings_vocalist_l1_石倚洁.wav",
-             ],
-         ],
-         inputs=[
-             source_audio_input,
-             config_target_singer,
-             config_keyshift_choice,
-             config_diff_infer_steps,
-             demo_outputs,
-         ],
      )

-
- if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
+ import torch
+ from diffusers import StableDiffusionXLPipeline, AutoencoderKL, KDPM2AncestralDiscreteScheduler
+ from huggingface_hub import hf_hub_download
+ import spaces
+ from PIL import Image
+ import requests
+ from translatepy import Translator
+
+ translator = Translator()
+
+ # Constants
+ model = "Corcelio/mobius"
+ vae_model = "madebyollin/sdxl-vae-fp16-fix"
+
+ CSS = """
+ .gradio-container {
+     max-width: 690px !important;
  }
+ footer {
+     visibility: hidden;
+ }
+ """
+
+ JS = """function () {
+     gradioURL = window.location.href
+     if (!gradioURL.endsWith('?__theme=dark')) {
+         window.location.replace(gradioURL + '?__theme=dark');
+     }
+ }"""
+
+ # Load VAE component
+ vae = AutoencoderKL.from_pretrained(
+     vae_model,
+     torch_dtype=torch.float16
+ )
+
+ # Ensure model and scheduler are initialized in GPU-enabled function
+ if torch.cuda.is_available():
+     pipe = StableDiffusionXLPipeline.from_pretrained(model, vae=vae, torch_dtype=torch.float16).to("cuda")
+
+     pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+
+
+ # Function
+ @spaces.GPU()
+ def generate_image(
+     prompt,
+     negative="low quality",
+     width=1024,
+     height=1024,
+     scale=1.5,
+     steps=30,
+     clip=3):
+
+     prompt = str(translator.translate(prompt, 'English'))
+
+     print(f'prompt:{prompt}')
+
+     image = pipe(
+         prompt,
+         negative_prompt=negative,
+         width=width,
+         height=height,
+         guidance_scale=scale,
+         num_inference_steps=steps,
+         clip_skip=clip,
      )
+     return image.images[0]
+
+
+ examples = [
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ํ•œ๊ตญ ์—ฌ์ž ๋ชจ๋ธ, 'ํ•œ๊ตญ ์—ฌ์ž๊ฐ€์ˆ˜ ์•„์ด์œ  ๋‹ฎ์€ ์–ผ๊ตด', ๊ฒ€์€์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๊ฐ€์ˆ˜ ์œ ๋‹ˆํผ, ๋ฐฐ๊ฒฝ ํฐ์ƒ‰, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ „์‹  ๋…ธ์ถœ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ์˜๊ตญ ์—ฌ์ž ๋ชจ๋ธ, '์— ๋งˆ์™“์Šจ ๋‹ฎ์€ ์–ผ๊ตด', ๊ธˆ๋ฐœ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, ์ด๋ธŒ๋‹ ๋“œ๋ ˆ์Šค, ๋ฐฐ๊ฒฝ ์‹œ์ƒ์‹, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ „์‹  ๋…ธ์ถœ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ํ•œ๊ตญ ์—ฌ์ž ๋ชจ๋ธ, 'ํ•œ๊ตญ ์—ฌ์ž ์•„์ด๋Œ ๋‹ฎ์€ ์–ผ๊ตด', ๊ฒ€์€์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, ๋น„ํ‚ค๋‹ˆ ์ˆ˜์˜๋ณต, ๋ฐฐ๊ฒฝ ์ˆ˜์˜์žฅ, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ „์‹  ๋…ธ์ถœ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 23์„ธ ์ค‘๊ตญ๊ตญ ์—ฌ์ž ๋ชจ๋ธ, ๊ฐˆ์ƒ‰ ๊ธด ์ƒ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ๋ฐฐ๊ฒฝ ์ŠคํŠœ๋””์˜ค, ์ง„์ง€ํ•œ ํ‘œ์ •, ์˜คํ”ผ์Šค ์œ ๋‹ˆํผ, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 18์„ธ ์ผ๋ณธ ์—ฌ์ž ๋ชจ๋ธ, ๊ฒ€์€์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, ์Šค๋งˆ์ผ ํ‘œ์ •, ๊ต๋ณต ์œ ๋‹ˆํผ, ๋ฐฐ๊ฒฝ ํ•™๊ต ๊ต์‹ค, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ๋ธŒ๋ผ์งˆ ์—ฌ์ž ๋ชจ๋ธ, ๊ฒ€์€์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๊ฐ„ํ˜ธ์‚ฌ ์œ ๋‹ˆํผ, ๋ฐฐ๊ฒฝ ํฐ์ƒ‰, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ์Šค์›จ๋ด ์—ฌ์ž ๋ชจ๋ธ, ๊ธˆ๋ฐœ ๊ธด ์ƒ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๋น„ํ‚ค๋‹ˆ ์ˆ˜์˜๋ณต, ๋ฐฐ๊ฒฝ ํ•ด๋ณ€๊ฐ€, ์Šค๋งˆ์ผ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 18์„ธ ๋Ÿฌ์‹œ์•„ ์—ฌ์ž ๋ชจ๋ธ, ๊ธˆ๋ฐœ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๋น„ํ‚ค๋‹ˆ ์ˆ˜์˜๋ณต, ๋ฐฐ๊ฒฝ ์ˆ˜์˜์žฅ, ์—„์ˆ™ํ•œ ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 20์„ธ ํ”„๋ž‘์Šค ์—ฌ์ž ๋ชจ๋ธ, ๊ฐˆ์ƒ‰ ์งง์€ ๋‹จ๋ฐœ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ๋น„์ฆˆ๋‹ˆ์Šค ์ •์žฅ, ๋ฐฐ๊ฒฝ ์‚ฌ๋ฌด์‹ค, ํฌ๊ฒŒ ์›ƒ๋Š” ํ‘œ์ •, ๋ชจ๋ธ ํฌ์ฆˆ, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„",
+ "์•„๋ฆ„๋‹ค์šด 16์„ธ ์šฐํฌ๋ผ์ด๋‚˜ ์—ฌ์ž ๋ชจ๋ธ, ๊ฐˆ์ƒ‰ ๊ธด ์ƒ๋จธ๋ฆฌ, C์ปต ์‚ฌ์ด์ฆˆ์˜ ํฐ ๊ฐ€์Šด, ํฐ ๊ณจ๋ฐ˜, ์˜คํ”ผ์Šค ์œ ๋‹ˆํผ, ์„น์Šค ํฌ์ฆˆ, ๋ฐฐ๊ฒฝ ํ˜ธํ…”, ํ–‰๋ณตํ•œ ํ‘œ์ •, ์ •๋ฉด ์‘์‹œ, ์ดˆ๊ณ ํ•ด์ƒ๋„ ์‚ฌ์ง„"
+ ]
+
+
+ # Gradio Interface
+
+ with gr.Blocks(css=CSS, js=JS, theme="soft") as demo:
+     gr.HTML("<h1><center>나만의 모델 캐릭터 생성</center></h1>")
+     with gr.Group():
+         with gr.Row():
+             prompt = gr.Textbox(label='Enter Your Prompt', value="best quality, HD, aesthetic", scale=6)
+             submit = gr.Button(scale=1, variant='primary')
+     img = gr.Image(label='Generated Image')
+     with gr.Accordion("Advanced Options", open=False):
+         with gr.Row():
+             negative = gr.Textbox(label="Negative prompt", value="low quality, low quality, (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, (mutated hands and fingers:1.4), disconnected limbs, mutation, mutated, ugly, disgusting, blurry, amputation, (NSFW:1.25)")
+         with gr.Row():
+             width = gr.Slider(
+                 label="Width",
+                 minimum=512,
+                 maximum=1280,
+                 step=8,
+                 value=1024,
              )
+             height = gr.Slider(
+                 label="Height",
+                 minimum=512,
+                 maximum=1280,
+                 step=8,
+                 value=1024,
              )
+         with gr.Row():
+             scale = gr.Slider(
+                 label="Guidance",
+                 minimum=3.5,
+                 maximum=7,
+                 step=0.1,
+                 value=7,
              )
+             steps = gr.Slider(
+                 label="Steps",
+                 minimum=1,
+                 maximum=50,
                  step=1,
+                 value=50,
              )
+             clip = gr.Slider(
+                 label="Clip Skip",
+                 minimum=1,
+                 maximum=10,
+                 step=1,
+                 value=3,
+             )
      gr.Examples(
+         examples=examples,
+         inputs=prompt,
+         outputs=img,
+         fn=generate_image,
+         cache_examples="lazy",
      )

+     prompt.submit(fn=generate_image,
+                   inputs=[prompt, negative, width, height, scale, steps, clip],
+                   outputs=img,
+                   )
+     submit.click(fn=generate_image,
+                  inputs=[prompt, negative, width, height, scale, steps, clip],
+                  outputs=img,
+                  )
+
+ #demo.queue().launch()
+ demo.queue().launch(auth=("gini", "pick"))
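For reference, the core of the new app.py is a single diffusers text-to-image call. Below is a minimal standalone sketch of that call, assuming the same "Corcelio/mobius" checkpoint, fp16 VAE, and KDPM2 ancestral scheduler that appear in the diff; the prompt string is only an illustrative placeholder, and the snippet is not part of the commit itself.

import torch
from diffusers import StableDiffusionXLPipeline, AutoencoderKL, KDPM2AncestralDiscreteScheduler

# Load the fp16-safe SDXL VAE and the Mobius checkpoint, then swap in the
# KDPM2 ancestral scheduler, mirroring the setup in the new app.py.
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
pipe = StableDiffusionXLPipeline.from_pretrained(
    "Corcelio/mobius", vae=vae, torch_dtype=torch.float16
).to("cuda")
pipe.scheduler = KDPM2AncestralDiscreteScheduler.from_config(pipe.scheduler.config)

# One generation with the same knobs the Gradio sliders expose
# (resolution, guidance scale, step count, clip skip).
image = pipe(
    "studio portrait photo, best quality, HD, aesthetic",  # placeholder prompt
    negative_prompt="low quality",
    width=1024,
    height=1024,
    guidance_scale=7.0,
    num_inference_steps=50,
    clip_skip=3,
).images[0]
image.save("result.png")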