Spaces:
Running
Running
update
Browse files- examples/batch_audio_fmt_convert.py +6 -2
- examples/clone/voice_clone.py +58 -0
- examples/concat/batch_concat_three_and_adapt_volume.py +54 -0
- examples/concat/concat_three_and_adapt_volume.py +85 -0
- examples/concat/test1.py +0 -48
- examples/concat/test2.py +0 -51
- main.py +2 -2
- toolbox/audio_edit/convert.py +4 -0
examples/batch_audio_fmt_convert.py
CHANGED
@@ -5,6 +5,8 @@ import argparse
|
|
5 |
import librosa
|
6 |
import numpy as np
|
7 |
from pathlib import Path
|
|
|
|
|
8 |
from scipy.io import wavfile
|
9 |
from tqdm import tqdm
|
10 |
|
@@ -15,12 +17,14 @@ def get_args():
|
|
15 |
parser = argparse.ArgumentParser()
|
16 |
parser.add_argument(
|
17 |
"--audio_dir",
|
18 |
-
default=(project_path / "data/yd").as_posix(),
|
|
|
19 |
type=str,
|
20 |
)
|
21 |
parser.add_argument(
|
22 |
"--output_dir",
|
23 |
-
default=(project_path / "data/temp_wav").as_posix(),
|
|
|
24 |
type=str,
|
25 |
)
|
26 |
args = parser.parse_args()
|
|
|
5 |
import librosa
|
6 |
import numpy as np
|
7 |
from pathlib import Path
|
8 |
+
|
9 |
+
from fsspec.registry import default
|
10 |
from scipy.io import wavfile
|
11 |
from tqdm import tqdm
|
12 |
|
|
|
17 |
parser = argparse.ArgumentParser()
|
18 |
parser.add_argument(
|
19 |
"--audio_dir",
|
20 |
+
# default=(project_path / "data/yd").as_posix(),
|
21 |
+
default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾",
|
22 |
type=str,
|
23 |
)
|
24 |
parser.add_argument(
|
25 |
"--output_dir",
|
26 |
+
# default=(project_path / "data/temp_wav").as_posix(),
|
27 |
+
default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾",
|
28 |
type=str,
|
29 |
)
|
30 |
args = parser.parse_args()
|
examples/clone/voice_clone.py
ADDED
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import shutil
|
5 |
+
|
6 |
+
from gradio_client import Client, handle_file
|
7 |
+
|
8 |
+
|
9 |
+
def get_args(argv=None):
    """Parse the command-line options of the voice-cloning example.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) keeps the original
            behaviour of reading ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``text``, ``reference`` and
        ``output_file``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        default="吴家豪",
        type=str,
        help="Text sent to the synthesis endpoint.",
    )
    parser.add_argument(
        "--reference",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_3.wav",
        type=str,
        help="Path of the reference audio sent to the synthesis endpoint.",
    )
    parser.add_argument(
        "--output_file",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2.wav",
        type=str,
        help="Path the synthesised audio file is moved to.",
    )
    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    args = parser.parse_args(argv)
    return args
|
28 |
+
|
29 |
+
|
30 |
+
def main():
    """Send the text and reference audio to the remote Space and keep the result.

    The command-line options are read with ``get_args``. The endpoint selected
    by ``fn_index=1`` is called positionally, and the second of its four
    return values is moved to ``output_file``.
    """
    options = get_args()

    xtts_client = Client("https://coqui-xtts.hf.space/--replicas/fib73/")

    # Positional inputs of the endpoint. The language code is fixed to
    # "zh-cn" (other codes tried earlier here: en, es, ja, pt, ko, ar) and the
    # reference path is supplied twice. The meaning of the four trailing
    # booleans is defined by the remote app and is not visible in this script.
    endpoint_inputs = (
        options.text,
        "zh-cn",
        options.reference,
        options.reference,
        False, False, True, True,
    )
    prediction = xtts_client.predict(*endpoint_inputs, fn_index=1)

    # Exactly four values are expected; only the second one is used.
    _, synthesised_audio, _, _ = prediction

    shutil.move(synthesised_audio, options.output_file)
    return
|
55 |
+
|
56 |
+
|
57 |
+
if __name__ == '__main__':
|
58 |
+
main()
|
examples/concat/batch_concat_three_and_adapt_volume.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
|
5 |
+
from gradio_client import Client, handle_file
|
6 |
+
import numpy as np
|
7 |
+
from scipy.io import wavfile
|
8 |
+
|
9 |
+
|
10 |
+
def get_args(argv=None):
    """Parse the command-line options of the batch concatenation example.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) keeps the original
            behaviour of reading ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``filename1``, ``filename2``
        and ``filename3``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename1",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\英语\English_1.wav",
        type=str,
        help="First WAV file of the concatenation.",
    )
    parser.add_argument(
        "--filename2",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\英语\xtts_v2_english_2.wav",
        type=str,
        help="Second WAV file; its volume is adapted before concatenation.",
    )
    parser.add_argument(
        "--filename3",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\英语\English_3.wav",
        type=str,
        help="Third WAV file; also the volume reference for the second one.",
    )
    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    args = parser.parse_args(argv)
    return args
|
29 |
+
|
30 |
+
|
31 |
+
def main():
    """Adapt the volume of the second clip and concatenate the three clips.

    ``filename2`` is sent to the ``/when_click_change_volume`` endpoint of the
    audio-edit service with ``filename3`` as reference. The three signals are
    then read with ``scipy.io.wavfile`` and joined along the sample axis.
    Only the shape of the joined signal is printed; nothing is written.
    """
    args = get_args()

    client = Client("http://10.75.27.247:7861/")
    new_filename2, _ = client.predict(
        audio_t=handle_file(args.filename2),
        radio=1,
        decibel=0,
        reference=handle_file(args.filename3),
        engine="by_ffmpy_by_db",
        api_name="/when_click_change_volume"
    )

    _, signal1 = wavfile.read(args.filename1)
    _, signal2 = wavfile.read(new_filename2)
    _, signal3 = wavfile.read(args.filename3)

    # np.concatenate exists in every NumPy release, whereas the np.concat
    # alias is only available from NumPy 2.0 on.
    signal = np.concatenate([signal1, signal2, signal3], axis=0)
    print(signal.shape)
    return
|
51 |
+
|
52 |
+
|
53 |
+
if __name__ == '__main__':
|
54 |
+
main()
|
examples/concat/concat_three_and_adapt_volume.py
ADDED
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
import argparse
|
4 |
+
import shutil
|
5 |
+
|
6 |
+
from gradio_client import Client, handle_file
|
7 |
+
import numpy as np
|
8 |
+
from scipy.io import wavfile
|
9 |
+
|
10 |
+
|
11 |
+
def get_args(argv=None):
    """Parse the command-line options of the concatenation example.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) keeps the original
            behaviour of reading ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the attributes ``filename1``, ``filename2``,
        ``filename3``, ``output_adapt_file`` and ``output_concat_file``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename1",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_1.wav",
        type=str,
        help="First WAV file of the concatenation.",
    )
    parser.add_argument(
        "--filename2",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2.wav",
        type=str,
        help="Second WAV file; it is volume-adapted and converted first.",
    )
    parser.add_argument(
        "--filename3",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_3.wav",
        type=str,
        help="Third WAV file; also the volume reference for the second one.",
    )
    parser.add_argument(
        "--output_adapt_file",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2_volume_adapt.wav",
        type=str,
        help="Path the adapted and converted second clip is moved to.",
    )
    parser.add_argument(
        "--output_concat_file",
        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2_concat.wav",
        type=str,
        help="Path the concatenated WAV file is written to.",
    )
    # parse_args(None) falls back to sys.argv[1:], so existing callers are unaffected.
    args = parser.parse_args(argv)
    return args
|
40 |
+
|
41 |
+
|
42 |
+
def main():
    """Adapt and convert the second clip, then write the three-clip concatenation.

    Steps:
      1. ``filename2`` is volume-adapted by the ``/when_click_change_volume``
         endpoint of the audio-edit service, with ``filename3`` as reference.
      2. The adapted file is converted by ``/when_click_audio_convert`` to the
         target sample rate with ``sample_width=2``.
      3. The three signals are read, joined along the sample axis and written
         to ``output_concat_file``; the converted second clip is moved to
         ``output_adapt_file``.

    Raises:
        ValueError: if any of the three WAV files does not have the target
            sample rate. The output is written at that rate, so a mismatch
            would otherwise silently change the playback speed.
    """
    args = get_args()

    # Single source of truth for the conversion request, the validation and
    # the rate written into the output file.
    target_sample_rate = 8000

    # Local instance of the audio-edit service (a remote instance at
    # 10.75.27.247:7861 was used here before).
    client = Client("http://127.0.0.1:7861/")

    new_filename2, _ = client.predict(
        audio_t=handle_file(args.filename2),
        radio=1,
        decibel=0,
        reference=handle_file(args.filename3),
        engine="by_pydub_by_reference",
        api_name="/when_click_change_volume"
    )

    new_filename2, _, _, _ = client.predict(
        audio_t=handle_file(new_filename2),
        to_sample_rate=target_sample_rate,
        sample_width=2,
        channels="0",
        engine="librosa",
        api_name="/when_click_audio_convert"
    )

    signals = []
    for filename in (args.filename1, new_filename2, args.filename3):
        sample_rate, samples = wavfile.read(filename)
        if sample_rate != target_sample_rate:
            raise ValueError(
                f"{filename}: sample rate {sample_rate} Hz does not match "
                f"the output rate {target_sample_rate} Hz"
            )
        signals.append(samples)

    # np.concatenate exists in every NumPy release, whereas the np.concat
    # alias is only available from NumPy 2.0 on.
    signal = np.concatenate(signals, axis=0)

    shutil.move(
        new_filename2,
        args.output_adapt_file
    )
    wavfile.write(
        args.output_concat_file,
        target_sample_rate,
        signal,
    )
    return
|
82 |
+
|
83 |
+
|
84 |
+
if __name__ == '__main__':
|
85 |
+
main()
|
examples/concat/test1.py
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
#!/usr/bin/python3
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
import argparse
|
4 |
-
|
5 |
-
import os
|
6 |
-
from ffmpy import FFmpeg
|
7 |
-
|
8 |
-
|
9 |
-
def get_args():
|
10 |
-
parser = argparse.ArgumentParser()
|
11 |
-
parser.add_argument("--change_by_db", default=-11, type=int)
|
12 |
-
args = parser.parse_args()
|
13 |
-
return args
|
14 |
-
|
15 |
-
|
16 |
-
def change_by_decibel(audio_path: str, output_file: str, decibel):
|
17 |
-
ext = os.path.basename(audio_path).strip().split(".")[-1]
|
18 |
-
if ext not in ["wav", "mp3"]:
|
19 |
-
raise Exception("format error")
|
20 |
-
if os.path.exists(output_file):
|
21 |
-
os.remove(output_file)
|
22 |
-
ff = FFmpeg(
|
23 |
-
inputs={audio_path: None},
|
24 |
-
outputs={output_file: f'-filter:a "volume={decibel}dB"'}
|
25 |
-
)
|
26 |
-
ff.run()
|
27 |
-
return output_file
|
28 |
-
|
29 |
-
|
30 |
-
def main():
|
31 |
-
args = get_args()
|
32 |
-
|
33 |
-
for i in range(10):
|
34 |
-
filename = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2.wav".format(i)
|
35 |
-
output_file = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_volume.wav".format(i)
|
36 |
-
|
37 |
-
output_file = change_by_decibel(
|
38 |
-
filename,
|
39 |
-
output_file,
|
40 |
-
args.change_by_db,
|
41 |
-
)
|
42 |
-
# print(f"output_file: {output_file}")
|
43 |
-
|
44 |
-
return
|
45 |
-
|
46 |
-
|
47 |
-
if __name__ == "__main__":
|
48 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
examples/concat/test2.py
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
#!/usr/bin/python3
|
2 |
-
# -*- coding: utf-8 -*-
|
3 |
-
import librosa
|
4 |
-
import numpy as np
|
5 |
-
from scipy.io import wavfile
|
6 |
-
|
7 |
-
|
8 |
-
for i in range(10):
|
9 |
-
filename1 = r"C:\Users\tianx\Desktop\Audio\x_tts_v2\audio_0_section_1.wav"
|
10 |
-
filename2 = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_volume.wav".format(i)
|
11 |
-
filename3 = r"C:\Users\tianx\Desktop\Audio\x_tts_v2\audio_0_section_2.wav"
|
12 |
-
|
13 |
-
output_filename = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_concat_volume.wav".format(i)
|
14 |
-
|
15 |
-
signal1, sample_rate = librosa.load(filename1, sr=8000)
|
16 |
-
|
17 |
-
print(sample_rate)
|
18 |
-
print(signal1.dtype)
|
19 |
-
print(signal1.shape)
|
20 |
-
|
21 |
-
signal2, sample_rate = librosa.load(filename2, sr=8000)
|
22 |
-
|
23 |
-
print(sample_rate)
|
24 |
-
print(signal2.dtype)
|
25 |
-
print(signal2.shape)
|
26 |
-
|
27 |
-
signal3, sample_rate = librosa.load(filename3, sr=8000)
|
28 |
-
|
29 |
-
print(sample_rate)
|
30 |
-
print(signal3.dtype)
|
31 |
-
print(signal3.shape)
|
32 |
-
|
33 |
-
signal = np.concatenate([signal1, signal2, signal3], dtype=np.float32)
|
34 |
-
print(signal.dtype)
|
35 |
-
print(signal.shape)
|
36 |
-
|
37 |
-
max_wave_value = 32768.0
|
38 |
-
signal *= max_wave_value
|
39 |
-
signal = np.array(signal, dtype=np.int16)
|
40 |
-
print(signal.dtype)
|
41 |
-
print(sample_rate)
|
42 |
-
|
43 |
-
wavfile.write(
|
44 |
-
output_filename,
|
45 |
-
8000,
|
46 |
-
signal,
|
47 |
-
)
|
48 |
-
|
49 |
-
|
50 |
-
if __name__ == '__main__':
|
51 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
main.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1 |
#!/usr/bin/python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
"""
|
4 |
-
docker build -t audio_edit:
|
5 |
|
6 |
docker run -itd \
|
7 |
--name audio_edit_7861 \
|
8 |
--restart=always \
|
9 |
--network host \
|
10 |
-e port=7861 \
|
11 |
-
audio_edit:
|
12 |
"""
|
13 |
import argparse
|
14 |
import json
|
|
|
1 |
#!/usr/bin/python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
"""
|
4 |
+
docker build -t audio_edit:v20250116_1917 .
|
5 |
|
6 |
docker run -itd \
|
7 |
--name audio_edit_7861 \
|
8 |
--restart=always \
|
9 |
--network host \
|
10 |
-e port=7861 \
|
11 |
+
audio_edit:v20250116_1917
|
12 |
"""
|
13 |
import argparse
|
14 |
import json
|
toolbox/audio_edit/convert.py
CHANGED
@@ -55,6 +55,10 @@ def audio_convert_by_librosa(filename: str,
|
|
55 |
signal = np.concatenate(signal_, axis=-1)
|
56 |
|
57 |
if sample_width == 2:
|
|
|
|
|
|
|
|
|
58 |
max_wave_value = 32768.0
|
59 |
signal *= max_wave_value
|
60 |
signal = np.array(signal, dtype=np.int16)
|
|
|
55 |
signal = np.concatenate(signal_, axis=-1)
|
56 |
|
57 |
if sample_width == 2:
|
58 |
+
scale = np.max([np.abs(np.max(signal)), np.abs(np.min(signal))])
|
59 |
+
if scale > 1:
|
60 |
+
signal /= scale
|
61 |
+
|
62 |
max_wave_value = 32768.0
|
63 |
signal *= max_wave_value
|
64 |
signal = np.array(signal, dtype=np.int16)
|