Spaces:
Running
Running
update
Browse files- .gitignore +1 -0
- Dockerfile +3 -1
- examples/batch_audio_fmt_convert.py +2 -2
- examples/clone/voice_clone.py +26 -6
- examples/concat/concat_three_and_adapt_volume.py +21 -5
- install.sh +62 -0
- main.py +15 -0
- toolbox/audio_edit/reverb.py +63 -3
.gitignore
CHANGED
@@ -3,6 +3,7 @@
|
|
3 |
.idea/
|
4 |
|
5 |
#/data/
|
|
|
6 |
/dotenv/
|
7 |
/logs/
|
8 |
/trained_models
|
|
|
3 |
.idea/
|
4 |
|
5 |
#/data/
|
6 |
+
/data/impulse_responses
|
7 |
/dotenv/
|
8 |
/logs/
|
9 |
/trained_models
|
Dockerfile
CHANGED
@@ -5,11 +5,13 @@ WORKDIR /code
|
|
5 |
COPY . /code
|
6 |
|
7 |
RUN apt-get update
|
8 |
-
RUN apt-get install -y ffmpeg build-essential
|
9 |
|
10 |
RUN pip install --upgrade pip
|
11 |
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
12 |
|
|
|
|
|
13 |
RUN useradd -m -u 1000 user
|
14 |
|
15 |
USER user
|
|
|
5 |
COPY . /code
|
6 |
|
7 |
RUN apt-get update
|
8 |
+
RUN apt-get install -y wget ffmpeg build-essential
|
9 |
|
10 |
RUN pip install --upgrade pip
|
11 |
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
|
12 |
|
13 |
+
RUN bash install.sh --stage 1 --stop_stage 1 --system_version ubuntu
|
14 |
+
|
15 |
RUN useradd -m -u 1000 user
|
16 |
|
17 |
USER user
|
examples/batch_audio_fmt_convert.py
CHANGED
@@ -18,13 +18,13 @@ def get_args():
|
|
18 |
parser.add_argument(
|
19 |
"--audio_dir",
|
20 |
# default=(project_path / "data/yd").as_posix(),
|
21 |
-
default=r"E:\牛信文档\语音克隆\多语种语音克隆\
|
22 |
type=str,
|
23 |
)
|
24 |
parser.add_argument(
|
25 |
"--output_dir",
|
26 |
# default=(project_path / "data/temp_wav").as_posix(),
|
27 |
-
default=r"E:\牛信文档\语音克隆\多语种语音克隆\
|
28 |
type=str,
|
29 |
)
|
30 |
args = parser.parse_args()
|
|
|
18 |
parser.add_argument(
|
19 |
"--audio_dir",
|
20 |
# default=(project_path / "data/yd").as_posix(),
|
21 |
+
default=r"E:\牛信文档\语音克隆\多语种语音克隆\money_char",
|
22 |
type=str,
|
23 |
)
|
24 |
parser.add_argument(
|
25 |
"--output_dir",
|
26 |
# default=(project_path / "data/temp_wav").as_posix(),
|
27 |
+
default=r"E:\牛信文档\语音克隆\多语种语音克隆\money_char",
|
28 |
type=str,
|
29 |
)
|
30 |
args = parser.parse_args()
|
examples/clone/voice_clone.py
CHANGED
@@ -6,21 +6,41 @@ import shutil
|
|
6 |
from gradio_client import Client, handle_file
|
7 |
|
8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
def get_args():
|
10 |
parser = argparse.ArgumentParser()
|
11 |
parser.add_argument(
|
12 |
"--text",
|
13 |
-
default="
|
|
|
|
|
|
|
|
|
14 |
type=str,
|
15 |
)
|
16 |
parser.add_argument(
|
17 |
"--reference",
|
18 |
-
default=
|
19 |
type=str,
|
20 |
)
|
21 |
parser.add_argument(
|
22 |
"--output_file",
|
23 |
-
default=
|
24 |
type=str,
|
25 |
)
|
26 |
args = parser.parse_args()
|
@@ -30,17 +50,17 @@ def get_args():
|
|
30 |
def main():
|
31 |
args = get_args()
|
32 |
|
33 |
-
client = Client("https://coqui-xtts.hf.space/--replicas/
|
34 |
|
35 |
_, synthesised_audio, _, _ = client.predict(
|
36 |
args.text,
|
37 |
# "en",
|
38 |
# "es",
|
39 |
-
|
40 |
# "pt",
|
41 |
# "ko",
|
42 |
# "ar",
|
43 |
-
"zh-cn",
|
44 |
args.reference,
|
45 |
args.reference,
|
46 |
False, False, True, True,
|
|
|
6 |
from gradio_client import Client, handle_file
|
7 |
|
8 |
|
9 |
+
# language1 = "英语"
|
10 |
+
# language2 = "English"
|
11 |
+
# language1 = "西班牙语"
|
12 |
+
# language2 = "Spanish"
|
13 |
+
language1 = "日语"
|
14 |
+
language2 = "Japanese"
|
15 |
+
# language1 = "葡萄牙语"
|
16 |
+
# language2 = "Portuguese"
|
17 |
+
# language1 = "韩语"
|
18 |
+
# language2 = "Korean"
|
19 |
+
# language1 = "阿拉伯语"
|
20 |
+
# language2 = "Arabic"
|
21 |
+
# language1 = "中国台湾"
|
22 |
+
# language2 = "Chinese"
|
23 |
+
|
24 |
+
|
25 |
def get_args():
|
26 |
parser = argparse.ArgumentParser()
|
27 |
parser.add_argument(
|
28 |
"--text",
|
29 |
+
# default="thirty-three dollars and seventy-two cents",
|
30 |
+
# default="treinta y tres euros con setenta y dos céntimos",
|
31 |
+
default="33.72円",
|
32 |
+
# default="33.72 درهم",
|
33 |
+
# default="三十三元七角两分",
|
34 |
type=str,
|
35 |
)
|
36 |
parser.add_argument(
|
37 |
"--reference",
|
38 |
+
default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\{language2}_1.wav",
|
39 |
type=str,
|
40 |
)
|
41 |
parser.add_argument(
|
42 |
"--output_file",
|
43 |
+
default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\xtts_v2_{language2.lower()}_2.wav",
|
44 |
type=str,
|
45 |
)
|
46 |
args = parser.parse_args()
|
|
|
50 |
def main():
|
51 |
args = get_args()
|
52 |
|
53 |
+
client = Client("https://coqui-xtts.hf.space/--replicas/o7bhl/")
|
54 |
|
55 |
_, synthesised_audio, _, _ = client.predict(
|
56 |
args.text,
|
57 |
# "en",
|
58 |
# "es",
|
59 |
+
"ja",
|
60 |
# "pt",
|
61 |
# "ko",
|
62 |
# "ar",
|
63 |
+
# "zh-cn",
|
64 |
args.reference,
|
65 |
args.reference,
|
66 |
False, False, True, True,
|
examples/concat/concat_three_and_adapt_volume.py
CHANGED
@@ -8,31 +8,47 @@ import numpy as np
|
|
8 |
from scipy.io import wavfile
|
9 |
|
10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
def get_args():
|
12 |
parser = argparse.ArgumentParser()
|
13 |
parser.add_argument(
|
14 |
"--filename1",
|
15 |
-
default=
|
16 |
type=str,
|
17 |
)
|
18 |
parser.add_argument(
|
19 |
"--filename2",
|
20 |
-
default=
|
21 |
type=str,
|
22 |
)
|
23 |
parser.add_argument(
|
24 |
"--filename3",
|
25 |
-
default=
|
26 |
type=str,
|
27 |
)
|
28 |
parser.add_argument(
|
29 |
"--output_adapt_file",
|
30 |
-
default=
|
31 |
type=str,
|
32 |
)
|
33 |
parser.add_argument(
|
34 |
"--output_concat_file",
|
35 |
-
default=
|
36 |
type=str,
|
37 |
)
|
38 |
args = parser.parse_args()
|
|
|
8 |
from scipy.io import wavfile
|
9 |
|
10 |
|
11 |
+
# language1 = "英语"
|
12 |
+
# language2 = "English"
|
13 |
+
# language1 = "西班牙语"
|
14 |
+
# language2 = "Spanish"
|
15 |
+
language1 = "日语"
|
16 |
+
language2 = "Japanese"
|
17 |
+
# language1 = "葡萄牙语"
|
18 |
+
# language2 = "Portuguese"
|
19 |
+
# language1 = "韩语"
|
20 |
+
# language2 = "Korean"
|
21 |
+
# language1 = "阿拉伯语"
|
22 |
+
# language2 = "Arabic"
|
23 |
+
# language1 = "中国台湾"
|
24 |
+
# language2 = "Chinese"
|
25 |
+
|
26 |
+
|
27 |
def get_args():
|
28 |
parser = argparse.ArgumentParser()
|
29 |
parser.add_argument(
|
30 |
"--filename1",
|
31 |
+
default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\{language2}_1.wav",
|
32 |
type=str,
|
33 |
)
|
34 |
parser.add_argument(
|
35 |
"--filename2",
|
36 |
+
default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\xtts_v2_{language2.lower()}_2.wav",
|
37 |
type=str,
|
38 |
)
|
39 |
parser.add_argument(
|
40 |
"--filename3",
|
41 |
+
default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\{language2}_3.wav",
|
42 |
type=str,
|
43 |
)
|
44 |
parser.add_argument(
|
45 |
"--output_adapt_file",
|
46 |
+
default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\xtts_v2_{language2.lower()}_2_volume_adapt.wav",
|
47 |
type=str,
|
48 |
)
|
49 |
parser.add_argument(
|
50 |
"--output_concat_file",
|
51 |
+
default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\xtts_v2_{language2.lower()}_2_concat.wav",
|
52 |
type=str,
|
53 |
)
|
54 |
args = parser.parse_args()
|
install.sh
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env bash

# Download the room impulse response archives used by the reverb tools into
# ./data/impulse_responses (relative to the directory the script is run from).
#
# usage:
# bash install.sh --stage 1 --stop_stage 1 --system_version windows


system_version="centos";
verbose=true;
stage=-1
stop_stage=0


# parse options
# Every "--name value" pair overrides the variable called "name" declared above
# (dashes inside the option name are mapped to underscores).
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
      # Reject options that do not correspond to an already-declared variable.
      eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      # Every option takes a value. Without this guard "shift 2" below fails
      # when the value is missing, nothing is shifted and the loop never ends.
      if [ $# -lt 2 ]; then
        echo "$0: option $1 requires a value" 1>&2
        exit 1;
      fi

      # Indirect expansion reads the current value of the variable named by
      # $name; it decides whether the option is expected to be a Boolean.
      old_value="${!name}";
      if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval "${name}=\"$2\"";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;

    *) break;
  esac
done

work_dir="$(pwd)"
data_dir="${work_dir}/data/impulse_responses"

mkdir -p "${data_dir}"

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  $verbose && echo "stage 1: download simulated room impulse responses"
  cd "${data_dir}" || exit 1;

  # https://www.openslr.org/26/
  wget https://www.openslr.org/resources/26/sim_rir_8k.zip
  wget https://www.openslr.org/resources/26/sim_rir_16k.zip

  unzip sim_rir_8k.zip
  unzip sim_rir_16k.zip

  # https://www.openslr.org/28/
  wget https://www.openslr.org/resources/28/rirs_noises.zip
  unzip rirs_noises.zip

fi
|
main.py
CHANGED
@@ -29,6 +29,7 @@ from toolbox.audio_edit.speech_speed import change_speech_speed, engine_to_funct
|
|
29 |
from toolbox.audio_edit.volume import change_volume, engine_to_function as volume_engine_to_function
|
30 |
from toolbox.audio_edit.augment import mix_speech_and_noise
|
31 |
from toolbox.audio_edit.reverb import reverb, engine_to_function as reverb_engine_to_function
|
|
|
32 |
|
33 |
|
34 |
def get_args():
|
@@ -42,6 +43,10 @@ def get_args():
|
|
42 |
return args
|
43 |
|
44 |
|
|
|
|
|
|
|
|
|
45 |
def save_input_audio(sample_rate: int, signal: np.ndarray) -> str:
|
46 |
|
47 |
temp_audio_dir = Path(tempfile.gettempdir()) / "input_audio"
|
@@ -548,6 +553,16 @@ def main():
|
|
548 |
mix_output_audio, mix_log
|
549 |
],
|
550 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
551 |
|
552 |
# http://127.0.0.1:7861/
|
553 |
# http://10.75.27.247:7861/
|
|
|
29 |
from toolbox.audio_edit.volume import change_volume, engine_to_function as volume_engine_to_function
|
30 |
from toolbox.audio_edit.augment import mix_speech_and_noise
|
31 |
from toolbox.audio_edit.reverb import reverb, engine_to_function as reverb_engine_to_function
|
32 |
+
from toolbox.os.command import Command
|
33 |
|
34 |
|
35 |
def get_args():
|
|
|
43 |
return args
|
44 |
|
45 |
|
46 |
+
def shell(cmd: str):
    """
    Pass ``cmd`` to ``Command.popen`` and return whatever that helper returns.

    :param cmd: command text; in this file it comes from the "cmd" textbox of the "shell" tab.
    :return: the result of ``Command.popen(cmd)`` (shown in the "output" textbox).
    """
    # NOTE(review): SECURITY - this callback is bound to the "run" button of the
    # gradio "shell" tab, so text typed by anyone who can reach the web UI is
    # forwarded unchanged. Presumably Command.popen executes it as a shell
    # command - confirm, and restrict or remove this tab before exposing the app.
    return Command.popen(cmd)
|
48 |
+
|
49 |
+
|
50 |
def save_input_audio(sample_rate: int, signal: np.ndarray) -> str:
|
51 |
|
52 |
temp_audio_dir = Path(tempfile.gettempdir()) / "input_audio"
|
|
|
553 |
mix_output_audio, mix_log
|
554 |
],
|
555 |
)
|
556 |
+
with gr.TabItem("shell"):
|
557 |
+
shell_text = gr.Textbox(label="cmd")
|
558 |
+
shell_button = gr.Button("run")
|
559 |
+
shell_output = gr.Textbox(label="output")
|
560 |
+
|
561 |
+
shell_button.click(
|
562 |
+
shell,
|
563 |
+
inputs=[shell_text,],
|
564 |
+
outputs=[shell_output],
|
565 |
+
)
|
566 |
|
567 |
# http://127.0.0.1:7861/
|
568 |
# http://10.75.27.247:7861/
|
toolbox/audio_edit/reverb.py
CHANGED
@@ -1,8 +1,13 @@
|
|
1 |
#!/usr/bin/python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
import json
|
|
|
|
|
|
|
|
|
4 |
from typing import List, Tuple
|
5 |
|
|
|
6 |
import numpy as np
|
7 |
import pedalboard
|
8 |
import pyroomacoustics as pra
|
@@ -15,8 +20,9 @@ def reverb_by_pedalboard(signal: np.ndarray,
|
|
15 |
width: float = 1.0,
|
16 |
dry_level: float = 0.4,
|
17 |
wet_level: float = 0.6,
|
18 |
-
freeze_mode: bool = False
|
19 |
-
|
|
|
20 |
|
21 |
board = pedalboard.Pedalboard([
|
22 |
pedalboard.Reverb(
|
@@ -40,7 +46,8 @@ def reverb_by_pyroomacoustics(signal: np.ndarray,
|
|
40 |
source_position: Tuple[float, float] = (2.5, 4.5),
|
41 |
microphone_array: List[Tuple[float, float]] = None,
|
42 |
output_microphone_idx: int = 0,
|
43 |
-
|
|
|
44 |
# signal: float32, (-1, 1)
|
45 |
if microphone_array is None:
|
46 |
microphone_array = [[1.5, 1.5], [2.5, 1.5]]
|
@@ -65,9 +72,62 @@ def reverb_by_pyroomacoustics(signal: np.ndarray,
|
|
65 |
return reverberated_audio
|
66 |
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
engine_to_function = {
|
69 |
"pedalboard": reverb_by_pedalboard,
|
70 |
"pyroomacoustics": reverb_by_pyroomacoustics,
|
|
|
71 |
}
|
72 |
|
73 |
|
|
|
1 |
#!/usr/bin/python3
|
2 |
# -*- coding: utf-8 -*-
|
3 |
import json
|
4 |
+
import os.path
|
5 |
+
import random
|
6 |
+
from functools import lru_cache
|
7 |
+
from pathlib import Path
|
8 |
from typing import List, Tuple
|
9 |
|
10 |
+
import librosa
|
11 |
import numpy as np
|
12 |
import pedalboard
|
13 |
import pyroomacoustics as pra
|
|
|
20 |
width: float = 1.0,
|
21 |
dry_level: float = 0.4,
|
22 |
wet_level: float = 0.6,
|
23 |
+
freeze_mode: bool = False,
|
24 |
+
**kwargs,
|
25 |
+
) -> np.ndarray:
|
26 |
|
27 |
board = pedalboard.Pedalboard([
|
28 |
pedalboard.Reverb(
|
|
|
46 |
source_position: Tuple[float, float] = (2.5, 4.5),
|
47 |
microphone_array: List[Tuple[float, float]] = None,
|
48 |
output_microphone_idx: int = 0,
|
49 |
+
**kwargs,
|
50 |
+
) -> np.ndarray:
|
51 |
# signal: float32, (-1, 1)
|
52 |
if microphone_array is None:
|
53 |
microphone_array = [[1.5, 1.5], [2.5, 1.5]]
|
|
|
72 |
return reverberated_audio
|
73 |
|
74 |
|
75 |
+
def reverb_by_convolve(signal: np.ndarray,
                       impulse_response: np.ndarray,
                       ) -> np.ndarray:
    """
    Add reverberation by convolving a signal with an impulse response.

    The convolution tail is dropped so the result has as many samples as
    ``signal``, and the result is rescaled so that its peak absolute value
    equals the peak absolute value of ``signal``.

    :param signal: 1-D array of audio samples.
    :param impulse_response: 1-D array holding the impulse response.
    :return: 1-D array with ``len(signal)`` samples.
    """
    reverberant_audio = np.convolve(signal, impulse_response, mode="full")
    reverberant_audio = reverberant_audio[:len(signal)]

    # Match the input peak. A silent input (or an all-zero impulse response)
    # gives a zero peak; skip the rescale there instead of dividing by zero,
    # which would fill the output with NaN.
    peak = np.max(np.abs(reverberant_audio))
    if peak > 0:
        reverberant_audio = reverberant_audio * (np.max(np.abs(signal)) / peak)

    return reverberant_audio
|
84 |
+
|
85 |
+
|
86 |
+
# Directory that holds the SLR28 (RIRS_NOISES) simulated room impulse responses.
# The original developer-machine location is tried first. When it does not
# exist, fall back to the copy that install.sh unpacks under
# <project root>/data/impulse_responses, so the module also works in the
# Docker image.
# NOTE(review): the fallback assumes this module sits two directories below the
# project root (toolbox/audio_edit/) and that rirs_noises.zip unpacks to
# RIRS_NOISES/simulated_rirs - TODO confirm.
_slr28_rir_path_candidates = (
    Path(r"E:\programmer\asr_datasets\dns-challenge\datasets.impulse_responses\datasets\impulse_responses\SLR28\RIRS_NOISES\simulated_rirs"),
    Path(__file__).resolve().parent.parent.parent / "data" / "impulse_responses" / "RIRS_NOISES" / "simulated_rirs",
)
# First candidate that exists wins; if none exists keep the original value so
# behaviour is unchanged on machines without the data.
slr28_rir_path = next(
    (candidate for candidate in _slr28_rir_path_candidates if candidate.is_dir()),
    _slr28_rir_path_candidates[0],
)
|
88 |
+
|
89 |
+
|
90 |
+
@lru_cache(maxsize=10)
def get_slr28_rir_file_list():
    """
    List every ``.wav`` file found recursively below ``slr28_rir_path``.

    The result is memoised by ``lru_cache``, so the directory tree is only
    walked on the first call.

    :return: list of ``Path`` objects, in the order ``glob`` yields them.
    """
    return list(slr28_rir_path.glob("**/*.wav"))
|
98 |
+
|
99 |
+
|
100 |
+
def get_rir_file(rir_file: str = None) -> str:
    """
    Resolve which impulse-response file to use.

    :param rir_file: ``None`` to pick a random file from
        ``get_slr28_rir_file_list()``; a path to an existing file, which is
        returned unchanged; or any other value, which is joined onto
        ``slr28_rir_path``.
    :return: the chosen file path as a string.
    :raises ValueError: if ``rir_file`` is ``None`` and the SLR28 listing is empty.
    """
    if rir_file is None:
        rir_file_list = get_slr28_rir_file_list()
        if not rir_file_list:
            # Same exception type random.sample raised here before, but with a
            # message that says what is actually missing.
            raise ValueError(
                f"no impulse response .wav files found under {slr28_rir_path}"
            )
        # The listing holds Path objects; convert so every branch returns str.
        return Path(random.choice(rir_file_list)).as_posix()

    if os.path.isfile(rir_file):
        return rir_file

    # Not an existing file: treat it as a path relative to the SLR28 directory.
    return (slr28_rir_path / rir_file).as_posix()
|
111 |
+
|
112 |
+
|
113 |
+
def reverb_by_slr28(signal: np.ndarray,
                    sample_rate: int,
                    rir_file: str = None,
                    **kwargs,
                    ) -> np.ndarray:
    """
    Reverberate ``signal`` with an impulse response loaded from a file.

    :param signal: 1-D array of audio samples.
    :param sample_rate: passed to ``librosa.load`` as ``sr`` when reading the impulse response.
    :param rir_file: forwarded to ``get_rir_file`` to choose the impulse-response file.
    :param kwargs: accepted and ignored.
    :return: the output of ``reverb_by_convolve`` for ``signal`` and the loaded impulse response.
    :raises AssertionError: if the loaded impulse response is not 1-D.
    """
    rir_file = get_rir_file(rir_file)
    # mono=False leaves the channel layout untouched so a multi-channel file is
    # detected below rather than silently down-mixed.
    impulse_response, _ = librosa.load(rir_file, mono=False, sr=sample_rate)
    if impulse_response.ndim != 1:
        raise AssertionError(
            f"expected a single-channel impulse response, "
            f"got an array of shape {impulse_response.shape} from {rir_file}"
        )

    return reverb_by_convolve(signal, impulse_response)
|
125 |
+
|
126 |
+
|
127 |
# Maps the reverb engine name to the function that implements it.
engine_to_function = dict(
    pedalboard=reverb_by_pedalboard,
    pyroomacoustics=reverb_by_pyroomacoustics,
    slr28=reverb_by_slr28,
)
|
132 |
|
133 |
|