HoneyTian committed on
Commit ba051ef · 1 Parent(s): 17263d1
.gitignore CHANGED
@@ -3,6 +3,7 @@
  .idea/
  
  #/data/
+ /data/impulse_responses
  /dotenv/
  /logs/
  /trained_models
Dockerfile CHANGED
@@ -5,11 +5,13 @@ WORKDIR /code
  COPY . /code
  
  RUN apt-get update
- RUN apt-get install -y ffmpeg build-essential
+ RUN apt-get install -y wget ffmpeg build-essential
  
  RUN pip install --upgrade pip
  RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
  
+ RUN bash install.sh --stage 1 --stop_stage 1 --system_version ubuntu
+ 
  RUN useradd -m -u 1000 user
  
  USER user
examples/batch_audio_fmt_convert.py CHANGED
@@ -18,13 +18,13 @@ def get_args():
      parser.add_argument(
          "--audio_dir",
          # default=(project_path / "data/yd").as_posix(),
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾",
+         default=r"E:\牛信文档\语音克隆\多语种语音克隆\money_char",
          type=str,
      )
      parser.add_argument(
          "--output_dir",
          # default=(project_path / "data/temp_wav").as_posix(),
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾",
+         default=r"E:\牛信文档\语音克隆\多语种语音克隆\money_char",
          type=str,
      )
      args = parser.parse_args()
examples/clone/voice_clone.py CHANGED
@@ -6,21 +6,41 @@ import shutil
  from gradio_client import Client, handle_file
  
  
+ # language1 = "英语"
+ # language2 = "English"
+ # language1 = "西班牙语"
+ # language2 = "Spanish"
+ language1 = "日语"
+ language2 = "Japanese"
+ # language1 = "葡萄牙语"
+ # language2 = "Portuguese"
+ # language1 = "韩语"
+ # language2 = "Korean"
+ # language1 = "阿拉伯语"
+ # language2 = "Arabic"
+ # language1 = "中国台湾"
+ # language2 = "Chinese"
+ 
+ 
  def get_args():
      parser = argparse.ArgumentParser()
      parser.add_argument(
          "--text",
-         default="吴家豪",
+         # default="thirty-three dollars and seventy-two cents",
+         # default="treinta y tres euros con setenta y dos céntimos",
+         default="33.72円",
+         # default="33.72 درهم",
+         # default="三十三元七角两分",
          type=str,
      )
      parser.add_argument(
          "--reference",
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_3.wav",
+         default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\{language2}_1.wav",
          type=str,
      )
      parser.add_argument(
          "--output_file",
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2.wav",
+         default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\xtts_v2_{language2.lower()}_2.wav",
          type=str,
      )
      args = parser.parse_args()
@@ -30,17 +50,17 @@ def get_args():
  def main():
      args = get_args()
  
-     client = Client("https://coqui-xtts.hf.space/--replicas/fib73/")
+     client = Client("https://coqui-xtts.hf.space/--replicas/o7bhl/")
  
      _, synthesised_audio, _, _ = client.predict(
          args.text,
          # "en",
          # "es",
-         # "ja",
+         "ja",
          # "pt",
          # "ko",
          # "ar",
-         "zh-cn",
+         # "zh-cn",
          args.reference,
          args.reference,
          False, False, True, True,
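Note on the change above: switching languages currently means editing two places by hand, the language1/language2 pair at the top of the script and the hard-coded XTTS language code ("ja" here, "zh-cn" before) passed to client.predict. A minimal sketch of keeping them in one table; the LANGUAGES dict and lang_code name are illustrative and not part of this commit:

    # Illustrative sketch, not part of this commit: one key selects the folder name,
    # the reference-file prefix and the XTTS language code together.
    LANGUAGES = {
        "en": ("英语", "English"),
        "es": ("西班牙语", "Spanish"),
        "ja": ("日语", "Japanese"),
        "zh-cn": ("中国台湾", "Chinese"),
    }
    lang_code = "ja"
    language1, language2 = LANGUAGES[lang_code]
    reference = rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\{language2}_1.wav"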
examples/concat/concat_three_and_adapt_volume.py CHANGED
@@ -8,31 +8,47 @@ import numpy as np
  from scipy.io import wavfile
  
  
+ # language1 = "英语"
+ # language2 = "English"
+ # language1 = "西班牙语"
+ # language2 = "Spanish"
+ language1 = "日语"
+ language2 = "Japanese"
+ # language1 = "葡萄牙语"
+ # language2 = "Portuguese"
+ # language1 = "韩语"
+ # language2 = "Korean"
+ # language1 = "阿拉伯语"
+ # language2 = "Arabic"
+ # language1 = "中国台湾"
+ # language2 = "Chinese"
+ 
+ 
  def get_args():
      parser = argparse.ArgumentParser()
      parser.add_argument(
          "--filename1",
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_1.wav",
+         default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\{language2}_1.wav",
          type=str,
      )
      parser.add_argument(
          "--filename2",
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2.wav",
+         default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\xtts_v2_{language2.lower()}_2.wav",
          type=str,
      )
      parser.add_argument(
          "--filename3",
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_3.wav",
+         default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\{language2}_3.wav",
          type=str,
      )
      parser.add_argument(
          "--output_adapt_file",
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2_volume_adapt.wav",
+         default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\xtts_v2_{language2.lower()}_2_volume_adapt.wav",
          type=str,
      )
      parser.add_argument(
          "--output_concat_file",
-         default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2_concat.wav",
+         default=rf"E:\牛信文档\语音克隆\多语种语音克隆\money_num\{language1}\xtts_v2_{language2.lower()}_2_concat.wav",
          type=str,
      )
      args = parser.parse_args()
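The body of concat_three_and_adapt_volume.py is unchanged by this commit and not shown here; judging from the argument names, it levels the cloned middle clip against the two reference clips and then joins the three. A rough sketch of that presumed flow, assuming all three WAVs share a sample rate and dtype (file names are shortened versions of the defaults above):

    # Sketch of the presumed flow only -- not the script's actual implementation.
    import numpy as np
    from scipy.io import wavfile

    def rms(x: np.ndarray) -> float:
        return float(np.sqrt(np.mean(np.square(x.astype(np.float64)))))

    sr1, seg1 = wavfile.read("Japanese_1.wav")            # reference head
    sr2, seg2 = wavfile.read("xtts_v2_japanese_2.wav")    # cloned middle
    sr3, seg3 = wavfile.read("Japanese_3.wav")            # reference tail
    assert sr1 == sr2 == sr3

    # match the cloned clip's loudness to the average of the reference clips
    target = (rms(seg1) + rms(seg3)) / 2
    seg2 = (seg2.astype(np.float64) * target / max(rms(seg2), 1e-8)).astype(seg1.dtype)

    wavfile.write("xtts_v2_japanese_2_volume_adapt.wav", sr2, seg2)
    wavfile.write("xtts_v2_japanese_2_concat.wav", sr1, np.concatenate([seg1, seg2, seg3]))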
install.sh ADDED
@@ -0,0 +1,62 @@
+ #!/usr/bin/env bash
+ 
+ # bash install.sh --stage 1 --stop_stage 1 --system_version windows
+ 
+ 
+ system_version="centos";
+ verbose=true;
+ stage=-1
+ stop_stage=0
+ 
+ 
+ # parse options
+ while true; do
+   [ -z "${1:-}" ] && break;  # break if there are no arguments
+   case "$1" in
+     --*) name=$(echo "$1" | sed s/^--// | sed s/-/_/g);
+       eval '[ -z "${'"$name"'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
+       old_value="$(eval echo \$$name)";
+       if [ "${old_value}" == "true" ] || [ "${old_value}" == "false" ]; then
+         was_bool=true;
+       else
+         was_bool=false;
+       fi
+ 
+       # Set the variable to the right value-- the escaped quotes make it work if
+       # the option had spaces, like --cmd "queue.pl -sync y"
+       eval "${name}=\"$2\"";
+ 
+       # Check that Boolean-valued arguments are really Boolean.
+       if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
+         echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
+         exit 1;
+       fi
+       shift 2;
+       ;;
+ 
+     *) break;
+   esac
+ done
+ 
+ work_dir="$(pwd)"
+ data_dir="${work_dir}/data/impulse_responses"
+ 
+ mkdir -p "${data_dir}"
+ 
+ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+   $verbose && echo "stage 1: download simulated room impulse responses"
+   cd "${data_dir}" || exit 1;
+ 
+   # https://www.openslr.org/26/
+   wget https://www.openslr.org/resources/26/sim_rir_8k.zip
+   wget https://www.openslr.org/resources/26/sim_rir_16k.zip
+ 
+   unzip sim_rir_8k.zip
+   unzip sim_rir_16k.zip
+ 
+   # https://www.openslr.org/28/
+   wget https://www.openslr.org/resources/28/rirs_noises.zip
+   unzip rirs_noises.zip
+ 
+ fi
+ 
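After stage 1 the impulse-response WAVs sit under data/impulse_responses, the directory the .gitignore change above excludes and the new Dockerfile step populates at build time. A minimal sketch of enumerating them from that project-relative location instead of a machine-specific path; the layout inside the archives is assumed to match the openslr zips downloaded above:

    # Minimal sketch: list the downloaded impulse responses relative to the project root.
    from pathlib import Path

    data_dir = Path("data/impulse_responses")
    rir_files = sorted(data_dir.glob("**/*.wav"))
    print(f"found {len(rir_files)} impulse response files")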
main.py CHANGED
@@ -29,6 +29,7 @@ from toolbox.audio_edit.speech_speed import change_speech_speed, engine_to_funct
  from toolbox.audio_edit.volume import change_volume, engine_to_function as volume_engine_to_function
  from toolbox.audio_edit.augment import mix_speech_and_noise
  from toolbox.audio_edit.reverb import reverb, engine_to_function as reverb_engine_to_function
+ from toolbox.os.command import Command
  
  
  def get_args():
@@ -42,6 +43,10 @@ def get_args():
      return args
  
  
+ def shell(cmd: str):
+     return Command.popen(cmd)
+ 
+ 
  def save_input_audio(sample_rate: int, signal: np.ndarray) -> str:
  
      temp_audio_dir = Path(tempfile.gettempdir()) / "input_audio"
@@ -548,6 +553,16 @@ def main():
                    mix_output_audio, mix_log
                ],
            )
+         with gr.TabItem("shell"):
+             shell_text = gr.Textbox(label="cmd")
+             shell_button = gr.Button("run")
+             shell_output = gr.Textbox(label="output")
+ 
+             shell_button.click(
+                 shell,
+                 inputs=[shell_text,],
+                 outputs=[shell_output],
+             )
  
      # http://127.0.0.1:7861/
      # http://10.75.27.247:7861/
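toolbox/os/command.py is not touched by this commit, so the behaviour of Command.popen is assumed here: presumably it runs the command line and returns its output as text. A minimal, hypothetical stand-in for such a helper (not the repository's implementation):

    # Hypothetical stand-in for toolbox.os.command.Command -- assumed behaviour only.
    import subprocess

    class Command:
        @staticmethod
        def popen(cmd: str) -> str:
            completed = subprocess.run(cmd, shell=True, capture_output=True, text=True)
            return completed.stdout + completed.stderr

Whatever the real helper does, the new "shell" tab passes user-supplied text straight to it, so on a shared or public deployment this tab executes arbitrary commands as the container user and should be restricted or removed.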
toolbox/audio_edit/reverb.py CHANGED
@@ -1,8 +1,13 @@
  #!/usr/bin/python3
  # -*- coding: utf-8 -*-
  import json
+ import os.path
+ import random
+ from functools import lru_cache
+ from pathlib import Path
  from typing import List, Tuple
  
+ import librosa
  import numpy as np
  import pedalboard
  import pyroomacoustics as pra
@@ -15,8 +20,9 @@ def reverb_by_pedalboard(signal: np.ndarray,
                           width: float = 1.0,
                           dry_level: float = 0.4,
                           wet_level: float = 0.6,
-                          freeze_mode: bool = False
-                          ):
+                          freeze_mode: bool = False,
+                          **kwargs,
+                          ) -> np.ndarray:
  
      board = pedalboard.Pedalboard([
          pedalboard.Reverb(
@@ -40,7 +46,8 @@ def reverb_by_pyroomacoustics(signal: np.ndarray,
                                source_position: Tuple[float, float] = (2.5, 4.5),
                                microphone_array: List[Tuple[float, float]] = None,
                                output_microphone_idx: int = 0,
-                               ):
+                               **kwargs,
+                               ) -> np.ndarray:
      # signal: float32, (-1, 1)
      if microphone_array is None:
          microphone_array = [[1.5, 1.5], [2.5, 1.5]]
@@ -65,9 +72,62 @@
      return reverberated_audio
  
  
+ def reverb_by_convolve(signal: np.ndarray,
+                        impulse_response: np.ndarray,
+                        ) -> np.ndarray:
+     reverberant_audio = np.convolve(signal, impulse_response, mode="full")
+     reverberant_audio = reverberant_audio[:len(signal)]
+ 
+     reverberant_audio = reverberant_audio * (np.max(np.abs(signal)) / np.max(np.abs(reverberant_audio)))
+ 
+     return reverberant_audio
+ 
+ 
+ slr28_rir_path = r"E:\programmer\asr_datasets\dns-challenge\datasets.impulse_responses\datasets\impulse_responses\SLR28\RIRS_NOISES\simulated_rirs"
+ slr28_rir_path = Path(slr28_rir_path)
+ 
+ 
+ @lru_cache(maxsize=10)
+ def get_slr28_rir_file_list():
+     global slr28_rir_path
+ 
+     rir_file_list = list()
+     for filename in slr28_rir_path.glob("**/*.wav"):
+         rir_file_list.append(filename)
+     return rir_file_list
+ 
+ 
+ def get_rir_file(rir_file: str = None) -> str:
+     if rir_file is None:
+         rir_file_list = get_slr28_rir_file_list()
+ 
+         rir_file = random.sample(rir_file_list, 1)[0]
+     elif os.path.isfile(rir_file):
+         pass
+     else:
+         rir_file = slr28_rir_path / rir_file
+         rir_file = rir_file.as_posix()
+     return rir_file
+ 
+ 
+ def reverb_by_slr28(signal: np.ndarray,
+                     sample_rate: int,
+                     rir_file: str = None,
+                     **kwargs,
+                     ):
+     rir_file = get_rir_file(rir_file)
+     impulse_response, _ = librosa.load(rir_file, mono=False, sr=sample_rate)
+     if impulse_response.ndim != 1:
+         raise AssertionError
+ 
+     reverberant_audio = reverb_by_convolve(signal, impulse_response)
+     return reverberant_audio
+ 
+ 
  engine_to_function = {
      "pedalboard": reverb_by_pedalboard,
      "pyroomacoustics": reverb_by_pyroomacoustics,
+     "slr28": reverb_by_slr28,
  }
  
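A minimal usage sketch of the new "slr28" entry through the engine_to_function table; the speech file path is illustrative, soundfile is assumed to be available, and reverb_by_slr28 requires the SLR28 RIRs to exist at slr28_rir_path (or an explicit rir_file):

    # Usage sketch for the new engine; paths are illustrative.
    import librosa
    import soundfile as sf
    from toolbox.audio_edit.reverb import engine_to_function

    signal, sample_rate = librosa.load("speech.wav", sr=8000)      # float32 in (-1, 1)
    reverb_fn = engine_to_function["slr28"]
    reverberant = reverb_fn(signal, sample_rate=sample_rate)       # random RIR when rir_file is None
    sf.write("speech_reverb.wav", reverberant, sample_rate)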