HoneyTian committed on
Commit 17263d1 · 1 Parent(s): 846c06f
examples/batch_audio_fmt_convert.py CHANGED
@@ -5,6 +5,8 @@ import argparse
 import librosa
 import numpy as np
 from pathlib import Path
+
+from fsspec.registry import default
 from scipy.io import wavfile
 from tqdm import tqdm
 
@@ -15,12 +17,14 @@ def get_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--audio_dir",
-        default=(project_path / "data/yd").as_posix(),
+        # default=(project_path / "data/yd").as_posix(),
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾",
         type=str,
     )
     parser.add_argument(
         "--output_dir",
-        default=(project_path / "data/temp_wav").as_posix(),
+        # default=(project_path / "data/temp_wav").as_posix(),
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾",
         type=str,
     )
     args = parser.parse_args()
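
Only the argparse defaults of examples/batch_audio_fmt_convert.py change in this commit; the conversion loop itself is not part of the diff. For orientation, a minimal sketch of what a batch format conversion over --audio_dir could look like with the modules this file imports (the glob pattern, target sample rate and output naming are assumptions, not the repository's actual implementation):

# Illustrative sketch only, not the code in examples/batch_audio_fmt_convert.py.
from pathlib import Path

import librosa
import numpy as np
from scipy.io import wavfile
from tqdm import tqdm

def batch_convert(audio_dir: str, output_dir: str, sample_rate: int = 8000):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    for filename in tqdm(list(Path(audio_dir).glob("**/*.wav"))):
        # librosa returns float32 samples in [-1, 1] at the requested rate
        signal, _ = librosa.load(filename.as_posix(), sr=sample_rate)
        signal = np.clip(signal * 32768.0, -32768, 32767).astype(np.int16)
        wavfile.write((output_dir / filename.name).as_posix(), sample_rate, signal)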
examples/clone/voice_clone.py ADDED
@@ -0,0 +1,58 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import shutil
+
+from gradio_client import Client, handle_file
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--text",
+        default="吴家豪",
+        type=str,
+    )
+    parser.add_argument(
+        "--reference",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_3.wav",
+        type=str,
+    )
+    parser.add_argument(
+        "--output_file",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2.wav",
+        type=str,
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    client = Client("https://coqui-xtts.hf.space/--replicas/fib73/")
+
+    _, synthesised_audio, _, _ = client.predict(
+        args.text,
+        # "en",
+        # "es",
+        # "ja",
+        # "pt",
+        # "ko",
+        # "ar",
+        "zh-cn",
+        args.reference,
+        args.reference,
+        False, False, True, True,
+        fn_index=1
+    )
+
+    shutil.move(
+        synthesised_audio,
+        args.output_file
+    )
+    return
+
+
+if __name__ == '__main__':
+    main()
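
voice_clone.py sends the text and a reference clip to the hosted Coqui XTTS Gradio space through gradio_client and moves the synthesised wav to --output_file. The same Client connection can be reused for several utterances; a minimal sketch, where the example texts, reference and output paths are placeholders rather than part of the commit:

# Hypothetical batch usage of the same space; texts, reference and outputs are placeholders.
import shutil

from gradio_client import Client

client = Client("https://coqui-xtts.hf.space/--replicas/fib73/")
reference = r"E:\voice\reference.wav"  # placeholder reference clip

for idx, text in enumerate(["placeholder sentence one", "placeholder sentence two"]):
    # same positional arguments and fn_index as in voice_clone.py above
    _, synthesised_audio, _, _ = client.predict(
        text,
        "zh-cn",
        reference,
        reference,
        False, False, True, True,
        fn_index=1,
    )
    shutil.move(synthesised_audio, rf"E:\voice\clone_{idx}.wav")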
examples/concat/batch_concat_three_and_adapt_volume.py ADDED
@@ -0,0 +1,54 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+
+from gradio_client import Client, handle_file
+import numpy as np
+from scipy.io import wavfile
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--filename1",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\英语\English_1.wav",
+        type=str,
+    )
+    parser.add_argument(
+        "--filename2",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\英语\xtts_v2_english_2.wav",
+        type=str,
+    )
+    parser.add_argument(
+        "--filename3",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\英语\English_3.wav",
+        type=str,
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    client = Client("http://10.75.27.247:7861/")
+    new_filename2, _ = client.predict(
+        audio_t=handle_file(args.filename2),
+        radio=1,
+        decibel=0,
+        reference=handle_file(args.filename3),
+        engine="by_ffmpy_by_db",
+        api_name="/when_click_change_volume"
+    )
+
+    _, signal1 = wavfile.read(args.filename1)
+    _, signal2 = wavfile.read(new_filename2)
+    _, signal3 = wavfile.read(args.filename3)
+
+    signal = np.concat([signal1, signal2, signal3], axis=0)
+    print(signal.shape)
+    return
+
+
+if __name__ == '__main__':
+    main()
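
np.concat is the NumPy >= 2.0 alias for np.concatenate, and wavfile.read hands back whatever sample rate and dtype each file was stored with, so the concatenation above is only meaningful when all three inputs agree. A small defensive sketch along the same lines (the helper, its checks and the output file are illustrative additions, not part of the script):

# Illustrative sanity check before concatenating wav files of possibly mixed formats.
import numpy as np
from scipy.io import wavfile

def concat_wavs(filenames, output_file):
    rates, signals = zip(*[wavfile.read(f) for f in filenames])
    if len(set(rates)) != 1 or len({s.dtype for s in signals}) != 1:
        raise ValueError(f"sample rates {rates} or dtypes differ; resample/convert first")
    signal = np.concatenate(signals, axis=0)
    wavfile.write(output_file, rates[0], signal)
    return signal.shape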
examples/concat/concat_three_and_adapt_volume.py ADDED
@@ -0,0 +1,85 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import argparse
+import shutil
+
+from gradio_client import Client, handle_file
+import numpy as np
+from scipy.io import wavfile
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--filename1",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_1.wav",
+        type=str,
+    )
+    parser.add_argument(
+        "--filename2",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2.wav",
+        type=str,
+    )
+    parser.add_argument(
+        "--filename3",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\Chinese_3.wav",
+        type=str,
+    )
+    parser.add_argument(
+        "--output_adapt_file",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2_volume_adapt.wav",
+        type=str,
+    )
+    parser.add_argument(
+        "--output_concat_file",
+        default=r"E:\牛信文档\语音克隆\多语种语音克隆\voice\中国台湾\xtts_v2_chinese_2_concat.wav",
+        type=str,
+    )
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    args = get_args()
+
+    # client = Client("http://10.75.27.247:7861/")
+    client = Client("http://127.0.0.1:7861/")
+
+    new_filename2, _ = client.predict(
+        audio_t=handle_file(args.filename2),
+        radio=1,
+        decibel=0,
+        reference=handle_file(args.filename3),
+        engine="by_pydub_by_reference",
+        api_name="/when_click_change_volume"
+    )
+
+    new_filename2, _, _, _ = client.predict(
+        audio_t=handle_file(new_filename2),
+        to_sample_rate=8000,
+        sample_width=2,
+        channels="0",
+        engine="librosa",
+        api_name="/when_click_audio_convert"
+    )
+
+    _, signal1 = wavfile.read(args.filename1)
+    _, signal2 = wavfile.read(new_filename2)
+    _, signal3 = wavfile.read(args.filename3)
+
+    signal = np.concat([signal1, signal2, signal3], axis=0)
+
+    shutil.move(
+        new_filename2,
+        args.output_adapt_file
+    )
+    wavfile.write(
+        args.output_concat_file,
+        8000,
+        signal,
+    )
+    return
+
+
+if __name__ == '__main__':
+    main()
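
Before concatenation the cloned segment's volume is adapted to the reference clip through the server's /when_click_change_volume endpoint with engine="by_pydub_by_reference", then converted via /when_click_audio_convert (to_sample_rate=8000, sample_width=2, channels="0"). As a rough illustration of what reference-based loudness matching can look like locally with pydub (an assumption for clarity only; the actual server-side implementation lives in toolbox/audio_edit and may differ):

# Illustrative local loudness match with pydub; paths are placeholders.
from pydub import AudioSegment

def match_loudness(audio_file: str, reference_file: str, output_file: str) -> str:
    audio = AudioSegment.from_file(audio_file)
    reference = AudioSegment.from_file(reference_file)
    # dBFS is the average loudness relative to full scale; apply the difference as gain
    adjusted = audio.apply_gain(reference.dBFS - audio.dBFS)
    adjusted.export(output_file, format="wav")
    return output_file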
examples/concat/test1.py DELETED
@@ -1,48 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import argparse
-
-import os
-from ffmpy import FFmpeg
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--change_by_db", default=-11, type=int)
-    args = parser.parse_args()
-    return args
-
-
-def change_by_decibel(audio_path: str, output_file: str, decibel):
-    ext = os.path.basename(audio_path).strip().split(".")[-1]
-    if ext not in ["wav", "mp3"]:
-        raise Exception("format error")
-    if os.path.exists(output_file):
-        os.remove(output_file)
-    ff = FFmpeg(
-        inputs={audio_path: None},
-        outputs={output_file: f'-filter:a "volume={decibel}dB"'}
-    )
-    ff.run()
-    return output_file
-
-
-def main():
-    args = get_args()
-
-    for i in range(10):
-        filename = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2.wav".format(i)
-        output_file = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_volume.wav".format(i)
-
-        output_file = change_by_decibel(
-            filename,
-            output_file,
-            args.change_by_db,
-        )
-        # print(f"output_file: {output_file}")
-
-    return
-
-
-if __name__ == "__main__":
-    main()
examples/concat/test2.py DELETED
@@ -1,51 +0,0 @@
-#!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import librosa
-import numpy as np
-from scipy.io import wavfile
-
-
-for i in range(10):
-    filename1 = r"C:\Users\tianx\Desktop\Audio\x_tts_v2\audio_0_section_1.wav"
-    filename2 = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_volume.wav".format(i)
-    filename3 = r"C:\Users\tianx\Desktop\Audio\x_tts_v2\audio_0_section_2.wav"
-
-    output_filename = r"C:\Users\tianx\Desktop\Audio\open_voice_v2\audio_{}_3_clone_from_audio_0_2_concat_volume.wav".format(i)
-
-    signal1, sample_rate = librosa.load(filename1, sr=8000)
-
-    print(sample_rate)
-    print(signal1.dtype)
-    print(signal1.shape)
-
-    signal2, sample_rate = librosa.load(filename2, sr=8000)
-
-    print(sample_rate)
-    print(signal2.dtype)
-    print(signal2.shape)
-
-    signal3, sample_rate = librosa.load(filename3, sr=8000)
-
-    print(sample_rate)
-    print(signal3.dtype)
-    print(signal3.shape)
-
-    signal = np.concatenate([signal1, signal2, signal3], dtype=np.float32)
-    print(signal.dtype)
-    print(signal.shape)
-
-    max_wave_value = 32768.0
-    signal *= max_wave_value
-    signal = np.array(signal, dtype=np.int16)
-    print(signal.dtype)
-    print(sample_rate)
-
-    wavfile.write(
-        output_filename,
-        8000,
-        signal,
-    )
-
-
-if __name__ == '__main__':
-    pass
main.py CHANGED
@@ -1,14 +1,14 @@
 #!/usr/bin/python3
 # -*- coding: utf-8 -*-
 """
-docker build -t audio_edit:v20250109_1615 .
+docker build -t audio_edit:v20250116_1917 .
 
 docker run -itd \
 --name audio_edit_7861 \
 --restart=always \
 --network host \
 -e port=7861 \
-audio_edit:v20250109_1615
+audio_edit:v20250116_1917
 """
 import argparse
 import json
toolbox/audio_edit/convert.py CHANGED
@@ -55,6 +55,10 @@ def audio_convert_by_librosa(filename: str,
     signal = np.concatenate(signal_, axis=-1)
 
     if sample_width == 2:
+        scale = np.max([np.abs(np.max(signal)), np.abs(np.min(signal))])
+        if scale > 1:
+            signal /= scale
+
         max_wave_value = 32768.0
         signal *= max_wave_value
         signal = np.array(signal, dtype=np.int16)
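
The added lines guard the float-to-int16 conversion: if the float signal's peak magnitude exceeds 1.0, the signal is rescaled so that multiplying by 32768 cannot overflow the int16 range. A standalone sketch of the same idea (the function name and test signal are illustrative):

# Illustrative peak-normalised float32 -> int16 conversion.
import numpy as np

def float_to_int16(signal: np.ndarray) -> np.ndarray:
    signal = np.asarray(signal, dtype=np.float32)
    scale = np.max(np.abs(signal)) if signal.size else 0.0
    if scale > 1.0:
        signal = signal / scale
    return np.clip(signal * 32768.0, -32768, 32767).astype(np.int16)

# e.g. a slightly clipped sine wave still fits in int16 after conversion
print(float_to_int16(1.2 * np.sin(np.linspace(0.0, 6.28, 8))).dtype)  # int16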