Commit
·
6a6dfa7
1
Parent(s):
be197a9
modified app.py and changed path for text classifier
Browse files- app.py +1 -1
- synthesize.py +11 -17
app.py
CHANGED
@@ -20,7 +20,7 @@ def synthesize_speech(input_type, text, own_text, speaker_id, embed_type, emotio
|
|
20 |
if embed_type == "bert_embed":
|
21 |
command = f"python3 synthesize.py --text '{selected_text}' --bert_embed 1 --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
|
22 |
else:
|
23 |
-
command = f"python3 synthesize.py --text '{selected_text}' --emotion_id {emotion_id} --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
|
24 |
|
25 |
output = subprocess.check_output(command, shell=True)
|
26 |
audio_file = f'output/result/EmoV_DB/{selected_text}.wav'
|
|
|
20 |
if embed_type == "bert_embed":
|
21 |
command = f"python3 synthesize.py --text '{selected_text}' --bert_embed 1 --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
|
22 |
else:
|
23 |
+
command = f"python3 synthesize.py --text '{selected_text}' --emotion_id {emotion_mapping[emotion_id]} --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
|
24 |
|
25 |
output = subprocess.check_output(command, shell=True)
|
26 |
audio_file = f'output/result/EmoV_DB/{selected_text}.wav'
|
synthesize.py
CHANGED
@@ -5,9 +5,6 @@ from string import punctuation
|
|
5 |
import torch
|
6 |
|
7 |
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
import yaml
|
12 |
import numpy as np
|
13 |
from torch.utils.data import DataLoader
|
@@ -21,7 +18,7 @@ from text import text_to_sequence
|
|
21 |
|
22 |
from transformers import RobertaTokenizerFast, AutoModel, AutoModelForSequenceClassification
|
23 |
|
24 |
-
ro_model = "/
|
25 |
roberta_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
|
26 |
roberta_model = AutoModelForSequenceClassification.from_pretrained(ro_model)
|
27 |
|
@@ -29,9 +26,6 @@ roberta_model = AutoModelForSequenceClassification.from_pretrained(ro_model)
|
|
29 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
30 |
|
31 |
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
def read_lexicon(lex_path):
|
36 |
lexicon = {}
|
37 |
with open(lex_path) as f:
|
@@ -216,32 +210,32 @@ if __name__ == "__main__":
|
|
216 |
|
217 |
# Get model
|
218 |
model = get_model(args, configs, device, train=False)
|
219 |
-
|
220 |
# Load vocoder
|
221 |
vocoder = get_vocoder(model_config, device)
|
222 |
-
|
223 |
# Preprocess texts
|
224 |
if args.mode == "batch":
|
225 |
# Get dataset
|
226 |
dataset = TextDataset(args.source, preprocess_config)
|
227 |
batchs = DataLoader(
|
228 |
dataset,
|
229 |
-
batch_size=8,
|
230 |
collate_fn=dataset.collate_fn,
|
231 |
)
|
232 |
if args.mode == "single":
|
233 |
|
234 |
if np.array([args.bert_embed]) == 0:
|
235 |
-
|
236 |
-
|
237 |
else:
|
238 |
-
|
239 |
-
|
240 |
-
|
|
|
241 |
ids = raw_texts = [args.text[:100]]
|
242 |
speakers = np.array([args.speaker_id])
|
243 |
-
|
244 |
-
|
245 |
if preprocess_config["preprocessing"]["text"]["language"] == "en":
|
246 |
texts = np.array(
|
247 |
[preprocess_english(args.text, preprocess_config)])
|
|
|
5 |
import torch
|
6 |
|
7 |
|
|
|
|
|
|
|
8 |
import yaml
|
9 |
import numpy as np
|
10 |
from torch.utils.data import DataLoader
|
|
|
18 |
|
19 |
from transformers import RobertaTokenizerFast, AutoModel, AutoModelForSequenceClassification
|
20 |
|
21 |
+
ro_model = "/roberta_pretrained"
|
22 |
roberta_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
|
23 |
roberta_model = AutoModelForSequenceClassification.from_pretrained(ro_model)
|
24 |
|
|
|
26 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
27 |
|
28 |
|
|
|
|
|
|
|
29 |
def read_lexicon(lex_path):
|
30 |
lexicon = {}
|
31 |
with open(lex_path) as f:
|
|
|
210 |
|
211 |
# Get model
|
212 |
model = get_model(args, configs, device, train=False)
|
213 |
+
|
214 |
# Load vocoder
|
215 |
vocoder = get_vocoder(model_config, device)
|
216 |
+
|
217 |
# Preprocess texts
|
218 |
if args.mode == "batch":
|
219 |
# Get dataset
|
220 |
dataset = TextDataset(args.source, preprocess_config)
|
221 |
batchs = DataLoader(
|
222 |
dataset,
|
223 |
+
batch_size=8,
|
224 |
collate_fn=dataset.collate_fn,
|
225 |
)
|
226 |
if args.mode == "single":
|
227 |
|
228 |
if np.array([args.bert_embed]) == 0:
|
229 |
+
emotions = np.array([args.emotion_id])
|
230 |
+
# print(f'FS2 emotions: {emotions}')
|
231 |
else:
|
232 |
+
emotions = get_roberta_emotion_embeddings(
|
233 |
+
roberta_tokenizer, roberta_model, args.text)
|
234 |
+
emotions = torch.argmax(emotions, dim=1).cpu().numpy()
|
235 |
+
# print(f'RoBERTa emotions {emotions}')
|
236 |
ids = raw_texts = [args.text[:100]]
|
237 |
speakers = np.array([args.speaker_id])
|
238 |
+
|
|
|
239 |
if preprocess_config["preprocessing"]["text"]["language"] == "en":
|
240 |
texts = np.array(
|
241 |
[preprocess_english(args.text, preprocess_config)])
|