Spaces:

Ionut-Bostan
/

Emotion_Aware_TTS

Running

App Files Files Community

Ionut-Bostan committed on May 4, 2023

Commit

6a6dfa7

1 Parent(s): be197a9

modified app.py and changed the path for the text classifier

Browse files

Files changed (2) hide show

app.py +1 -1
synthesize.py +11 -17

app.py CHANGED Viewed

@@ -20,7 +20,7 @@ def synthesize_speech(input_type, text, own_text, speaker_id, embed_type, emotio
     if embed_type == "bert_embed":
         command = f"python3 synthesize.py --text '{selected_text}' --bert_embed 1 --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
     else:
-        command = f"python3 synthesize.py --text '{selected_text}' --emotion_id {emotion_id} --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
     output = subprocess.check_output(command, shell=True)
     audio_file = f'output/result/EmoV_DB/{selected_text}.wav'

     if embed_type == "bert_embed":
         command = f"python3 synthesize.py --text '{selected_text}' --bert_embed 1 --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
     else:
+        command = f"python3 synthesize.py --text '{selected_text}' --emotion_id {emotion_mapping[emotion_id]} --speaker_id {speaker_id} --restore_step 900000 --mode single -p config/EmoV_DB/preprocess.yaml -m config/EmoV_DB/model.yaml -t config/EmoV_DB/train.yaml"
     output = subprocess.check_output(command, shell=True)
     audio_file = f'output/result/EmoV_DB/{selected_text}.wav'

synthesize.py CHANGED Viewed

@@ -5,9 +5,6 @@ from string import punctuation
 import torch
 import yaml
 import numpy as np
 from torch.utils.data import DataLoader
@@ -21,7 +18,7 @@ from text import text_to_sequence
 from transformers import RobertaTokenizerFast, AutoModel, AutoModelForSequenceClassification
-ro_model = "/content/FastSpeech2_Text_Aware_Emotion_TTS/roberta_pretrained"
 roberta_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
 roberta_model = AutoModelForSequenceClassification.from_pretrained(ro_model)
@@ -29,9 +26,6 @@ roberta_model = AutoModelForSequenceClassification.from_pretrained(ro_model)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def read_lexicon(lex_path):
     lexicon = {}
     with open(lex_path) as f:
@@ -216,32 +210,32 @@ if __name__ == "__main__":
     # Get model
     model = get_model(args, configs, device, train=False)
     # Load vocoder
     vocoder = get_vocoder(model_config, device)
     # Preprocess texts
     if args.mode == "batch":
         # Get dataset
         dataset = TextDataset(args.source, preprocess_config)
         batchs = DataLoader(
             dataset,
-            batch_size=8,
             collate_fn=dataset.collate_fn,
         )
     if args.mode == "single":
         if np.array([args.bert_embed]) == 0:
-          emotions = np.array([args.emotion_id])
-          # print(f'FS2 emotions: {emotions}')
         else:
-          emotions = get_roberta_emotion_embeddings(roberta_tokenizer, roberta_model, args.text)
-          emotions = torch.argmax(emotions, dim=1).cpu().numpy()
-          # print(f'RoBERTa emotions {emotions}')
         ids = raw_texts = [args.text[:100]]
         speakers = np.array([args.speaker_id])
         if preprocess_config["preprocessing"]["text"]["language"] == "en":
             texts = np.array(
                 [preprocess_english(args.text, preprocess_config)])

 import torch
 import yaml
 import numpy as np
 from torch.utils.data import DataLoader
 from transformers import RobertaTokenizerFast, AutoModel, AutoModelForSequenceClassification
+ro_model = "/roberta_pretrained"
 roberta_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
 roberta_model = AutoModelForSequenceClassification.from_pretrained(ro_model)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 def read_lexicon(lex_path):
     lexicon = {}
     with open(lex_path) as f:
     # Get model
     model = get_model(args, configs, device, train=False)
     # Load vocoder
     vocoder = get_vocoder(model_config, device)
     # Preprocess texts
     if args.mode == "batch":
         # Get dataset
         dataset = TextDataset(args.source, preprocess_config)
         batchs = DataLoader(
             dataset,
+            batch_size=8,
             collate_fn=dataset.collate_fn,
         )
     if args.mode == "single":
         if np.array([args.bert_embed]) == 0:
+            emotions = np.array([args.emotion_id])
+            # print(f'FS2 emotions: {emotions}')
         else:
+            emotions = get_roberta_emotion_embeddings(
+                roberta_tokenizer, roberta_model, args.text)
+            emotions = torch.argmax(emotions, dim=1).cpu().numpy()
+            # print(f'RoBERTa emotions {emotions}')
         ids = raw_texts = [args.text[:100]]
         speakers = np.array([args.speaker_id])
         if preprocess_config["preprocessing"]["text"]["language"] == "en":
             texts = np.array(
                 [preprocess_english(args.text, preprocess_config)])