Added function to convert digits to written numbers, for handling with SpeechT5
Browse files- handler.py +31 -2
handler.py
CHANGED
@@ -5,9 +5,35 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5Hif
|
|
5 |
from datasets import load_dataset
|
6 |
import time
|
7 |
import re
|
|
|
8 |
from typing import Dict, List, Any
|
9 |
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
def split_and_recombine_text(text, desired_length=200, max_length=300):
|
12 |
"""Split text it into chunks of a desired length trying to keep sentences intact."""
|
13 |
# normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
|
@@ -107,6 +133,8 @@ class EndpointHandler:
|
|
107 |
given_text = data.get("inputs", "")
|
108 |
|
109 |
start_time = time.time()
|
|
|
|
|
110 |
|
111 |
texts = split_and_recombine_text(given_text)
|
112 |
audios = []
|
@@ -115,7 +143,8 @@ class EndpointHandler:
|
|
115 |
inputs = self.processor(text=t, return_tensors="pt")
|
116 |
speech = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
|
117 |
|
118 |
-
audios.append(speech
|
|
|
119 |
|
120 |
|
121 |
final_speech = np.concatenate(audios)
|
|
|
5 |
from datasets import load_dataset
|
6 |
import time
|
7 |
import re
|
8 |
+
import inflect
|
9 |
from typing import Dict, List, Any
|
10 |
|
11 |
+
def convert_numbers_to_text(input_string):
    """Spell out integer tokens of ``input_string`` as English words.

    The text is split on whitespace and each token is inspected on its own.
    A token is converted when, after peeling off surrounding sentence
    punctuation (for example ``"(1999),"`` or ``"2024."``), what remains is
    made only of decimal digits and thousands-separator commas. The peeled
    punctuation is put back around the spelled-out number. Tokens that are
    not plain integers (``"3.14"``, ``"10%"``, ``"1st"`` ...) are passed
    through unchanged.

    Four-digit tokens strictly between 1000 and 2000 are read as years, in
    two halves: ``1984`` -> "nineteen eighty-four", ``1905`` -> "nineteen oh
    five", ``1900`` -> "nineteen hundred". Every other integer is handed to
    ``inflect``'s ``number_to_words`` as a whole, with inflect's grouping
    commas removed.

    Args:
        input_string: Text that may contain digits.

    Returns:
        The text with integer tokens replaced by words. Tokens are re-joined
        with single spaces, so runs of whitespace in the input are collapsed.
    """
    engine = inflect.engine()

    # Punctuation that commonly hugs a number in running text. Kept
    # deliberately small so tokens such as "$5" or "10%" are left alone.
    openers = "([{\"'"
    closers = ".,;:!?)]}\"'"

    def spell(number):
        # Commas in the spoken form would reach the TTS model as extra
        # pauses, so strip the ones inflect inserts between groups.
        return engine.number_to_words(number).replace(',', '')

    converted = []
    for token in input_string.split():
        # Separate the token into <prefix><core><suffix>.
        without_prefix = token.lstrip(openers)
        core = without_prefix.rstrip(closers)
        prefix = token[:len(token) - len(without_prefix)]
        suffix = without_prefix[len(core):]

        digits = core.replace(',', '')
        # isdecimal() rather than isdigit(): isdigit() also accepts
        # characters such as superscripts that int() refuses to parse.
        if not digits.isdecimal():
            converted.append(token)
            continue

        number = int(digits)
        looks_like_year = core == digits and len(core) == 4
        if looks_like_year and 1000 < number < 2000:
            century, remainder = divmod(number, 100)
            if remainder == 0:
                spoken = spell(century) + " hundred"
            elif remainder < 10:
                spoken = spell(century) + " oh " + spell(remainder)
            else:
                spoken = spell(century) + " " + spell(remainder)
        else:
            # Covers 2000 and later (including 9999), exactly 1000,
            # zero-padded values, and every non-four-digit integer.
            spoken = spell(number)

        converted.append(prefix + spoken + suffix)

    return ' '.join(converted)
|
36 |
+
|
37 |
def split_and_recombine_text(text, desired_length=200, max_length=300):
|
38 |
"""Split text it into chunks of a desired length trying to keep sentences intact."""
|
39 |
# normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
|
|
|
133 |
given_text = data.get("inputs", "")
|
134 |
|
135 |
start_time = time.time()
|
136 |
+
|
137 |
+
given_text = convert_numbers_to_text(given_text)
|
138 |
|
139 |
texts = split_and_recombine_text(given_text)
|
140 |
audios = []
|
|
|
143 |
inputs = self.processor(text=t, return_tensors="pt")
|
144 |
speech = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
|
145 |
|
146 |
+
audios.append(speech)
|
147 |
+
#audios.append(speech.numpy())
|
148 |
|
149 |
|
150 |
final_speech = np.concatenate(audios)
|