Dupaja
/

speecht5_tts

@@ -5,9 +5,35 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5Hif
 from datasets import load_dataset
 import time
 import re
 from typing import Dict, List, Any
-#from tourtise utils
 def split_and_recombine_text(text, desired_length=200, max_length=300):
     """Split text it into chunks of a desired length trying to keep sentences intact."""
     # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
@@ -107,6 +133,8 @@ class EndpointHandler:
         given_text = data.get("inputs", "")
         start_time = time.time()
         texts = split_and_recombine_text(given_text)
         audios = []
@@ -115,7 +143,8 @@ class EndpointHandler:
             inputs = self.processor(text=t, return_tensors="pt")
             speech = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
-            audios.append(speech.numpy())
         final_speech = np.concatenate(audios)

 from datasets import load_dataset
 import time
 import re
+import inflect
 from typing import Dict, List, Any
+def convert_numbers_to_text(input_string):
+    """Spell out the purely numeric tokens of *input_string* as words.
+
+    The text is split on whitespace and re-joined with single spaces, so any
+    run of whitespace in the input collapses to one space.  Each token is
+    handled as follows:
+
+    * A four-digit token below 2000 is read as a year in two halves, e.g.
+      "1984" -> words for 19 + words for 84.  For years 1000-1999 a zero
+      second half is spoken as "hundred" ("1900" -> "nineteen hundred",
+      "1000" -> the plain cardinal) and a single-digit second half gets an
+      "oh" ("1905" -> "nineteen oh five") instead of "zero" / a bare digit.
+    * Any other four-digit token (2000-9999 inclusive) is read as a plain
+      cardinal number.
+    * Any other token made only of digits and commas ("12,345") is read as
+      a cardinal number with the commas ignored.
+    * Everything else (including digits with attached punctuation such as
+      "1999.") is passed through unchanged.
+
+    Commas that inflect inserts between number groups are removed from the
+    spoken form in every branch.
+
+    Args:
+        input_string: The text to normalise.
+
+    Returns:
+        The text with numeric tokens replaced by words.
+    """
+    p = inflect.engine()
+    new_words = []
+    for word in input_string.split():
+        digits = word.replace(',', '')
+        # isdecimal() rather than isdigit(): isdigit() also accepts characters
+        # such as superscript digits, which int() rejects with ValueError.
+        if word.isdecimal() and len(word) == 4:
+            year = int(word)
+            if year < 2000:
+                # Split the year into two parts (e.g. 19 and 84).
+                first_part, second_part = divmod(year, 100)
+                if year >= 1000 and second_part == 0:
+                    if year == 1000:
+                        # "ten hundred" would be wrong; use the cardinal.
+                        word = p.number_to_words(year)
+                    else:
+                        word = p.number_to_words(first_part) + " hundred"
+                elif year >= 1000 and second_part < 10:
+                    word = (p.number_to_words(first_part) + " oh "
+                            + p.number_to_words(second_part))
+                else:
+                    # Leading-zero tokens ("0123") keep the plain two-part form.
+                    word = (p.number_to_words(first_part) + " "
+                            + p.number_to_words(second_part))
+            else:
+                # Year 2000 and beyond, including 9999, which the previous
+                # `year < 9999` test left as raw digits.
+                word = p.number_to_words(year).replace(',', '')
+        elif digits.isdecimal():
+            # Any other digit-only token, with thousands separators ignored.
+            word = p.number_to_words(int(digits)).replace(',', '')
+        new_words.append(word)
+    return ' '.join(new_words)
 def split_and_recombine_text(text, desired_length=200, max_length=300):
     """Split text it into chunks of a desired length trying to keep sentences intact."""
     # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
         given_text = data.get("inputs", "")
         start_time = time.time()
+        given_text = convert_numbers_to_text(given_text)
         texts = split_and_recombine_text(given_text)
         audios = []
             inputs = self.processor(text=t, return_tensors="pt")
             speech = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
+            audios.append(speech)
+            #audios.append(speech.numpy())
         final_speech = np.concatenate(audios)