Dupaja committed on
Commit c5c6476 · 1 Parent(s): fc3317e

Rewrite to handle larger text

Files changed (1)
  1. handler.py +81 -11
handler.py CHANGED
@@ -4,9 +4,80 @@ import torch
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from datasets import load_dataset
 import time
+import re
 from typing import Dict, List, Any
 
-
+# from tortoise utils
+def split_and_recombine_text(text, desired_length=200, max_length=300):
+    """Split text into chunks of a desired length, trying to keep sentences intact."""
+    # normalize text, remove redundant whitespace and convert non-ascii quotes to ascii
+    text = re.sub(r'\n\n+', '\n', text)
+    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r'[“”]', '"', text)
+
+    rv = []
+    in_quote = False
+    current = ""
+    split_pos = []
+    pos = -1
+    end_pos = len(text) - 1
+
+    def seek(delta):
+        nonlocal pos, in_quote, current
+        is_neg = delta < 0
+        for _ in range(abs(delta)):
+            if is_neg:
+                pos -= 1
+                current = current[:-1]
+            else:
+                pos += 1
+                current += text[pos]
+            if text[pos] == '"':
+                in_quote = not in_quote
+        return text[pos]
+
+    def peek(delta):
+        p = pos + delta
+        return text[p] if p < end_pos and p >= 0 else ""
+
+    def commit():
+        nonlocal rv, current, split_pos
+        rv.append(current)
+        current = ""
+        split_pos = []
+
+    while pos < end_pos:
+        c = seek(1)
+        # do we need to force a split?
+        if len(current) >= max_length:
+            if len(split_pos) > 0 and len(current) > (desired_length / 2):
+                # we have at least one sentence and we are over half the desired length, seek back to the last split
+                d = pos - split_pos[-1]
+                seek(-d)
+            else:
+                # no full sentences, seek back until we are not in the middle of a word and split there
+                while c not in '!?.\n ' and pos > 0 and len(current) > desired_length:
+                    c = seek(-1)
+            commit()
+        # check for sentence boundaries
+        elif not in_quote and (c in '!?\n' or (c == '.' and peek(1) in '\n ')):
+            # seek forward if we have consecutive boundary markers but still within the max length
+            while pos < len(text) - 1 and len(current) < max_length and peek(1) in '!?.':
+                c = seek(1)
+            split_pos.append(pos)
+            if len(current) >= desired_length:
+                commit()
+        # treat end of quote as a boundary if it's followed by a space or newline
+        elif in_quote and peek(1) == '"' and peek(2) in '\n ':
+            seek(2)
+            split_pos.append(pos)
+    rv.append(current)
+
+    # clean up, remove lines with only whitespace or punctuation
+    rv = [s.strip() for s in rv]
+    rv = [s for s in rv if len(s) > 0 and not re.match(r'^[\s\.,;:!?]*$', s)]
+
+    return rv
 
 class EndpointHandler:
     def __init__(self, path=""):
 
@@ -37,27 +108,26 @@ class EndpointHandler:
 
         start_time = time.time()
 
-        inputs = self.processor(text=given_text, return_tensors="pt")
-
-        run_time_processor = time.time() - start_time
-
-        start_time_speech = time.time()
-
-        speech = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
-
-        run_time_speech = time.time() - start_time_speech
+        texts = split_and_recombine_text(given_text)  # chunk long input so each piece fits the model
+        audios = []
+
+        for t in texts:
+            inputs = self.processor(text=t, return_tensors="pt")
+            speech = self.model.generate_speech(inputs["input_ids"], self.speaker_embeddings, vocoder=self.vocoder)
+
+            audios.append(speech.numpy())
+
+        final_speech = np.concatenate(audios)
 
         run_time_total = time.time() - start_time
-
 
         # Return the expected response format
         return {
             "statusCode": 200,
             "body": {
-                "audio": speech.numpy(), # Consider encoding this to a suitable format
+                "audio": final_speech, # Consider encoding this to a suitable format
                 "sampling_rate": 16000,
-                "run_time_processor": str(run_time_processor),
-                "run_time_speech": str(run_time_speech),
                 "run_time_total": str(run_time_total),
             }
         }
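
A quick way to see how the new helper behaves is the sketch below. It is illustrative only: it assumes handler.py from this commit is importable (which pulls in torch, transformers and datasets at module level), and the sample passage is made up.

from handler import split_and_recombine_text

# illustrative long input; any long string works
long_text = "SpeechT5 is asked to read a fairly long passage about text to speech. " * 20

chunks = split_and_recombine_text(long_text, desired_length=200, max_length=300)

# each chunk should land near desired_length and end on a sentence boundary where possible
for i, chunk in enumerate(chunks):
    print(i, len(chunk), chunk[:40] + "...")

Each chunk can then be passed to the processor and generate_speech on its own, and the per-chunk waveforms concatenated into a single 16 kHz clip, which is what the rewritten handler returns.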