johnbridges committed on
Commit
f12a8b4
·
1 Parent(s): 2e84785

added latest version

Browse files
Files changed (1) hide show
  1. tts_processor.py +23 -2
tts_processor.py CHANGED
@@ -108,10 +108,30 @@ def replace_abbreviations(string):
108
  words[i] = ''.join([alphabet_map.get(char, char) for char in word])
109
  return ' '.join(words)
110
 
111
- # Clean up whitespace in the text
112
  def clean_whitespace(string):
 
113
  string = re.sub(r'\s+([.,?!])', r'\1', string)
114
- return ' '.join(string.split())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
  # Main preprocessing pipeline
117
  def preprocess_all(string):
@@ -119,6 +139,7 @@ def preprocess_all(string):
119
  string = replace_invalid_chars(string)
120
  string = replace_numbers(string)
121
  string = replace_abbreviations(string)
 
122
  string = clean_whitespace(string)
123
  return string
124
 
 
108
  words[i] = ''.join([alphabet_map.get(char, char) for char in word])
109
  return ' '.join(words)
110
 
 
def clean_whitespace(string):
    """Normalize whitespace in *string*.

    Whitespace directly before ``.``, ``,``, ``?`` or ``!`` is removed, every
    remaining run of whitespace (including a lone tab or newline) is replaced
    by a single space, and leading/trailing whitespace is stripped. Characters
    inside a token such as "test.com" are never modified.

    Args:
        string: The text to clean.

    Returns:
        The cleaned text.
    """
    # Remove spaces before punctuation
    string = re.sub(r'\s+([.,?!])', r'\1', string)
    # Collapse every whitespace run into one space. Matching single characters
    # too (\s+ rather than \s{2,}) also normalizes a lone tab or newline; a
    # plain single space is simply replaced by itself.
    string = re.sub(r'\s+', ' ', string)
    return string.strip()
117
+
def make_dots_tts_friendly(text):
    """Spell out dots that should be spoken rather than read as punctuation.

    Rules are applied in this order:

    1. IPv4-looking addresses: every dot becomes " dot ".
    2. Names ending in a known suffix (com, net, org, io, gov, edu, exe,
       dll, local): the dot before the suffix becomes " dot ".
    3. Decimal numbers: the dot becomes " point ".
    4. Any dot still sitting between two word characters becomes " dot "
       (e.g. "file.txt" -> "file dot txt").
    5. A dot that starts a word becomes "dot " (e.g. ".Net" -> "dot Net").

    Sentence-ending dots and ellipses are left untouched.

    Args:
        text: The text to rewrite.

    Returns:
        The text with the matched dots replaced by words.
    """
    # NOTE(review): the digit-based rules (1 and 3) can only fire if digits
    # are still present when this function runs - confirm against the order
    # of the preprocessing pipeline.

    # Handle IP addresses (force "dot")
    ipv4_pattern = r'\b\d{1,3}(\.\d{1,3}){3}\b'
    text = re.sub(ipv4_pattern, lambda m: m.group(0).replace('.', ' dot '), text)

    # Handle domain-like endings (force "dot")
    domain_pattern = r'\b([\w-]+)\.(com|net|org|io|gov|edu|exe|dll|local)\b'
    text = re.sub(domain_pattern, lambda m: m.group(0).replace('.', ' dot '), text)

    # Handle decimals (use "point")
    decimal_pattern = r'\b\d+\.\d+\b'
    text = re.sub(decimal_pattern, lambda m: m.group(0).replace('.', ' point '), text)

    # Handle remaining dots inside a token. The replacement is padded on both
    # sides so the neighbouring words are not fused ("file dot txt", never
    # "filedot txt").
    text = re.sub(r'(?<=\w)\.(?=\w)', ' dot ', text)

    # Handle leading dot words (.Net -> dot Net). The lookbehind rejects a
    # preceding word character (handled above) and a preceding dot, so the
    # tail of an ellipsis such as "Wait...what" is not spoken as "dot".
    text = re.sub(r'(?<![\w.])\.(?=\w)', 'dot ', text)

    return text
135
 
136
  # Main preprocessing pipeline
137
  def preprocess_all(string):
 
139
  string = replace_invalid_chars(string)
140
  string = replace_numbers(string)
141
  string = replace_abbreviations(string)
142
+ string = make_dots_tts_friendly(string)
143
  string = clean_whitespace(string)
144
  return string
145