Spaces:
Running
Running
Commit
·
f12a8b4
1
Parent(s):
2e84785
added latest version
Browse files- tts_processor.py +23 -2
tts_processor.py
CHANGED
@@ -108,10 +108,30 @@ def replace_abbreviations(string):
|
|
108 |
words[i] = ''.join([alphabet_map.get(char, char) for char in word])
|
109 |
return ' '.join(words)
|
110 |
|
111 |
-
# Clean up whitespace in the text
|
112 |
def clean_whitespace(string):
|
|
|
113 |
string = re.sub(r'\s+([.,?!])', r'\1', string)
|
114 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
115 |
|
116 |
# Main preprocessing pipeline
|
117 |
def preprocess_all(string):
|
@@ -119,6 +139,7 @@ def preprocess_all(string):
|
|
119 |
string = replace_invalid_chars(string)
|
120 |
string = replace_numbers(string)
|
121 |
string = replace_abbreviations(string)
|
|
|
122 |
string = clean_whitespace(string)
|
123 |
return string
|
124 |
|
|
|
108 |
words[i] = ''.join([alphabet_map.get(char, char) for char in word])
|
109 |
return ' '.join(words)
|
110 |
|
|
|
111 |
def clean_whitespace(string):
    """Tidy the spacing of *string* and return the result.

    Whitespace sitting directly in front of ``.``, ``,``, ``?`` or ``!`` is
    dropped, every run of two or more whitespace characters is reduced to a
    single space, and leading/trailing whitespace is trimmed.
    """
    before_punctuation = re.compile(r'\s+([.,?!])')
    whitespace_run = re.compile(r'\s{2,}')

    # Pull punctuation back against the word that precedes it.
    tightened = before_punctuation.sub(lambda found: found.group(1), string)
    # Only runs of 2+ are rewritten; a lone whitespace character is kept as-is.
    collapsed = whitespace_run.sub(' ', tightened)
    return collapsed.strip()
|
117 |
+
|
118 |
+
def make_dots_tts_friendly(text):
    """Spell out dots that should be spoken as the word "dot" or "point".

    The rules run in order, each on the output of the previous one:

    1. IPv4-shaped tokens (four dot-separated groups of 1-3 digits):
       every dot becomes " dot ".
    2. Names ending in one of the listed suffixes (com, net, org, io, gov,
       edu, exe, dll, local): the dot before the suffix becomes " dot ".
    3. Decimal numbers (digits.digits): the dot becomes " point ".
    4. Any remaining dot that is directly followed by a word character:
       " dot " when it sits between two word characters (file.txt),
       "dot " when nothing word-like precedes it (.Net).

    Dots that are not followed by a word character, such as a
    sentence-ending full stop, are left untouched.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string. Spacing is not normalised here.
    """
    # Handle IP addresses (force "dot")
    ipv4_pattern = r'\b\d{1,3}(\.\d{1,3}){3}\b'
    text = re.sub(ipv4_pattern, lambda m: m.group(0).replace('.', ' dot '), text)

    # Handle domain-like endings (force "dot")
    domain_pattern = r'\b([\w-]+)\.(com|net|org|io|gov|edu|exe|dll|local)\b'
    text = re.sub(domain_pattern, lambda m: m.group(0).replace('.', ' dot '), text)

    # Handle decimals (use "point")
    decimal_pattern = r'\b\d+\.\d+\b'
    text = re.sub(decimal_pattern, lambda m: m.group(0).replace('.', ' point '), text)

    # Handle dots embedded between word characters (file.txt -> file dot txt).
    # Without the leading space the spoken word would be glued onto the
    # preceding token ("filedot txt").
    text = re.sub(r'(?<=\w)\.(?=\w)', ' dot ', text)

    # Handle leading dot words (.Net -> dot Net)
    text = re.sub(r'\.(?=\w)', 'dot ', text)

    return text
|
135 |
|
136 |
# Main preprocessing pipeline
|
137 |
def preprocess_all(string):
|
|
|
139 |
string = replace_invalid_chars(string)
|
140 |
string = replace_numbers(string)
|
141 |
string = replace_abbreviations(string)
|
142 |
+
string = make_dots_tts_friendly(string)
|
143 |
string = clean_whitespace(string)
|
144 |
return string
|
145 |
|