Spaces:
Running
Running
File size: 8,904 Bytes
1337d7e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 |
import re
from phonemizer import backend
from typing import List
class Tokenizer:
    """
    Text front end: normalizes English text, converts it to phonemes with an
    espeak backend, and maps phoneme characters to integer ids.
    """

    def __init__(self):
        """
        Builds the symbol vocabulary and one espeak phonemizer per supported language.
        """
        self.VOCAB = self._get_vocab()
        self.phonemizers = {
            'en-us': backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
            'en-gb': backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
        }

    @staticmethod
    def _get_vocab():
        """
        Generates a mapping of symbols to integer indices for tokenization.
        Returns:
            dict: A dictionary where keys are symbols and values are integer indices.
        """
        # The order of these strings defines the indices, so they must not be edited or reordered.
        _pad = "$"
        _punctuation = ';:,.!?¡¿—…"«»“” '
        _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
        _letters_ipa = (
            "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
        )
        symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
        # A symbol that occurs more than once keeps the index of its last occurrence.
        return {symbol: index for index, symbol in enumerate(symbols)}

    @staticmethod
    def split_num(num: re.Match) -> str:
        """
        Rewrites a matched number as it should be read aloud (clock times and years).
        Args:
            num (re.Match): A regex match covering a decimal ("3.14"), a clock time
                ("12:30"), or a four-digit number with an optional trailing "s" ("1990s").
        Returns:
            str: The spoken form. Decimals, numbers below 1100, and numbers whose last
                three digits are below 10 (e.g. "2005") are returned unchanged.
        """
        token = num.group()
        # Decimals are matched by the same pattern as years and times but are not
        # handled here; return them untouched so int() below never sees a '.'.
        if '.' in token:
            return token
        # Handle time (e.g., "12:30")
        if ':' in token:
            hours, minutes = map(int, token.split(':'))
            if minutes == 0:
                return f"{hours} o'clock"
            if minutes < 10:
                return f'{hours} oh {minutes}'
            return f'{hours} {minutes}'
        # Handle years or general four-digit numbers
        year = int(token[:4])
        if year < 1100 or year % 1000 < 10:
            return token
        left, right = token[:2], int(token[2:4])
        suffix = 's' if token.endswith('s') else ''
        # "1900" -> "19 hundred", "1905" -> "19 oh 5"
        if 100 <= year % 1000 <= 999:
            if right == 0:
                return f'{left} hundred{suffix}'
            if right < 10:
                return f'{left} oh {right}{suffix}'
        # Everything else is read as two two-digit halves: "2015" -> "20 15"
        return f'{left} {right}{suffix}'

    @staticmethod
    def flip_money(match: re.Match) -> str:
        """
        Converts a monetary value to a textual representation with the currency after the amount.
        Args:
            match (re.Match): A regex match whose text starts with "$" or "£", followed by
                an amount and optionally magnitude words (e.g. "$1.5 million").
        Returns:
            str: A formatted string describing the monetary value.
        """
        m = match.group()
        currency = 'dollar' if m[0] == '$' else 'pound'
        # Amount followed by a magnitude word (e.g. "$1.5 million"): keep the amount
        # as written and append the plural currency; there are no cents to split off.
        if m[-1].isalpha():
            return f'{m[1:]} {currency}s'
        # Handle whole amounts (e.g., "$10", "£20")
        if '.' not in m:
            singular = '' if m[1:] == '1' else 's'
            return f'{m[1:]} {currency}{singular}'
        # Handle amounts with decimals (e.g., "$10.50", "£5.25")
        whole, cents = m[1:].split('.')
        singular = '' if whole == '1' else 's'
        cents = int(cents.ljust(2, '0'))  # "$3.5" means 50 cents, not 5
        coins = f"cent{'' if cents == 1 else 's'}" if m[0] == '$' else ('penny' if cents == 1 else 'pence')
        return f'{whole} {currency}{singular} and {cents} {coins}'

    @staticmethod
    def point_num(match):
        """
        Spells out a decimal number digit by digit after the point.
        Args:
            match (re.Match): A regex match covering text of the form "<digits>.<digits>".
        Returns:
            str: The whole part, " point ", and the fractional digits separated by spaces
                (e.g. "3.14" -> "3 point 1 4").
        """
        whole, fractional = match.group().split('.')
        return ' point '.join([whole, ' '.join(fractional)])

    def normalize_text(self, text: str) -> str:
        """
        Normalizes input text by replacing special characters, punctuation, and applying custom transformations.
        Args:
            text (str): Input text to normalize.
        Returns:
            str: Normalized text.
        """
        # Replace specific characters with standardized versions.
        # Order matters: angle quotes are first turned into curly double quotes, which
        # are then turned into straight quotes, before parentheses become angle quotes.
        replacements = {
            chr(8216): "'",  # Left single quotation mark
            chr(8217): "'",  # Right single quotation mark
            '«': chr(8220),  # Left double angle quotation mark to left double quotation mark
            '»': chr(8221),  # Right double angle quotation mark to right double quotation mark
            chr(8220): '"',  # Left double quotation mark
            chr(8221): '"',  # Right double quotation mark
            '(': '«',  # Replace parentheses with angle quotation marks
            ')': '»',
        }
        for old, new in replacements.items():
            text = text.replace(old, new)
        # Replace CJK / full-width punctuation with the ASCII mark plus a space.
        # Keys are written as escapes so they cannot be confused with their ASCII
        # look-alikes: mapping ASCII ',' or ':' here would split "1,000" and "12:30"
        # before the number rules below get to see them.
        punctuation_replacements = {
            '\u3001': ',',  # ideographic comma
            '\u3002': '.',  # ideographic full stop
            '\uff01': '!',  # full-width exclamation mark
            '\uff0c': ',',  # full-width comma
            '\uff1a': ':',  # full-width colon
            '\uff1b': ';',  # full-width semicolon
            '\uff1f': '?',  # full-width question mark
        }
        for old, new in punctuation_replacements.items():
            text = text.replace(old, new + ' ')
        # Turn every whitespace character except newline into a space, collapse runs
        # of spaces, and drop space-only content between two newlines
        text = re.sub(r'[^\S\n]', ' ', text)
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'(?<=\n) +(?=\n)', '', text)
        # Expand abbreviations and handle special cases
        abbreviation_patterns = [
            (r'\bD[Rr]\.(?= [A-Z])', 'Doctor'),
            (r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister'),
            (r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss'),
            (r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs'),
            (r'\betc\.(?! [A-Z])', 'etc'),
            (r'(?i)\b(y)eah?\b', r"\1e'a"),
        ]
        for pattern, replacement in abbreviation_patterns:
            text = re.sub(pattern, replacement, text)
        # Handle numbers and monetary values.
        # Years and times are rewritten before thousands separators are removed, so
        # "1,000" is not mistaken for a year.
        text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', self.split_num, text)
        text = re.sub(r'(?<=\d),(?=\d)', '', text)  # Remove commas from numbers
        text = re.sub(
            r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b',
            self.flip_money,
            text
        )
        text = re.sub(r'\d*\.\d+', self.point_num, text)
        text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)
        # Handle possessives and specific letter cases
        text = re.sub(r'(?<=\d)S', ' S', text)
        text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
        text = re.sub(r"(?<=X')S\b", 's', text)
        # Handle abbreviations with dots
        text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
        text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
        return text.strip()

    def tokenize(self, phonemes: str) -> List[int]:
        """
        Tokenizes a given string into a list of indices based on VOCAB.
        Args:
            phonemes (str): Input string to tokenize, one symbol per character.
        Returns:
            list: The VOCAB index of each character, in order; characters that are
                not in VOCAB are skipped.
        """
        return [self.VOCAB[x] for x in phonemes if x in self.VOCAB]

    def phonemize(self, text: str, lang: str = 'en-us', normalize: bool = True) -> str:
        """
        Converts text to phonemes using the specified language phonemizer and applies normalization.
        Args:
            text (str): Input text to be phonemized.
            lang (str): Language identifier ('en-us' or 'en-gb') for selecting the phonemizer.
                Any other value prints a notice and falls back to 'en-us'.
            normalize (bool): Whether to normalize the text before phonemization.
        Returns:
            str: A processed string of phonemes containing only VOCAB characters.
        """
        # Normalize text if required
        if normalize:
            text = self.normalize_text(text)
        # Generate phonemes using the specified phonemizer
        if lang not in self.phonemizers:
            print(f"Language '{lang}' not supported. Defaulting to 'en-us'.")
            lang = 'en-us'
        results = self.phonemizers[lang].phonemize([text])
        phonemes = results[0] if results else ''
        # Apply custom phoneme replacements (multi-character fixes first, then single symbols)
        replacements = {
            'kəkˈoːɹoʊ': 'kˈoʊkəɹoʊ',
            'kəkˈɔːɹəʊ': 'kˈəʊkəɹəʊ',
            'ʲ': 'j',
            'r': 'ɹ',
            'x': 'k',
            'ɬ': 'l',
        }
        for old, new in replacements.items():
            phonemes = phonemes.replace(old, new)
        # Apply regex-based replacements: insert a space before "hundred" when it is
        # glued to the preceding word, and attach a detached " z" to the word before it
        phonemes = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', phonemes)
        phonemes = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', phonemes)
        # Additional language-specific rules.
        # The "ninety" t -> d rule is keyed on 'en-us' because that is the identifier
        # this class uses; a check against any other code could never match.
        if lang == 'en-us':
            phonemes = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', phonemes)
        # Filter out characters not in VOCAB
        phonemes = ''.join(p for p in phonemes if p in self.VOCAB)
        return phonemes.strip()
|