Spaces:
Running
Running
File size: 8,904 Bytes
1337d7e |
|
import re
from phonemizer import backend
from typing import List
class Tokenizer:
    """Normalize English text, phonemize it with eSpeak and map phonemes to ids.

    Attributes:
        VOCAB: Mapping from each known symbol (pad, punctuation, ASCII letters,
            IPA characters) to its integer index.
        phonemizers: Mapping from language code ('en-us', 'en-gb') to the
            eSpeak backend used for that language.
    """

    def __init__(self):
        self.VOCAB = self._get_vocab()
        self.phonemizers = {
            'en-us': backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
            'en-gb': backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
        }

    @staticmethod
    def _get_vocab():
        """
        Generates a mapping of symbols to integer indices for tokenization.

        Returns:
            dict: A dictionary where keys are symbols and values are integer indices.
                A symbol that occurs more than once in the symbol list keeps the
                index of its last occurrence.
        """
        # Define the symbols
        _pad = "$"
        _punctuation = ';:,.!?¡¿—…"«»“” '
        _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
        _letters_ipa = (
            "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
        )
        symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)
        # Create a dictionary mapping each symbol to its index
        return {symbol: index for index, symbol in enumerate(symbols)}

    @staticmethod
    def split_num(num: re.Match) -> str:
        """
        Rewrites a matched clock time or four-digit year so it is read naturally.

        Args:
            num (re.Match): A regex match covering a decimal ("3.14"), a clock
                time ("12:30") or a four-digit number with optional "s" ("1990s").

        Returns:
            str: The spoken-style rewrite, or the matched text unchanged when no
                rewrite applies (decimals, values below 1100, years like 2005).
        """
        text = num.group()
        # Decimals are matched by the caller's pattern but are expanded later by
        # point_num; they must pass through untouched (int() would raise on them).
        if '.' in text:
            return text
        # Handle time (e.g., "12:30")
        if ':' in text:
            hours, minutes = map(int, text.split(':'))
            if minutes == 0:
                return f"{hours} o'clock"
            if minutes < 10:
                return f'{hours} oh {minutes}'
            return f'{hours} {minutes}'
        # Handle years or general numeric cases
        year = int(text[:4])
        if year < 1100 or year % 1000 < 10:
            return text
        left, right = text[:2], int(text[2:4])
        suffix = 's' if text.endswith('s') else ''
        # Years whose last three digits are >= 100 get "hundred" / "oh" readings
        if 100 <= year % 1000 <= 999:
            if right == 0:
                return f'{left} hundred{suffix}'
            if right < 10:
                return f'{left} oh {right}{suffix}'
        return f'{left} {right}{suffix}'

    @staticmethod
    def flip_money(match: re.Match) -> str:
        """
        Converts monetary values to a textual representation.

        Args:
            match (re.Match): A regex match whose text starts with '$' or '£',
                followed by an amount and optionally a scale word ("million").

        Returns:
            str: The amount followed by the currency name, e.g. "10 dollars and
                50 cents" for "$10.50".
        """
        m = match.group()
        currency = 'dollar' if m[0] == '$' else 'pound'
        # Amounts ending in a scale word (e.g., "$1.5 million") keep the number
        # as written; splitting on '.' would try to parse "5 million" as cents.
        if m[-1].isalpha():
            return f'{m[1:]} {currency}s'
        # Handle whole amounts (e.g., "$10", "£20")
        if '.' not in m:
            plural = '' if m[1:] == '1' else 's'
            return f'{m[1:]} {currency}{plural}'
        # Handle amounts with decimals (e.g., "$10.50", "£5.25")
        whole, cents = m[1:].split('.')
        plural = '' if whole == '1' else 's'
        cents = int(cents.ljust(2, '0'))  # "$1.5" means 50 cents, not 5
        if m[0] == '$':
            coins = f"cent{'' if cents == 1 else 's'}"
        else:
            coins = 'penny' if cents == 1 else 'pence'
        return f'{whole} {currency}{plural} and {cents} {coins}'

    @staticmethod
    def point_num(match):
        """
        Spells out a decimal number digit by digit after the point.

        Args:
            match (re.Match): A regex match covering a decimal such as "3.14".

        Returns:
            str: The integer part, " point ", then the fractional digits
                separated by spaces (e.g. "3 point 1 4").
        """
        whole, fractional = match.group().split('.')
        return ' point '.join([whole, ' '.join(fractional)])

    def normalize_text(self, text: str) -> str:
        """
        Normalizes input text by replacing special characters, punctuation, and applying custom transformations.

        Args:
            text (str): Input text to normalize.

        Returns:
            str: Normalized text with surrounding whitespace stripped.
        """
        # Replace specific characters with standardized versions.
        # Order matters: angle quotes are first turned into curly quotes, which
        # the following entries then turn into straight quotes, before the
        # parentheses are rewritten as (new) angle quotes.
        replacements = (
            (chr(8216), "'"),       # Left single quotation mark
            (chr(8217), "'"),       # Right single quotation mark
            ('«', chr(8220)),       # Left double angle quotation mark to left double quotation mark
            ('»', chr(8221)),       # Right double angle quotation mark to right double quotation mark
            (chr(8220), '"'),       # Left double quotation mark
            (chr(8221), '"'),       # Right double quotation mark
            ('(', '«'),             # Replace parentheses with angle quotation marks
            (')', '»'),
        )
        for old, new in replacements:
            text = text.replace(old, new)

        # Map CJK / fullwidth punctuation to ASCII followed by a space.
        # The keys must be the fullwidth forms: keying on ASCII ':' or ','
        # would insert a space into "12:30" and "1,000" and disable the time
        # and thousands-separator handling below.
        punctuation_replacements = (
            ('\u3001', ','),   # ideographic comma
            ('\u3002', '.'),   # ideographic full stop
            ('\uff01', '!'),   # fullwidth exclamation mark
            ('\uff0c', ','),   # fullwidth comma
            ('\uff1a', ':'),   # fullwidth colon
            ('\uff1b', ';'),   # fullwidth semicolon
            ('\uff1f', '?'),   # fullwidth question mark
        )
        for old, new in punctuation_replacements:
            text = text.replace(old, new + ' ')

        # Collapse whitespace: non-newline whitespace becomes a single space and
        # space-only lines between newlines are emptied.
        text = re.sub(r'[^\S\n]', ' ', text)
        text = re.sub(r' +', ' ', text)
        text = re.sub(r'(?<=\n) +(?=\n)', '', text)

        # Expand abbreviations and handle special cases
        abbreviation_patterns = [
            (r'\bD[Rr]\.(?= [A-Z])', 'Doctor'),
            (r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister'),
            (r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss'),
            (r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs'),
            (r'\betc\.(?! [A-Z])', 'etc'),
            (r'(?i)\b(y)eah?\b', r"\1e'a"),
        ]
        for pattern, replacement in abbreviation_patterns:
            text = re.sub(pattern, replacement, text)

        # Handle numbers and monetary values
        text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', self.split_num, text)
        text = re.sub(r'(?<=\d),(?=\d)', '', text)  # Remove commas from numbers
        text = re.sub(
            r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b',
            self.flip_money,
            text,
        )
        text = re.sub(r'\d*\.\d+', self.point_num, text)
        text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)  # numeric ranges: "5-10" -> "5 to 10"

        # Handle possessives and specific letter cases
        text = re.sub(r'(?<=\d)S', ' S', text)
        text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
        text = re.sub(r"(?<=X')S\b", 's', text)

        # Handle abbreviations with dots
        text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
        text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)
        return text.strip()

    def tokenize(self, phonemes: str) -> List[int]:
        """
        Tokenizes a given string into a list of indices based on VOCAB.

        Args:
            phonemes (str): Input string to tokenize, one symbol per character.

        Returns:
            list: The VOCAB index of every character found in VOCAB, in order.
                Characters missing from VOCAB are skipped.
        """
        vocab = self.VOCAB
        return [vocab[symbol] for symbol in phonemes if symbol in vocab]

    def phonemize(self, text: str, lang: str = 'en-us', normalize: bool = True) -> str:
        """
        Converts text to phonemes using the specified language phonemizer and applies normalization.

        Args:
            text (str): Input text to be phonemized.
            lang (str): Language identifier ('en-us' or 'en-gb') for selecting the phonemizer.
                Any other value prints a notice and falls back to 'en-us'.
            normalize (bool): Whether to normalize the text before phonemization.

        Returns:
            str: The phoneme string, restricted to characters present in VOCAB
                and stripped of surrounding whitespace.
        """
        # Normalize text if required
        if normalize:
            text = self.normalize_text(text)

        # Generate phonemes using the specified phonemizer
        if lang not in self.phonemizers:
            print(f"Language '{lang}' not supported. Defaulting to 'en-us'.")
            lang = 'en-us'
        phonemes = self.phonemizers[lang].phonemize([text])
        phonemes = phonemes[0] if phonemes else ''

        # Apply custom phoneme replacements
        replacements = {
            'kəkˈoːɹoʊ': 'kˈoʊkəɹoʊ',
            'kəkˈɔːɹəʊ': 'kˈəʊkəɹəʊ',
            'ʲ': 'j',
            'r': 'ɹ',
            'x': 'k',
            'ɬ': 'l',
        }
        for old, new in replacements.items():
            phonemes = phonemes.replace(old, new)

        # Apply regex-based replacements
        phonemes = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', phonemes)
        phonemes = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', phonemes)

        # Additional language-specific rules.
        # The American rule must test the key actually used by self.phonemizers;
        # a comparison against 'a' can never be true at this point because
        # unsupported codes were replaced by 'en-us' above.
        if lang == 'en-us':
            phonemes = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', phonemes)

        # Filter out characters not in VOCAB
        phonemes = ''.join(symbol for symbol in phonemes if symbol in self.VOCAB)
        return phonemes.strip()
|