File size: 8,904 Bytes
1337d7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
import re
from phonemizer import backend
from typing import List


class Tokenizer:
    """
    English text front end: normalizes raw text, phonemizes it with espeak and
    maps the resulting phoneme characters to integer ids.

    Attributes:
        VOCAB (dict): Symbol -> integer index table built by `_get_vocab`.
        phonemizers (dict): Language code ('en-us', 'en-gb') -> espeak backend
            configured to preserve punctuation and emit stress marks.
    """

    def __init__(self):
        self.VOCAB = self._get_vocab()
        self.phonemizers = {
            'en-us': backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True),
            'en-gb': backend.EspeakBackend(language='en-gb', preserve_punctuation=True, with_stress=True),
        }

    @staticmethod
    def _get_vocab():
        """
        Generates a mapping of symbols to integer indices for tokenization.

        The order of the symbols defines the indices, so the strings below must
        not be reordered. A symbol that appears more than once keeps the index
        of its last occurrence.

        Returns:
            dict: A dictionary where keys are symbols and values are integer indices.
        """
        # Define the symbols
        _pad = "$"
        _punctuation = ';:,.!?¡¿—…"«»“” '
        _letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
        _letters_ipa = (
            "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
        )
        symbols = [_pad] + list(_punctuation) + list(_letters) + list(_letters_ipa)

        # Create a dictionary mapping each symbol to its index
        return {symbol: index for index, symbol in enumerate(symbols)}

    @staticmethod
    def split_num(num: re.Match) -> str:
        """
        Rewrites a numeric match as a time or a year so it is read naturally.

        Decimals are returned unchanged, clock times become e.g. "12 30",
        "9 oh 5" or "3 o'clock", and four-digit years from 1100 upwards are
        split into two halves ("1999" -> "19 99", "1905" -> "19 oh 5",
        "1900s" -> "19 hundreds").

        Args:
            num (re.Match): A regex match object representing the numeric string.

        Returns:
            str: A formatted string based on the numeric input.
        """
        token = num.group()

        # Decimals are matched by the same pattern in normalize_text but must not
        # be parsed as years: int("3.14") would raise and "12345.6" would be
        # misread as the year 1234. They are spelled out later by point_num.
        if '.' in token:
            return token

        # Handle time (e.g., "12:30")
        if ':' in token:
            hours, minutes = map(int, token.split(':'))
            if minutes == 0:
                return f"{hours} o'clock"
            if minutes < 10:
                return f'{hours} oh {minutes}'
            return f'{hours} {minutes}'

        # Handle years; small values and "x00y" years (e.g. 2005) are left as plain numbers
        year = int(token[:4])
        if year < 1100 or year % 1000 < 10:
            return token

        left, right = token[:2], int(token[2:4])
        suffix = 's' if token.endswith('s') else ''

        # Format years
        if 100 <= year % 1000 <= 999:
            if right == 0:
                return f'{left} hundred{suffix}'
            if right < 10:
                return f'{left} oh {right}{suffix}'
        return f'{left} {right}{suffix}'

    @staticmethod
    def flip_money(match: re.Match) -> str:
        """
        Converts monetary values to a textual representation.

        The currency symbol is moved behind the amount ("$10" -> "10 dollars",
        "£1.01" -> "1 pound and 1 penny"). Amounts that end in a magnitude word
        ("$1.5 million") only get the plural currency name appended.

        Args:
            match (re.Match): A regex match object representing the monetary value.

        Returns:
            str: A formatted string describing the monetary value.
        """
        money = match.group()
        symbol, amount = money[0], money[1:]
        currency = 'dollar' if symbol == '$' else 'pound'

        # Amounts ending in a word (hundred/thousand/million...) have no
        # cents part to parse, even when they contain a decimal point.
        if money[-1].isalpha():
            return f'{amount} {currency}s'

        # Handle whole amounts (e.g., "$10", "£20")
        if '.' not in money:
            plural = '' if amount == '1' else 's'
            return f'{amount} {currency}{plural}'

        # Handle amounts with decimals (e.g., "$10.50", "£5.25")
        whole, cents = amount.split('.')
        plural = '' if whole == '1' else 's'
        cents = int(cents.ljust(2, '0'))  # "$1.5" means 50 cents, not 5
        if symbol == '$':
            coins = f"cent{'' if cents == 1 else 's'}"
        else:
            coins = 'penny' if cents == 1 else 'pence'
        return f'{whole} {currency}{plural} and {cents} {coins}'

    @staticmethod
    def point_num(match):
        """
        Spells out a decimal number digit by digit after the point.

        Args:
            match (re.Match): A regex match object containing exactly one '.'.

        Returns:
            str: E.g. "3.14" -> "3 point 1 4".
        """
        whole, fractional = match.group().split('.')
        return ' point '.join([whole, ' '.join(fractional)])

    def normalize_text(self, text: str) -> str:
        """
        Normalizes input text by replacing special characters, punctuation, and applying custom transformations.

        Args:
            text (str): Input text to normalize.

        Returns:
            str: Normalized text.
        """
        # Replace specific characters with standardized versions.
        # Order matters: angle quotes are first turned into curly quotes, which the
        # next two entries turn into straight quotes; only then are parentheses
        # mapped to angle quotes so that they survive as « ».
        replacements = {
            chr(8216): "'",  # Left single quotation mark
            chr(8217): "'",  # Right single quotation mark
            '«': chr(8220),  # Left double angle quotation mark to left double quotation mark
            '»': chr(8221),  # Right double angle quotation mark to right double quotation mark
            chr(8220): '"',  # Left double quotation mark
            chr(8221): '"',  # Right double quotation mark
            '(': '«',        # Replace parentheses with angle quotation marks
            ')': '»'
        }
        for old, new in replacements.items():
            text = text.replace(old, new)

        # Replace CJK / full-width punctuation with its ASCII form plus a space.
        # The keys must be the full-width code points: keying on ASCII ',' or ':'
        # would split "1,000" and "12:30" before the number rules below see them.
        punctuation_replacements = {
            '\u3001': ',',  # ideographic comma
            '\u3002': '.',  # ideographic full stop
            '\uff01': '!',  # full-width exclamation mark
            '\uff0c': ',',  # full-width comma
            '\uff1a': ':',  # full-width colon
            '\uff1b': ';',  # full-width semicolon
            '\uff1f': '?',  # full-width question mark
        }
        for old, new in punctuation_replacements.items():
            text = text.replace(old, new + ' ')

        # Whitespace cleanup: every non-newline whitespace becomes a space,
        # runs of spaces collapse, and space-only lines are emptied.
        text = re.sub(r'[^\S\n]', ' ', text)
        text = re.sub(r'  +', ' ', text)
        text = re.sub(r'(?<=\n) +(?=\n)', '', text)

        # Expand abbreviations and handle special cases
        abbreviation_patterns = [
            (r'\bD[Rr]\.(?= [A-Z])', 'Doctor'),
            (r'\b(?:Mr\.|MR\.(?= [A-Z]))', 'Mister'),
            (r'\b(?:Ms\.|MS\.(?= [A-Z]))', 'Miss'),
            (r'\b(?:Mrs\.|MRS\.(?= [A-Z]))', 'Mrs'),
            (r'\betc\.(?! [A-Z])', 'etc'),
            (r'(?i)\b(y)eah?\b', r"\1e'a"),
        ]
        for pattern, replacement in abbreviation_patterns:
            text = re.sub(pattern, replacement, text)

        # Handle numbers and monetary values.
        # Years and times are rewritten first (decimals pass through split_num
        # untouched), then thousands separators are dropped so money amounts are
        # contiguous, then money is flipped, and finally remaining decimals are
        # spelled out.
        text = re.sub(r'\d*\.\d+|\b\d{4}s?\b|(?<!:)\b(?:[1-9]|1[0-2]):[0-5]\d\b(?!:)', self.split_num, text)
        text = re.sub(r'(?<=\d),(?=\d)', '', text)  # Remove commas from numbers
        text = re.sub(
            r'(?i)[$£]\d+(?:\.\d+)?(?: hundred| thousand| (?:[bm]|tr)illion)*\b|[$£]\d+\.\d\d?\b',
            self.flip_money,
            text
        )
        text = re.sub(r'\d*\.\d+', self.point_num, text)
        text = re.sub(r'(?<=\d)-(?=\d)', ' to ', text)  # Numeric ranges: "5-10" -> "5 to 10"

        # Handle possessives and specific letter cases
        text = re.sub(r'(?<=\d)S', ' S', text)
        text = re.sub(r"(?<=[BCDFGHJ-NP-TV-Z])'?s\b", "'S", text)
        text = re.sub(r"(?<=X')S\b", 's', text)

        # Handle abbreviations with dots
        text = re.sub(r'(?:[A-Za-z]\.){2,} [a-z]', lambda m: m.group().replace('.', '-'), text)
        text = re.sub(r'(?i)(?<=[A-Z])\.(?=[A-Z])', '-', text)

        return text.strip()

    def tokenize(self, phonemes: str) -> List[int]:
        """
        Tokenizes a given string into a list of indices based on VOCAB.

        Characters that are not present in VOCAB are silently skipped.

        Args:
            phonemes (str): Input string to tokenize.

        Returns:
            list: A list of integer indices corresponding to the characters in the input string.
        """
        vocab = self.VOCAB
        return [vocab[symbol] for symbol in phonemes if symbol in vocab]

    def phonemize(self, text: str, lang: str = 'en-us', normalize: bool = True) -> str:
        """
        Converts text to phonemes using the specified language phonemizer and applies normalization.

        Args:
            text (str): Input text to be phonemized.
            lang (str): Language identifier ('en-us' or 'en-gb') for selecting the phonemizer.
                Any other value prints a notice and falls back to 'en-us'.
            normalize (bool): Whether to normalize the text before phonemization.

        Returns:
            str: A processed string of phonemes containing only VOCAB symbols.
        """
        # Normalize text if required
        if normalize:
            text = self.normalize_text(text)

        # Generate phonemes using the specified phonemizer
        if lang not in self.phonemizers:
            print(f"Language '{lang}' not supported. Defaulting to 'en-us'.")
            lang = 'en-us'

        phonemes = self.phonemizers[lang].phonemize([text])
        phonemes = phonemes[0] if phonemes else ''

        # Apply custom phoneme replacements
        replacements = {
            'kəkˈoːɹoʊ': 'kˈoʊkəɹoʊ',
            'kəkˈɔːɹəʊ': 'kˈəʊkəɹəʊ',
            'ʲ': 'j',
            'r': 'ɹ',
            'x': 'k',
            'ɬ': 'l',
        }
        for old, new in replacements.items():
            phonemes = phonemes.replace(old, new)

        # Apply regex-based replacements
        # Insert a space before "hundred" when it is glued to the preceding word.
        phonemes = re.sub(r'(?<=[a-zɹː])(?=hˈʌndɹɪd)', ' ', phonemes)
        # Re-attach a detached trailing "z" to the preceding word.
        phonemes = re.sub(r' z(?=[;:,.!?¡¿—…"«»“” ]|$)', 'z', phonemes)

        # Additional language-specific rules.
        # lang is always a key of self.phonemizers here, so the American-English
        # rule must be keyed on 'en-us' to ever apply.
        if lang == 'en-us':
            phonemes = re.sub(r'(?<=nˈaɪn)ti(?!ː)', 'di', phonemes)

        # Filter out characters not in VOCAB
        vocab = self.VOCAB
        phonemes = ''.join(p for p in phonemes if p in vocab)

        return phonemes.strip()