AlexK-PL committed on
Commit
dde58d5
·
1 Parent(s): cf8ab7d

Upload text processing files

Browse files
text/LICENSE ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) 2017 Keith Ito
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in
11
+ all copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19
+ THE SOFTWARE.
text/__init__.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+ import re
3
+ from text import cleaners
4
+ from text.symbols import symbols
5
+
6
+
7
# Lookup tables between symbols and their numeric IDs. An ID is the symbol's
# position in `symbols`.
_id_to_symbol = dict(enumerate(symbols))
_symbol_to_id = {sym: idx for idx, sym in _id_to_symbol.items()}

# Splits text into (prefix, contents of the first {...} group, remainder):
_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
13
+
14
+
15
def text_to_sequence(text, cleaner_names):
    '''Encodes text as a list of symbol IDs, terminated by the EOS ID.

    Text outside curly braces is run through the named cleaners and encoded symbol
    by symbol. Text inside curly braces is treated as whitespace-separated ARPAbet,
    for example "Turn left on {HH AW1 S S T AH0 N} Street."

    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through

    Returns:
      List of integer IDs for the symbols in the text, followed by the ID of '~'
    '''
    ids = []
    remaining = text

    # Peel off one "{...}" group per iteration until none is left.
    while len(remaining):
        found = _curly_re.match(remaining)
        if found is None:
            # No braces left: everything that remains is ordinary text.
            ids.extend(_symbols_to_sequence(_clean_text(remaining, cleaner_names)))
            break
        plain, arpabet, remaining = found.groups()
        ids.extend(_symbols_to_sequence(_clean_text(plain, cleaner_names)))
        ids.extend(_arpabet_to_sequence(arpabet))

    # Append EOS token
    ids.append(_symbol_to_id['~'])
    return ids
43
+
44
+
45
def sequence_to_text(sequence):
    '''Decodes a sequence of symbol IDs back into a string.

    IDs that are not in the symbol table are skipped. Symbols longer than one
    character that start with "@" (ARPAbet) are written inside curly braces, and
    the braces of adjacent ARPAbet symbols are merged into one space-separated group.
    '''
    pieces = []
    for symbol_id in sequence:
        if symbol_id not in _id_to_symbol:
            continue
        symbol = _id_to_symbol[symbol_id]
        if len(symbol) > 1 and symbol[0] == '@':
            # Drop the "@" marker and wrap the ARPAbet symbol in braces.
            symbol = '{%s}' % symbol[1:]
        pieces.append(symbol)
    # "}{" marks two braced symbols in a row; join them into a single group.
    return ''.join(pieces).replace('}{', ' ')
56
+
57
+
58
def _clean_text(text, cleaner_names):
    '''Runs text through each named cleaner from the cleaners module, in order.

    Args:
      text: string to clean
      cleaner_names: iterable of attribute names to look up on the cleaners module

    Returns:
      The text after every cleaner has been applied.

    Raises:
      Exception: if a name does not refer to an attribute of the cleaners module.
    '''
    for name in cleaner_names:
        # Default to None so an unknown name reaches the explicit error below
        # instead of surfacing as a bare AttributeError from getattr.
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
65
+
66
+
67
def _symbols_to_sequence(symbols):
    '''Maps each symbol to its ID, skipping those rejected by _should_keep_symbol.'''
    kept = filter(_should_keep_symbol, symbols)
    return [_symbol_to_id[symbol] for symbol in kept]
69
+
70
+
71
def _arpabet_to_sequence(text):
    '''Encodes whitespace-separated ARPAbet symbols, prefixing each with "@".'''
    prefixed = []
    for phoneme in text.split():
        prefixed.append('@' + phoneme)
    return _symbols_to_sequence(prefixed)
73
+
74
+
75
def _should_keep_symbol(s):
    '''Returns True if s is a known symbol other than '_' or '~'.'''
    # Compare by value: identity checks against string literals depend on
    # interning and emit a SyntaxWarning on Python 3.8+.
    return s in _symbol_to_id and s != '_' and s != '~'
text/__init__.pyc ADDED
Binary file (3.28 kB). View file
 
text/__pycache__/__init__.cpython-35.pyc ADDED
Binary file (2.92 kB). View file
 
text/__pycache__/__init__.cpython-36.pyc ADDED
Binary file (2.72 kB). View file
 
text/__pycache__/__init__.cpython-37.pyc ADDED
Binary file (2.72 kB). View file
 
text/__pycache__/cleaners.cpython-35.pyc ADDED
Binary file (2.57 kB). View file
 
text/__pycache__/cleaners.cpython-36.pyc ADDED
Binary file (2.48 kB). View file
 
text/__pycache__/cleaners.cpython-37.pyc ADDED
Binary file (2.18 kB). View file
 
text/__pycache__/cmudict.cpython-35.pyc ADDED
Binary file (2.73 kB). View file
 
text/__pycache__/cmudict.cpython-36.pyc ADDED
Binary file (2.49 kB). View file
 
text/__pycache__/cmudict.cpython-37.pyc ADDED
Binary file (2.49 kB). View file
 
text/__pycache__/numbers.cpython-35.pyc ADDED
Binary file (2.48 kB). View file
 
text/__pycache__/numbers.cpython-36.pyc ADDED
Binary file (2.23 kB). View file
 
text/__pycache__/numbers.cpython-37.pyc ADDED
Binary file (2.23 kB). View file
 
text/__pycache__/symbols.cpython-35.pyc ADDED
Binary file (571 Bytes). View file
 
text/__pycache__/symbols.cpython-36.pyc ADDED
Binary file (552 Bytes). View file
 
text/__pycache__/symbols.cpython-37.pyc ADDED
Binary file (552 Bytes). View file
 
text/cleaners.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ '''
4
+ Cleaners are transformations that run over the input text at both training and eval time.
5
+
6
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8
+ 1. "english_cleaners" for English text
9
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12
+ the symbols in symbols.py to match your data).
13
+ '''
14
+
15
+ import re
16
+ from unidecode import unidecode
17
+ from .numbers import normalize_numbers
18
+
19
+
20
# Matches one or more consecutive whitespace characters:
_whitespace_re = re.compile(r'\s+')

# (compiled pattern, replacement) pairs. Each abbreviation is matched
# case-insensitively at a word boundary and must be followed by a period.
_abbreviations = [
    (re.compile('\\b%s\\.' % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in (
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    )
]
44
+
45
+
46
def expand_abbreviations(text):
    '''Replaces each abbreviation listed in _abbreviations with its expansion.'''
    for pattern, expansion in _abbreviations:
        text = pattern.sub(expansion, text)
    return text
50
+
51
+
52
def expand_numbers(text):
    '''Delegates to normalize_numbers and returns its result.'''
    normalized = normalize_numbers(text)
    return normalized
54
+
55
+
56
def lowercase(text):
    '''Returns text converted to lowercase.'''
    lowered = text.lower()
    return lowered
58
+
59
+
60
def collapse_whitespace(text):
    '''Replaces every run of whitespace in text with a single space.'''
    return _whitespace_re.sub(' ', text)
62
+
63
+
64
def convert_to_ascii(text):
    '''Returns the result of passing text through unidecode.'''
    ascii_text = unidecode(text)
    return ascii_text
66
+
67
+
68
def basic_cleaners(text):
    '''Lowercases text, then collapses whitespace; no transliteration is done.'''
    return collapse_whitespace(lowercase(text))
73
+
74
+
75
def transliteration_cleaners(text):
    '''Pipeline for non-English text: transliterate to ASCII, lowercase, collapse whitespace.'''
    for step in (convert_to_ascii, lowercase, collapse_whitespace):
        text = step(text)
    return text
81
+
82
+
83
def english_cleaners(text):
    '''Pipeline for English text, including number and abbreviation expansion.

    Steps run in this order: ASCII conversion, lowercasing, number expansion,
    abbreviation expansion, whitespace collapsing.
    '''
    pipeline = (
        convert_to_ascii,
        lowercase,
        expand_numbers,
        expand_abbreviations,
        collapse_whitespace,
    )
    for step in pipeline:
        text = step(text)
    return text
text/cleaners.pyc ADDED
Binary file (3.23 kB). View file
 
text/cmudict.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ import re
4
+
5
+
6
# Symbols accepted in a pronunciation. Many appear both bare and with a
# trailing 0, 1 or 2 digit (e.g. 'AA', 'AA0', 'AA1', 'AA2').
valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
]

# Set form of valid_symbols for membership tests in _get_pronunciation.
_valid_symbol_set = set(valid_symbols)
17
+
18
+
19
class CMUDict:
    '''Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict

    Entries are kept as a dict that maps each word to a list of pronunciation
    strings.
    '''

    def __init__(self, file_or_path, keep_ambiguous=True):
        '''Loads the dictionary.

        Args:
          file_or_path: a path string (opened with latin-1 encoding), or any other
            object, which is handed to _parse_cmudict directly
          keep_ambiguous: when False, only words with exactly one pronunciation
            are kept
        '''
        if not isinstance(file_or_path, str):
            parsed = _parse_cmudict(file_or_path)
        else:
            with open(file_or_path, encoding='latin-1') as handle:
                parsed = _parse_cmudict(handle)
        if not keep_ambiguous:
            parsed = {
                word: prons for word, prons in parsed.items() if len(prons) == 1
            }
        self._entries = parsed

    def __len__(self):
        '''Returns the number of words stored.'''
        return len(self._entries)

    def lookup(self, word):
        '''Returns the list of ARPAbet pronunciations stored for word, or None.

        The word is uppercased before the lookup.
        '''
        key = word.upper()
        return self._entries.get(key)
39
+
40
+
41
+
42
# Matches a parenthesised number such as "(1)", which is removed from the word:
_alt_re = re.compile(r'\([0-9]+\)')


def _parse_cmudict(file):
    '''Builds a dict mapping each word to a list of its pronunciations.

    Args:
      file: iterable of text lines, each holding a word, whitespace, and the
        word's pronunciation

    Returns:
      Dict from word (with any "(n)" suffix removed) to a list of pronunciation
      strings in the order they appear. Lines that do not start with "A"-"Z" or
      an apostrophe, lines without a pronunciation, and pronunciations rejected
      by _get_pronunciation are skipped.
    '''
    cmudict = {}
    for line in file:
        if not line or not ('A' <= line[0] <= 'Z' or line[0] == "'"):
            continue
        # Split once on the first run of whitespace: the word is on the left and
        # the complete pronunciation on the right, whether the separator is one
        # space, two spaces or a tab.
        parts = line.split(None, 1)
        if len(parts) < 2:
            continue  # Word with no pronunciation
        word = _alt_re.sub('', parts[0])
        pronunciation = _get_pronunciation(parts[1])
        if pronunciation:
            cmudict.setdefault(word, []).append(pronunciation)
    return cmudict
58
+
59
+
60
def _get_pronunciation(s):
    '''Validates a pronunciation string.

    Returns the space-joined symbols of s (surrounding whitespace stripped), or
    None if any space-separated token is not in _valid_symbol_set.
    '''
    tokens = s.strip().split(' ')
    if any(token not in _valid_symbol_set for token in tokens):
        return None
    return ' '.join(tokens)
text/numbers.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import inflect
5
+ import re
6
+
7
+
8
# Shared inflect engine; the expansion helpers call its number_to_words method.
_inflect = inflect.engine()
# A digit, then one or more digits or commas, then a digit (e.g. "1,000"):
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits, a period, then digits (e.g. "3.14"):
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# A pound sign followed by digits; commas may precede the final digits:
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# A dollar sign followed by digits; periods and commas may precede the final digits:
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Digits immediately followed by "st", "nd", "rd" or "th":
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any run of digits:
_number_re = re.compile(r'[0-9]+')
15
+
16
+
17
def _remove_commas(m):
    '''Returns the first captured group of match m with every comma removed.'''
    digits_with_commas = m.group(1)
    return digits_with_commas.replace(',', '')
19
+
20
+
21
def _expand_decimal_point(m):
    '''Returns the first captured group of match m with "." replaced by " point ".'''
    decimal_text = m.group(1)
    return decimal_text.replace('.', ' point ')
23
+
24
+
25
def _expand_dollars(m):
    '''Rewrites a matched dollar amount as dollars and cents with unit words.

    Args:
      m: regex match whose first group is the amount without the "$" sign

    Returns:
      A string such as "3 dollars, 50 cents", "1 dollar", "5 cents" or
      "zero dollars". The numbers stay as digits. An amount with more than one
      decimal point is returned as matched with " dollars" appended.
    '''
    match = m.group(1)
    # Drop commas before parsing: int() raises ValueError on strings like ",5".
    parts = match.replace(',', '').split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0

    phrases = []
    if dollars:
        phrases.append('%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars'))
    if cents:
        phrases.append('%s %s' % (cents, 'cent' if cents == 1 else 'cents'))
    if not phrases:
        return 'zero dollars'
    return ', '.join(phrases)
44
+
45
+
46
def _expand_ordinal(m):
    '''Passes the whole matched ordinal text to inflect's number_to_words.'''
    ordinal = m.group(0)
    return _inflect.number_to_words(ordinal)
48
+
49
+
50
def _expand_number(m):
    '''Converts a matched run of digits to words via inflect.

    Numbers strictly between 1000 and 3000 get special handling (presumably so
    they read like years - confirm): 2000 becomes "two thousand", 2001-2009 become
    "two thousand <n>", multiples of 100 become "<n> hundred", and the rest are
    rendered with inflect's group=2 and zero='oh' options. Every other number is
    rendered with andword=''.
    '''
    num = int(m.group(0))
    if not (1000 < num < 3000):
        return _inflect.number_to_words(num, andword='')
    if num == 2000:
        return 'two thousand'
    if 2000 < num < 2010:
        return 'two thousand ' + _inflect.number_to_words(num % 100)
    if num % 100 == 0:
        return _inflect.number_to_words(num // 100) + ' hundred'
    grouped = _inflect.number_to_words(num, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
63
+
64
+
65
def normalize_numbers(text):
    '''Applies the number rewrites to text in a fixed order and returns the result.'''
    # Order matters: the later patterns are more general (the last one matches any
    # run of digits), so the specific forms have to be rewritten first.
    rewrites = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in rewrites:
        text = pattern.sub(replacement, text)
    return text
text/symbols.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ '''
4
+ Defines the set of symbols used in text input to the model.
5
+
6
+ The default is a set of ASCII characters that works well for English or text that has been run through Unidecode. For other data, you can modify _characters. See TRAINING_DATA.md for details. '''
7
+ from text import cmudict
8
+
9
# '_' and '~' are reserved: _should_keep_symbol in text/__init__.py drops them
# from encoded text, and text_to_sequence appends '~' as the end-of-sequence token.
_pad = '_'
_eos = '~'
# Every single character that can be encoded; the last one is a space.
_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '

# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
_arpabet = ['@' + s for s in cmudict.valid_symbols]

# Export all symbols. text/__init__.py uses each symbol's index in this list as
# its numeric ID, so changing the order changes the IDs.
symbols = [_pad, _eos] + list(_characters) + _arpabet