Spaces:
Running
Running
| # -*- coding: utf-8 -*- | |
| import re | |
# Symbols accepted in a pronunciation string. The first group appears both
# bare and with a trailing digit 0, 1 or 2 (presumably the ARPAbet vowels
# with their stress marker - confirm against the CMUdict phone set); the
# second group appears bare only. Sorting yields plain string order, e.g.
# "AA", "AA0", "AA1", "AA2", "AE", ...
VALID_SYMBOLS = sorted(
    [
        base + digit
        for base in (
            "AA", "AE", "AH", "AO", "AW", "AY", "EH", "ER",
            "EY", "IH", "IY", "OW", "OY", "UH", "UW",
        )
        for digit in ("", "0", "1", "2")
    ]
    + [
        "B", "CH", "D", "DH", "F", "G", "HH", "JH", "K", "L", "M", "N",
        "NG", "P", "R", "S", "SH", "T", "TH", "V", "W", "Y", "Z", "ZH",
    ]
)
class CMUDict:
    """Thin wrapper around CMUDict data. http://www.speech.cs.cmu.edu/cgi-bin/cmudict

    Entries are stored as a dict mapping each word to the list of
    pronunciations that ``_parse_cmudict`` collected for it.
    """

    def __init__(self, file_or_path, keep_ambiguous=True):
        """Load the dictionary from a path or from an iterable of lines.

        Args:
            file_or_path: A ``str`` or ``os.PathLike`` path, which is opened
                as latin-1 text, or any other object (e.g. an already-open
                file), which is handed to the parser unchanged.
            keep_ambiguous: When false, keep only the words that have
                exactly one pronunciation.
        """
        # Function-local import so the path check does not depend on the
        # module's top-level import block.
        import os

        # Path-like objects (e.g. pathlib.Path) must be opened as well;
        # passing them straight to the parser would fail on iteration.
        if isinstance(file_or_path, (str, os.PathLike)):
            with open(file_or_path, encoding="latin-1") as f:
                entries = _parse_cmudict(f)
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
        self._entries = entries

    def __len__(self):
        """Return the number of distinct words held."""
        return len(self._entries)

    def lookup(self, word):
        """Returns list of ARPAbet pronunciations of the given word.

        The word is upper-cased before the lookup; ``None`` is returned
        when it has no entry.
        """
        return self._entries.get(word.upper())
def get_arpabet(word, cmudict, punctuation_symbols):
    """Return ``word`` rewritten as ``{pronunciation}`` when it is known.

    At most one leading and one trailing character found in
    ``punctuation_symbols`` are set aside before the lookup and put back
    around the result.

    Args:
        word: The token to convert.
        cmudict: Object whose ``lookup(word)`` returns a list of
            pronunciations, or ``None`` when the word is unknown.
        punctuation_symbols: Container of characters treated as punctuation.

    Returns:
        The first pronunciation wrapped in curly braces, or the stripped
        word itself when the lookup returns ``None``; either way with the
        set-aside punctuation re-attached.
    """
    prefix = suffix = ""
    if word and word[0] in punctuation_symbols:
        prefix, word = word[0], word[1:]
    # Re-check emptiness: the word may have been a single punctuation mark.
    if word and word[-1] in punctuation_symbols:
        suffix, word = word[-1], word[:-1]

    pronunciations = cmudict.lookup(word)
    if pronunciations is None:
        return prefix + word + suffix
    return prefix + "{%s}" % pronunciations[0] + suffix
| _alt_re = re.compile(r"\([0-9]+\)") | |
| def _parse_cmudict(file): | |
| cmudict = {} | |
| for line in file: | |
| if line and (line[0] >= "A" and line[0] <= "Z" or line[0] == "'"): | |
| parts = line.split(" ") | |
| word = re.sub(_alt_re, "", parts[0]) | |
| pronunciation = _get_pronunciation(parts[1]) | |
| if pronunciation: | |
| if word in cmudict: | |
| cmudict[word].append(pronunciation) | |
| else: | |
| cmudict[word] = [pronunciation] | |
| return cmudict | |
def _get_pronunciation(s):
    """Validate a pronunciation string against ``VALID_SYMBOLS``.

    The string is stripped and split on single spaces.

    Returns:
        The symbols re-joined with single spaces when every one of them is
        in ``VALID_SYMBOLS``; otherwise ``None``.
    """
    symbols = s.strip().split(" ")
    if all(symbol in VALID_SYMBOLS for symbol in symbols):
        return " ".join(symbols)
    return None