NikiPshg committed on
Commit 9ba7d3b · verified · 1 Parent(s): 673d6e3

Upload 27 files

.gitignore ADDED
@@ -0,0 +1,3 @@
+ venv
+ .idea
+ **/__pycache__/
G2P_lexicon/G2P.py ADDED
@@ -0,0 +1,87 @@
+ import torch
+ from G2P_lexicon.transformer import TransformerBlock
+ from tokenizers import Tokenizer
+ from G2P_lexicon.config_models import config_g2p
+ import os
+
+ dirname = os.path.dirname(__file__)
+
+
+ def decode_form_G(tokens: str):
+     """
+     Converts model output to a readable format.
+     Args:
+         tokens: 'NĠAH1ĠMĠBĠER0ĠZ'
+     Returns:
+         ['N', 'AH1', 'M', 'B', 'ER0', 'Z']
+     """
+     return ''.join(tokens).split('Ġ')
+
+
+ class GraphemeToPhoneme:
+     def __init__(self,
+                  model,
+                  tokenizer):
+
+         self.g2p_model = model
+         self.tokenizer = tokenizer
+
+         self.g2p_model.eval()
+
+     def greedy_decode_grapheme(self, model,
+                                src,
+                                src_mask,
+                                max_len,
+                                start_token):
+         src = src.unsqueeze(0)
+         src_mask = src_mask.unsqueeze(0)
+         input_decoder = model.encode(src, src_mask)
+         label = torch.zeros(1, 1).fill_(start_token).type_as(src.data)
+
+         for _ in range(max_len - 1):
+             tgt_mask = (torch.tril(torch.ones((label.size(1), label.size(1)))).type_as(src.data)).unsqueeze(0)
+             out = model.decode(input_decoder, src_mask, label, tgt_mask)
+             prob = model.fc_out(out[:, -1])
+             _, next_word = torch.max(prob, dim=1)
+             next_word = next_word.item()
+             label = torch.cat([label, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
+             if next_word == self.tokenizer.encode("<eos>").ids[0]:
+                 break
+
+         pred = decode_form_G(self.tokenizer.decode(label[0].tolist()))
+         return pred
+
+     def __call__(self, srs):
+         with torch.no_grad():
+             enc_input_tokens = self.tokenizer.encode(srs).ids
+             pad_id = self.tokenizer.encode("<pad>").ids[0]
+             enc_num_padding_tokens = 32 - len(enc_input_tokens) - 2
+             encoder_input = torch.cat([
+                 torch.tensor([self.tokenizer.encode("<bos>").ids[0]]),
+                 torch.tensor(enc_input_tokens),
+                 torch.tensor([self.tokenizer.encode("<eos>").ids[0]]),
+                 torch.tensor([pad_id] * enc_num_padding_tokens)
+             ], dim=0)
+
+             encoder_mask = (encoder_input != pad_id).unsqueeze(0).unsqueeze(0).int()
+             pred = self.greedy_decode_grapheme(
+                 model=self.g2p_model,
+                 src=encoder_input,
+                 src_mask=encoder_mask,
+                 max_len=32,
+                 start_token=self.tokenizer.encode("<bos>").ids[0]
+             )
+             return pred
+
+
+ dict_path = os.path.join(dirname, "my_tokenizer/bpe_512_lex.json")
+ model_path = os.path.join(dirname, "models/model0.07.pt")
+
+ tokenizer_g2p = Tokenizer.from_file(dict_path)
+ g2p_model = TransformerBlock(config=config_g2p, tokenizer=tokenizer_g2p)
+ g2p_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+
+ G2P = GraphemeToPhoneme(g2p_model, tokenizer_g2p)
+
+ if __name__ == '__main__':
+     print(G2P('NIKITA'))  # Expected output: ['N', 'IH', 'K', 'IY', 'T', 'AH']
G2P_lexicon/G2P_en_lexicon.py ADDED
@@ -0,0 +1,70 @@
+ from G2P_lexicon.G2P import G2P
+ from G2P_lexicon.SP import SP
+ from G2P_lexicon.data_preparation import preprocess_text
+ import string
+ import json
+ import time
+ import os
+
+
+ dirname = os.path.dirname(__file__)
+ json_path = os.path.join(dirname, "data/word2phoneme.json")
+
+ with open(json_path) as json_file:
+     phoneme2grapheme_dict = json.load(json_file)
+
+
+ class g2p_en_lexicon:
+     def __init__(self):
+         self.G2P = G2P
+         self.SP = SP
+
+     def cleaan_stress(self, seq: list):
+         return [phoneme[:-1] if phoneme[-1].isdigit() else phoneme for phoneme in seq]
+
+     def pred_with_stress(self, seq):
+         return self.SP(self.G2P(seq))
+
+     def check_punctuation(self, word):
+         return any(char in string.punctuation for char in word)
+
+     def __call__(self, seq, with_stress=True):
+         seq_list = preprocess_text(seq)
+         result = []
+         count_from_dict = 0
+         count_from_model = 0
+         for word in seq_list:
+             phonemes_from_dict = phoneme2grapheme_dict.get(word)
+             if phonemes_from_dict is None:
+                 if self.check_punctuation(word):
+                     result.extend([word] + [' '])
+                 else:
+                     count_from_model += 1
+                     if with_stress:
+                         pred_stress = self.pred_with_stress(word)
+                         # print(f"{word} -- {pred_stress}")
+                         result.extend(pred_stress + [' '])
+                     else:
+                         pred_without = self.G2P(word)
+                         # print(f"{word} -- {pred_without}")
+                         result.extend(pred_without + [' '])
+             else:
+                 count_from_dict += 1
+                 result.extend(phonemes_from_dict + [' '])
+
+         # print(f"{count_from_dict} -- from json\n"
+         #       f"{count_from_model} -- from model")
+         result = result[:-1] if result and result[-1] == ' ' else result
+
+         if not with_stress:
+             return self.cleaan_stress(result)
+         return result
+
+
+ if __name__ == '__main__':
+     G2P_en_lexicon = g2p_en_lexicon()
+     text = """mtusi is the worst option for a programmer or a student"""
+     start_time = time.time()
+     print(G2P_en_lexicon(text))
+     end_time = time.time()
+     print(f"{(end_time - start_time) * 1000} ms -- time taken")
G2P_lexicon/SP.py ADDED
@@ -0,0 +1,88 @@
+ from G2P_lexicon.config_models import config_sp
+ from G2P_lexicon.transformer import TransformerBlock
+ from G2P_lexicon.sp_tokenizer import Tokenizer_sp
+ import torch
+ import os
+
+ dirname = os.path.dirname(__file__)
+
+
+ class Stress_Pred:
+     def __init__(self,
+                  model,
+                  tokenizer):
+
+         self.SP = model
+         self.tokenizer = tokenizer
+
+         self.SP.eval()
+
+     def __call__(self, srs):
+         with torch.no_grad():
+             enc_input_tokens = self.tokenizer.encode(srs)
+             pad_id = torch.tensor(self.tokenizer.pad_idx)
+             enc_num_padding_tokens = 32 - len(enc_input_tokens)
+             encoder_input = torch.cat(
+                 [
+                     torch.tensor(enc_input_tokens),
+                     pad_id.repeat(enc_num_padding_tokens)
+                 ],
+                 dim=0)
+
+             encoder_mask = (encoder_input != pad_id).unsqueeze(0).unsqueeze(0).int()
+             label = self.greedy_decode_stress(
+                 src=encoder_input,
+                 src_mask=encoder_mask,
+                 start_token=self.tokenizer.sos_idx,
+             )
+             return label
+
+     def greedy_decode_stress(self,
+                              src,
+                              src_mask,
+                              start_token):
+         len_src = (src != 3).int().sum().item()
+         # Positions whose tokens are not bare vowel symbols (consonants, <sos>, <eos>)
+         # are copied through unchanged; a stressed variant is only predicted for vowels.
+         index_vowels = torch.tensor([idx for (idx, i) in enumerate(src) if not (i in list_tokens_without_stress)])[
+                        :len_src]
+         src = src.unsqueeze(0)
+         src_mask = src_mask.unsqueeze(0)
+         input_decoder = self.SP.encode(src, src_mask)
+         label = torch.tensor([]).type_as(src.data)
+
+         for idx in range(len_src):
+             if idx in index_vowels:
+                 label = torch.cat([label, torch.ones(1, 1).type_as(src.data).fill_(src[0][idx])], dim=1)
+             else:
+                 tgt_mask = (torch.tril(torch.ones((label.size(1), label.size(1)))).type_as(src.data)).unsqueeze(0)
+                 out = self.SP.decode(input_decoder, src_mask, label, tgt_mask)
+                 prob = self.SP.fc_out(out[:, -1])
+
+                 _, next_word = torch.max(prob, dim=1)
+                 next_word = next_word.data[0]
+                 label = torch.cat([label, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
+
+         pred = self.tokenizer.decode(label[0].tolist())[1:-1]
+         return pred
+
+
+ dict_path = os.path.join(dirname, "my_tokenizer/my_dict_256.json")
+ model_path = os.path.join(dirname, "models/model_0.159.pt")
+
+ tokenizer_sp = Tokenizer_sp(dict_path=dict_path)
+
+ set_tokens_without_stress = set()
+ for token, phoneme in tokenizer_sp.idx2token.items():
+     if phoneme[-1].isdigit():
+         set_tokens_without_stress.add(tokenizer_sp.token2idx[phoneme[:-1]])
+ list_tokens_without_stress = list(set_tokens_without_stress)
+
+ sp_model = TransformerBlock(config=config_sp,
+                             tokenizer=tokenizer_sp)
+ sp_model.load_state_dict(
+     torch.load(model_path, map_location=torch.device('cpu')))
+
+ SP = Stress_Pred(model=sp_model,
+                  tokenizer=tokenizer_sp)
+
+ if __name__ == '__main__':
+     print(SP(['N', 'IH', 'K', 'IY', 'T', 'AH']))  # ['N', 'IH2', 'K', 'IY1', 'T', 'AH0']
G2P_lexicon/__init__.py ADDED
@@ -0,0 +1 @@
+ from G2P_lexicon.G2P_en_lexicon import g2p_en_lexicon
G2P_lexicon/__pycache__/G2P.cpython-311.pyc ADDED
Binary file (5.97 kB). View file
 
G2P_lexicon/__pycache__/G2P_en_lexicon.cpython-311.pyc ADDED
Binary file (4.38 kB). View file
 
G2P_lexicon/__pycache__/SP.cpython-311.pyc ADDED
Binary file (6.28 kB). View file
 
G2P_lexicon/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (233 Bytes). View file
 
G2P_lexicon/__pycache__/config_models.cpython-311.pyc ADDED
Binary file (343 Bytes). View file
 
G2P_lexicon/__pycache__/data_preparation.cpython-311.pyc ADDED
Binary file (3.31 kB). View file
 
G2P_lexicon/__pycache__/sp_tokenizer.cpython-311.pyc ADDED
Binary file (6.76 kB). View file
 
G2P_lexicon/__pycache__/transformer.cpython-311.pyc ADDED
Binary file (13.7 kB). View file
 
G2P_lexicon/config_models.py ADDED
@@ -0,0 +1,15 @@
+ config_sp = {
+     "D_MODEL": 256,
+     "D_FF": 1024,
+     "NUM": 3,
+     "NUM_HEADS": 4,
+     "MAX_LEN": 32,
+ }
+
+ config_g2p = {
+     "D_MODEL": 512,
+     "D_FF": 2048,
+     "NUM": 6,
+     "NUM_HEADS": 8,
+     "MAX_LEN": 32,
+ }
G2P_lexicon/data/word2phoneme.json ADDED
The diff for this file is too large to render. See raw diff
 
G2P_lexicon/data_preparation.py ADDED
@@ -0,0 +1,75 @@
+ import re
+
+
+ def intToWord(number):
+     ones = ("", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
+     tens = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")
+     teens = (
+         "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen")
+     levels = (
+         "", "thousand", "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion", "septillion",
+         "octillion", "nonillion")
+
+     word = ""
+     num = reversed(str(number))
+     number = ""
+     for x in num:
+         number += x
+     del num
+     if len(number) % 3 == 1: number += "0"
+     x = 0
+     for digit in number:
+         if x % 3 == 0:
+             word = levels[x // 3] + " " + word
+             n = int(digit)
+         elif x % 3 == 1:
+             if digit == "1":
+                 num = teens[n]
+             else:
+                 num = tens[int(digit)]
+                 if n:
+                     if num:
+                         num += ones[n]
+                     else:
+                         num = ones[n]
+             word = num + " " + word
+         elif x % 3 == 2:
+             if digit != "0":
+                 word = ones[int(digit)] + " hundred " + word
+         x += 1
+     return word.strip(" ")
+
+
+ def preprocess_text(text):
+     """
+     Normalizes the text, separating periods and commas into their own tokens.
+     srs:
+         Hello, World! This is a sample text with numbers 12345 and symbols #$%.
+     return:
+         ['HELLO', ',', 'WORLD', 'THIS', 'IS', 'A', 'SAMPLE', 'TEXT', 'WITH', 'NUMBERS', 'AND', 'SYMBOLS', '.']
+     """
+     if text and not text.isspace():
+
+         text = text.upper()
+         text = re.sub(r'([.,])', r' \1 ', text)
+
+         text = re.sub(r'[^A-Z .,0-9]', '', text)
+         text = re.sub(r'\s+', ' ', text).strip()
+
+         text = text.split()
+         result = []
+         for word in text:
+             if word.isdigit():
+                 result = result + (intToWord(word).upper()).split()
+             else:
+                 result.append(word)
+     else:
+         result = ['please enter some text :(']
+
+     return result
+
+
+ if __name__ == "__main__":
+     sample_text = "Hello, World! This is a sample text with numbers 12345 and symbols #$%."
+     processed_text = preprocess_text(sample_text)
+     print("Processed text:", processed_text)
G2P_lexicon/models/.gitignore ADDED
@@ -0,0 +1,2 @@
+ model_0.159.pt
+ model0.07.pt
G2P_lexicon/models/model0.07.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7bb918136264dac82f564a0fe07964607e9494701a8018eaa5c2be16c6bd89a
+ size 179866303
G2P_lexicon/models/model_0.159.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce2f8269e96abaf00086f4c61043046656deb8cf397ce7f1501d2f354dd6bea7
+ size 22471914
G2P_lexicon/my_tokenizer/bpe_512_lex.json ADDED
@@ -0,0 +1,1042 @@
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 512,
17
+ "content": "<pad>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 513,
26
+ "content": "<bos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 514,
35
+ "content": "<eos>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "BertNormalizer",
45
+ "clean_text": true,
46
+ "handle_chinese_chars": true,
47
+ "strip_accents": null,
48
+ "lowercase": false
49
+ },
50
+ "pre_tokenizer": {
51
+ "type": "BertPreTokenizer"
52
+ },
53
+ "post_processor": null,
54
+ "decoder": {
55
+ "type": "BPEDecoder",
56
+ "suffix": "</w>"
57
+ },
58
+ "model": {
59
+ "type": "BPE",
60
+ "dropout": null,
61
+ "unk_token": "<unk>",
62
+ "continuing_subword_prefix": null,
63
+ "end_of_word_suffix": "</w>",
64
+ "fuse_unk": false,
65
+ "byte_fallback": false,
66
+ "ignore_merges": false,
67
+ "vocab": {
68
+ "<unk>": 0,
69
+ "A": 1,
70
+ "B": 2,
71
+ "C": 3,
72
+ "D": 4,
73
+ "E": 5,
74
+ "F": 6,
75
+ "G": 7,
76
+ "H": 8,
77
+ "I": 9,
78
+ "J": 10,
79
+ "K": 11,
80
+ "L": 12,
81
+ "M": 13,
82
+ "N": 14,
83
+ "O": 15,
84
+ "P": 16,
85
+ "Q": 17,
86
+ "R": 18,
87
+ "S": 19,
88
+ "T": 20,
89
+ "U": 21,
90
+ "V": 22,
91
+ "W": 23,
92
+ "X": 24,
93
+ "Y": 25,
94
+ "Z": 26,
95
+ "Ġ": 27,
96
+ "G</w>": 28,
97
+ "E</w>": 29,
98
+ "A</w>": 30,
99
+ "D</w>": 31,
100
+ "S</w>": 32,
101
+ "L</w>": 33,
102
+ "Z</w>": 34,
103
+ "N</w>": 35,
104
+ "Y</w>": 36,
105
+ "T</w>": 37,
106
+ "H</w>": 38,
107
+ "R</w>": 39,
108
+ "F</w>": 40,
109
+ "W</w>": 41,
110
+ "P</w>": 42,
111
+ "V</w>": 43,
112
+ "M</w>": 44,
113
+ "K</w>": 45,
114
+ "U</w>": 46,
115
+ "B</w>": 47,
116
+ "I</w>": 48,
117
+ "C</w>": 49,
118
+ "O</w>": 50,
119
+ "X</w>": 51,
120
+ "J</w>": 52,
121
+ "Q</w>": 53,
122
+ "HĠ": 54,
123
+ "ĠA": 55,
124
+ "ĠI": 56,
125
+ "ĠAHĠ": 57,
126
+ "ĠE": 58,
127
+ "ĠIHĠ": 59,
128
+ "YĠ": 60,
129
+ "RĠ": 61,
130
+ "NĠ": 62,
131
+ "AĠ": 63,
132
+ "WĠ": 64,
133
+ "EĠ": 65,
134
+ "ĠAAĠ": 66,
135
+ "ĠEHĠ": 67,
136
+ "SĠ": 68,
137
+ "ĠAEĠ": 69,
138
+ "IN": 70,
139
+ "ĠR": 71,
140
+ "ER": 72,
141
+ "ĠIYĠ": 73,
142
+ "LĠ": 74,
143
+ "KĠ": 75,
144
+ "ĠERĠ": 76,
145
+ "HĠA": 77,
146
+ "ĠIY</w>": 78,
147
+ "OWĠ": 79,
148
+ "AN": 80,
149
+ "AR": 81,
150
+ "ĠEYĠ": 82,
151
+ "OĠ": 83,
152
+ "TĠ": 84,
153
+ "EN": 85,
154
+ "ON": 86,
155
+ "GĠ": 87,
156
+ "ĠAHĠNĠ": 88,
157
+ "ES</w>": 89,
158
+ "ĠAOĠ": 90,
159
+ "AT": 91,
160
+ "UWĠ": 92,
161
+ "ĠIHĠN": 93,
162
+ "OR": 94,
163
+ "MĠ": 95,
164
+ "IHĠ": 96,
165
+ "AL": 97,
166
+ "ĠAYĠ": 98,
167
+ "EL": 99,
168
+ "ĠAH</w>": 100,
169
+ "ST": 101,
170
+ "DĠ": 102,
171
+ "ED</w>": 103,
172
+ "CH": 104,
173
+ "ĠER</w>": 105,
174
+ "HĠAHĠ": 106,
175
+ "ĠAHĠN</w>": 107,
176
+ "RE": 108,
177
+ "SĠT": 109,
178
+ "ĠIY": 110,
179
+ "ER</w>": 111,
180
+ "IL": 112,
181
+ "EHĠ": 113,
182
+ "IS": 114,
183
+ "PĠ": 115,
184
+ "IT": 116,
185
+ "BĠ": 117,
186
+ "OU": 118,
187
+ "ES": 119,
188
+ "ING</w>": 120,
189
+ "ĠIHĠNG</w>": 121,
190
+ "LĠAHĠ": 122,
191
+ "HHĠ": 123,
192
+ "AHĠ": 124,
193
+ "ET": 125,
194
+ "OW</w>": 126,
195
+ "OL": 127,
196
+ "NĠAHĠ": 128,
197
+ "RA": 129,
198
+ "IC": 130,
199
+ "JHĠ": 131,
200
+ "TĠIHĠ": 132,
201
+ "ĠER": 133,
202
+ "LĠIY</w>": 134,
203
+ "LĠIHĠ": 135,
204
+ "UN": 136,
205
+ "ĠAAĠRĠ": 137,
206
+ "RO": 138,
207
+ "KĠAHĠ": 139,
208
+ "SĠIHĠ": 140,
209
+ "TĠS</w>": 141,
210
+ "ĠERĠZ</w>": 142,
211
+ "ON</w>": 143,
212
+ "HHĠA": 144,
213
+ "LA": 145,
214
+ "ĠAOĠRĠ": 146,
215
+ "DĠIHĠ": 147,
216
+ "SHĠ": 148,
217
+ "ĠIYĠZ</w>": 149,
218
+ "SĠAHĠ": 150,
219
+ "MA": 151,
220
+ "RI": 152,
221
+ "CHĠ": 153,
222
+ "ERS</w>": 154,
223
+ "LE": 155,
224
+ "NĠZ</w>": 156,
225
+ "TĠAHĠ": 157,
226
+ "SHĠAHĠ": 158,
227
+ "UR": 159,
228
+ "FĠ": 160,
229
+ "UHĠ": 161,
230
+ "AM": 162,
231
+ "ĠAEĠNĠ": 163,
232
+ "SH": 164,
233
+ "AEĠ": 165,
234
+ "ĠEHĠR": 166,
235
+ "ING": 167,
236
+ "ĠAAĠNĠ": 168,
237
+ "LI": 169,
238
+ "ĠAHĠNĠZ</w>": 170,
239
+ "ĠRĠAHĠ": 171,
240
+ "LĠZ</w>": 172,
241
+ "TH": 173,
242
+ "TI": 174,
243
+ "DĠAHĠ": 175,
244
+ "ĠEHĠNĠ": 176,
245
+ "BĠAHĠ": 177,
246
+ "CO": 178,
247
+ "MĠAHĠ": 179,
248
+ "LY</w>": 180,
249
+ "ĠIHĠNGĠ": 181,
250
+ "DE": 182,
251
+ "ĠRĠIHĠ": 183,
252
+ "LO": 184,
253
+ "MĠIHĠ": 185,
254
+ "NG��": 186,
255
+ "UL": 187,
256
+ "NĠIHĠ": 188,
257
+ "ONS</w>": 189,
258
+ "AB": 190,
259
+ "THĠ": 191,
260
+ "WĠIHĠ": 192,
261
+ "AD": 193,
262
+ "IN</w>": 194,
263
+ "QU": 195,
264
+ "AAĠ": 196,
265
+ "AS": 197,
266
+ "KĠAAĠ": 198,
267
+ "ZĠ": 199,
268
+ "DĠZ</w>": 200,
269
+ "IR": 201,
270
+ "PĠAHĠ": 202,
271
+ "SĠT</w>": 203,
272
+ "AC": 204,
273
+ "RU": 205,
274
+ "ATI": 206,
275
+ "EM": 207,
276
+ "SĠK": 208,
277
+ "ĠAWĠ": 209,
278
+ "KĠS</w>": 210,
279
+ "ED": 211,
280
+ "LE</w>": 212,
281
+ "LĠAEĠ": 213,
282
+ "TS</w>": 214,
283
+ "YS</w>": 215,
284
+ "IYĠ": 216,
285
+ "VĠIHĠ": 217,
286
+ "EN</w>": 218,
287
+ "OW": 219,
288
+ "PĠR": 220,
289
+ "VĠ": 221,
290
+ "YĠUWĠ": 222,
291
+ "ERĠ": 223,
292
+ "OO": 224,
293
+ "AG": 225,
294
+ "US</w>": 226,
295
+ "KĠAEĠ": 227,
296
+ "RĠAHĠ": 228,
297
+ "CON": 229,
298
+ "ID": 230,
299
+ "BĠERĠ": 231,
300
+ "MĠAEĠ": 232,
301
+ "ĠAO": 233,
302
+ "IG": 234,
303
+ "LĠIYĠ": 235,
304
+ "YĠAHĠ": 236,
305
+ "SĠEHĠ": 237,
306
+ "OWĠZ</w>": 238,
307
+ "BĠIHĠ": 239,
308
+ "ĠIYĠAHĠ": 240,
309
+ "IM": 241,
310
+ "GĠR": 242,
311
+ "OM": 243,
312
+ "SE": 244,
313
+ "AND": 245,
314
+ "AS</w>": 246,
315
+ "LĠEHĠ": 247,
316
+ "AU": 248,
317
+ "IHĠNĠ": 249,
318
+ "CK": 250,
319
+ "TĠR": 251,
320
+ "PP": 252,
321
+ "ESS</w>": 253,
322
+ "LĠEYĠ": 254,
323
+ "VER": 255,
324
+ "GĠAHĠ": 256,
325
+ "TĠIY</w>": 257,
326
+ "AHĠNĠ": 258,
327
+ "FĠAHĠ": 259,
328
+ "LĠAYĠ": 260,
329
+ "LĠAAĠ": 261,
330
+ "TH</w>": 262,
331
+ "RĠIYĠ": 263,
332
+ "SU": 264,
333
+ "AN</w>": 265,
334
+ "RĠIHĠ": 266,
335
+ "SC": 267,
336
+ "RĠOWĠ": 268,
337
+ "KĠW": 269,
338
+ "BO": 270,
339
+ "FĠIHĠ": 271,
340
+ "SĠP": 272,
341
+ "PH": 273,
342
+ "SP": 274,
343
+ "AV": 275,
344
+ "FF": 276,
345
+ "BĠR": 277,
346
+ "ĠEYĠSHĠAHĠ": 278,
347
+ "OWĠLĠ": 279,
348
+ "AL</w>": 280,
349
+ "ĠAHĠNĠT</w>": 281,
350
+ "KĠR": 282,
351
+ "MĠEHĠ": 283,
352
+ "HHĠAAĠ": 284,
353
+ "DI": 285,
354
+ "OYĠ": 286,
355
+ "SH</w>": 287,
356
+ "MĠAAĠ": 288,
357
+ "UM": 289,
358
+ "RĠUWĠ": 290,
359
+ "ELL": 291,
360
+ "BER": 292,
361
+ "EYĠ": 293,
362
+ "NE": 294,
363
+ "SS": 295,
364
+ "TĠAHĠN</w>": 296,
365
+ "ENT": 297,
366
+ "TĠIHĠD</w>": 298,
367
+ "VĠERĠ": 299,
368
+ "OS</w>": 300,
369
+ "NĠAHĠS</w>": 301,
370
+ "DS</w>": 302,
371
+ "FOR": 303,
372
+ "MAR": 304,
373
+ "SĠIHĠZ</w>": 305,
374
+ "HE": 306,
375
+ "PER": 307,
376
+ "UW</w>": 308,
377
+ "KĠIHĠ": 309,
378
+ "MAN": 310,
379
+ "UT": 311,
380
+ "CH</w>": 312,
381
+ "IS</w>": 313,
382
+ "VĠAHĠ": 314,
383
+ "LĠD</w>": 315,
384
+ "MĠZ</w>": 316,
385
+ "HHĠEHĠ": 317,
386
+ "KĠAHĠNĠ": 318,
387
+ "TER": 319,
388
+ "CAR": 320,
389
+ "NĠIY</w>": 321,
390
+ "PĠIHĠ": 322,
391
+ "RAN": 323,
392
+ "HĠAEĠ": 324,
393
+ "OT": 325,
394
+ "TĠEHĠ": 326,
395
+ "ZĠAHĠ": 327,
396
+ "ĠAOĠR": 328,
397
+ "WĠAAĠ": 329,
398
+ "HHĠAEĠ": 330,
399
+ "BĠAAĠ": 331,
400
+ "DĠEHĠ": 332,
401
+ "MĠAHĠN</w>": 333,
402
+ "WH": 334,
403
+ "OĠRĠ": 335,
404
+ "INGS</w>": 336,
405
+ "PĠAEĠ": 337,
406
+ "BĠEHĠ": 338,
407
+ "SĠIYĠ": 339,
408
+ "LĠUWĠ": 340,
409
+ "JHĠAHĠ": 341,
410
+ "NĠAAĠ": 342,
411
+ "PĠEHĠ": 343,
412
+ "ĠAY": 344,
413
+ "BĠAEĠ": 345,
414
+ "IES</w>": 346,
415
+ "PĠERĠ": 347,
416
+ "AP": 348,
417
+ "EX": 349,
418
+ "TĠAEĠ": 350,
419
+ "ZĠIHĠ": 351,
420
+ "EST</w>": 352,
421
+ "ĠEHĠRĠAHĠ": 353,
422
+ "ĠIHĠNGĠZ</w>": 354,
423
+ "DĠAAĠ": 355,
424
+ "IA</w>": 356,
425
+ "WA": 357,
426
+ "JHĠIHĠ": 358,
427
+ "FĠR": 359,
428
+ "IZ": 360,
429
+ "ĠIYĠAH</w>": 361,
430
+ "ILL": 362,
431
+ "IV": 363,
432
+ "NĠAH</w>": 364,
433
+ "OD": 365,
434
+ "SK": 366,
435
+ "TĠERĠ": 367,
436
+ "ANT": 368,
437
+ "DĠR": 369,
438
+ "EST": 370,
439
+ "OG": 371,
440
+ "UW": 372,
441
+ "ĠEY</w>": 373,
442
+ "ANS</w>": 374,
443
+ "ENT</w>": 375,
444
+ "MĠPĠ": 376,
445
+ "AAĠRĠ": 377,
446
+ "EC": 378,
447
+ "MĠEYĠ": 379,
448
+ "ETT": 380,
449
+ "SHĠIHĠ": 381,
450
+ "GH": 382,
451
+ "PĠAAĠ": 383,
452
+ "TON</w>": 384,
453
+ "SĠTĠ": 385,
454
+ "DIS": 386,
455
+ "MP": 387,
456
+ "SĠAYĠ": 388,
457
+ "HĠAAĠ": 389,
458
+ "BE": 390,
459
+ "GU": 391,
460
+ "PAR": 392,
461
+ "RĠEHĠ": 393,
462
+ "SĠTĠR": 394,
463
+ "CHĠIHĠ": 395,
464
+ "BL": 396,
465
+ "HAR": 397,
466
+ "NĠEHĠ": 398,
467
+ "PRO": 399,
468
+ "FĠEHĠ": 400,
469
+ "LĠAHĠNĠ": 401,
470
+ "MĠAAĠRĠ": 402,
471
+ "RĠAAĠ": 403,
472
+ "TĠEYĠ": 404,
473
+ "BUR": 405,
474
+ "DĠAEĠ": 406,
475
+ "JH</w>": 407,
476
+ "INS</w>": 408,
477
+ "ATION</w>": 409,
478
+ "AK": 410,
479
+ "DĠERĠ": 411,
480
+ "MON": 412,
481
+ "PO": 413,
482
+ "PRE": 414,
483
+ "RĠAEĠ": 415,
484
+ "KĠOWĠ": 416,
485
+ "JHĠEHĠ": 417,
486
+ "OR</w>": 418,
487
+ "SI": 419,
488
+ "TĠAAĠ": 420,
489
+ "WĠERĠ": 421,
490
+ "FĠERĠ": 422,
491
+ "NI": 423,
492
+ "WĠEHĠ": 424,
493
+ "END": 425,
494
+ "CHĠAHĠ": 426,
495
+ "FĠAOĠRĠ": 427,
496
+ "MĠIYĠ": 428,
497
+ "SĠAAĠ": 429,
498
+ "TĠER</w>": 430,
499
+ "BAR": 431,
500
+ "EG": 432,
501
+ "EV": 433,
502
+ "HHĠAHĠ": 434,
503
+ "LAN": 435,
504
+ "TĠIHĠNG</w>": 436,
505
+ "LĠOWĠ": 437,
506
+ "SĠTĠAHĠ": 438,
507
+ "CI": 439,
508
+ "COR": 440,
509
+ "DĠER</w>": 441,
510
+ "GĠAAĠ": 442,
511
+ "LĠAOĠ": 443,
512
+ "MO": 444,
513
+ "TĠERĠZ</w>": 445,
514
+ "UD": 446,
515
+ "SĠTĠIHĠ": 447,
516
+ "OUR": 448,
517
+ "BĠAHĠL</w>": 449,
518
+ "DHĠ": 450,
519
+ "HA": 451,
520
+ "MĠAHĠNĠ": 452,
521
+ "ĠEYĠSHĠAHĠN</w>": 453,
522
+ "DĠEYĠ": 454,
523
+ "FI": 455,
524
+ "KĠAAĠRĠ": 456,
525
+ "LĠER</w>": 457,
526
+ "SĠIY</w>": 458,
527
+ "TĠIYĠ": 459,
528
+ "OUS": 460,
529
+ "ESS": 461,
530
+ "AST": 462,
531
+ "BR": 463,
532
+ "DER": 464,
533
+ "EL</w>": 465,
534
+ "KĠIY</w>": 466,
535
+ "KĠAAĠNĠ": 467,
536
+ "TĠAYĠ": 468,
537
+ "WĠEYĠ": 469,
538
+ "ENS</w>": 470,
539
+ "ATH": 471,
540
+ "ITY</w>": 472,
541
+ "EP": 473,
542
+ "IST": 474,
543
+ "KS</w>": 475,
544
+ "LĠIHĠNG</w>": 476,
545
+ "NĠAYĠ": 477,
546
+ "WĠUHĠ": 478,
547
+ "KĠAHĠL</w>": 479,
548
+ "HHĠAAĠRĠ": 480,
549
+ "IY</w>": 481,
550
+ "KĠAH</w>": 482,
551
+ "LĠAH</w>": 483,
552
+ "LĠIYĠZ</w>": 484,
553
+ "NG</w>": 485,
554
+ "NĠAEĠ": 486,
555
+ "SĠAHĠN</w>": 487,
556
+ "INE</w>": 488,
557
+ "ĠERĠAHĠ": 489,
558
+ "GĠIHĠ": 490,
559
+ "KĠAOĠRĠ": 491,
560
+ "SĠAEĠ": 492,
561
+ "ĠIYĠAHĠN</w>": 493,
562
+ "ISH": 494,
563
+ "GĠEYĠ": 495,
564
+ "KĠERĠ": 496,
565
+ "MĠAAĠNĠ": 497,
566
+ "TA": 498,
567
+ "WĠAOĠ": 499,
568
+ "ĠAYĠAHĠ": 500,
569
+ "MĠP": 501,
570
+ "PĠS</w>": 502,
571
+ "MAN</w>": 503,
572
+ "BU": 504,
573
+ "EY</w>": 505,
574
+ "KĠEYĠ": 506,
575
+ "LĠAEĠNĠ": 507,
576
+ "MOR": 508,
577
+ "TĠAH</w>": 509,
578
+ "TĠRĠAHĠ": 510,
579
+ "RĠZ</w>": 511
580
+ },
581
+ "merges": [
582
+ "H Ġ",
583
+ "Ġ A",
584
+ "Ġ I",
585
+ "ĠA HĠ",
586
+ "Ġ E",
587
+ "ĠI HĠ",
588
+ "Y Ġ",
589
+ "R Ġ",
590
+ "N Ġ",
591
+ "A Ġ",
592
+ "W Ġ",
593
+ "E Ġ",
594
+ "ĠA AĠ",
595
+ "ĠE HĠ",
596
+ "S Ġ",
597
+ "ĠA EĠ",
598
+ "I N",
599
+ "Ġ R",
600
+ "E R",
601
+ "ĠI YĠ",
602
+ "L Ġ",
603
+ "K Ġ",
604
+ "ĠE RĠ",
605
+ "HĠ A",
606
+ "ĠI Y</w>",
607
+ "O WĠ",
608
+ "A N",
609
+ "A R",
610
+ "ĠE YĠ",
611
+ "O Ġ",
612
+ "T Ġ",
613
+ "E N",
614
+ "O N",
615
+ "G Ġ",
616
+ "ĠAHĠ NĠ",
617
+ "E S</w>",
618
+ "ĠA OĠ",
619
+ "A T",
620
+ "U WĠ",
621
+ "ĠIHĠ N",
622
+ "O R",
623
+ "M Ġ",
624
+ "I HĠ",
625
+ "A L",
626
+ "ĠA YĠ",
627
+ "E L",
628
+ "ĠA H</w>",
629
+ "S T",
630
+ "D Ġ",
631
+ "E D</w>",
632
+ "C H",
633
+ "ĠE R</w>",
634
+ "HĠA HĠ",
635
+ "ĠAHĠ N</w>",
636
+ "R E",
637
+ "SĠ T",
638
+ "ĠI Y",
639
+ "E R</w>",
640
+ "I L",
641
+ "E HĠ",
642
+ "I S",
643
+ "P Ġ",
644
+ "I T",
645
+ "B Ġ",
646
+ "O U",
647
+ "E S",
648
+ "IN G</w>",
649
+ "ĠIHĠN G</w>",
650
+ "L ĠAHĠ",
651
+ "H HĠ",
652
+ "A HĠ",
653
+ "E T",
654
+ "O W</w>",
655
+ "O L",
656
+ "N ĠAHĠ",
657
+ "R A",
658
+ "I C",
659
+ "J HĠ",
660
+ "T ĠIHĠ",
661
+ "ĠE R",
662
+ "L ĠIY</w>",
663
+ "L ĠIHĠ",
664
+ "U N",
665
+ "ĠAAĠ RĠ",
666
+ "R O",
667
+ "K ĠAHĠ",
668
+ "S ĠIHĠ",
669
+ "TĠ S</w>",
670
+ "ĠERĠ Z</w>",
671
+ "O N</w>",
672
+ "H HĠA",
673
+ "L A",
674
+ "ĠAOĠ RĠ",
675
+ "D ĠIHĠ",
676
+ "S HĠ",
677
+ "ĠIYĠ Z</w>",
678
+ "S ĠAHĠ",
679
+ "M A",
680
+ "R I",
681
+ "C HĠ",
682
+ "ER S</w>",
683
+ "L E",
684
+ "NĠ Z</w>",
685
+ "T ĠAHĠ",
686
+ "S HĠAHĠ",
687
+ "U R",
688
+ "F Ġ",
689
+ "U HĠ",
690
+ "A M",
691
+ "ĠAEĠ NĠ",
692
+ "S H",
693
+ "A EĠ",
694
+ "ĠEHĠ R",
695
+ "IN G",
696
+ "ĠAAĠ NĠ",
697
+ "L I",
698
+ "ĠAHĠNĠ Z</w>",
699
+ "ĠR ĠAHĠ",
700
+ "LĠ Z</w>",
701
+ "T H",
702
+ "T I",
703
+ "D ĠAHĠ",
704
+ "ĠEHĠ NĠ",
705
+ "B ĠAHĠ",
706
+ "C O",
707
+ "M ĠAHĠ",
708
+ "L Y</w>",
709
+ "ĠIHĠN GĠ",
710
+ "D E",
711
+ "ĠR ĠIHĠ",
712
+ "L O",
713
+ "M ĠIHĠ",
714
+ "N GĠ",
715
+ "U L",
716
+ "N ĠIHĠ",
717
+ "ON S</w>",
718
+ "A B",
719
+ "T HĠ",
720
+ "W ĠIHĠ",
721
+ "A D",
722
+ "I N</w>",
723
+ "Q U",
724
+ "A AĠ",
725
+ "A S",
726
+ "K ĠAAĠ",
727
+ "Z Ġ",
728
+ "DĠ Z</w>",
729
+ "I R",
730
+ "P ĠAHĠ",
731
+ "SĠ T</w>",
732
+ "A C",
733
+ "R U",
734
+ "AT I",
735
+ "E M",
736
+ "SĠ K",
737
+ "ĠA WĠ",
738
+ "KĠ S</w>",
739
+ "E D",
740
+ "L E</w>",
741
+ "L ĠAEĠ",
742
+ "T S</w>",
743
+ "Y S</w>",
744
+ "I YĠ",
745
+ "V ĠIHĠ",
746
+ "E N</w>",
747
+ "O W",
748
+ "P ĠR",
749
+ "V Ġ",
750
+ "YĠ UWĠ",
751
+ "E RĠ",
752
+ "O O",
753
+ "A G",
754
+ "U S</w>",
755
+ "K ĠAEĠ",
756
+ "R ĠAHĠ",
757
+ "C ON",
758
+ "I D",
759
+ "B ĠERĠ",
760
+ "M ĠAEĠ",
761
+ "ĠA O",
762
+ "I G",
763
+ "L ĠIYĠ",
764
+ "Y ĠAHĠ",
765
+ "S ĠEHĠ",
766
+ "OWĠ Z</w>",
767
+ "B ĠIHĠ",
768
+ "ĠIY ĠAHĠ",
769
+ "I M",
770
+ "G ĠR",
771
+ "O M",
772
+ "S E",
773
+ "AN D",
774
+ "A S</w>",
775
+ "L ĠEHĠ",
776
+ "A U",
777
+ "IHĠ NĠ",
778
+ "C K",
779
+ "T ĠR",
780
+ "P P",
781
+ "ES S</w>",
782
+ "L ĠEYĠ",
783
+ "V ER",
784
+ "G ĠAHĠ",
785
+ "T ĠIY</w>",
786
+ "AHĠ NĠ",
787
+ "F ĠAHĠ",
788
+ "L ĠAYĠ",
789
+ "L ĠAAĠ",
790
+ "T H</w>",
791
+ "R ĠIYĠ",
792
+ "S U",
793
+ "A N</w>",
794
+ "R ĠIHĠ",
795
+ "S C",
796
+ "RĠ OWĠ",
797
+ "KĠ W",
798
+ "B O",
799
+ "F ĠIHĠ",
800
+ "SĠ P",
801
+ "P H",
802
+ "S P",
803
+ "A V",
804
+ "F F",
805
+ "B ĠR",
806
+ "ĠEYĠ SHĠAHĠ",
807
+ "OWĠ LĠ",
808
+ "A L</w>",
809
+ "ĠAHĠNĠ T</w>",
810
+ "K ĠR",
811
+ "M ĠEHĠ",
812
+ "HHĠA AĠ",
813
+ "D I",
814
+ "O YĠ",
815
+ "S H</w>",
816
+ "M ĠAAĠ",
817
+ "U M",
818
+ "RĠ UWĠ",
819
+ "EL L",
820
+ "B ER",
821
+ "E YĠ",
822
+ "N E",
823
+ "S S",
824
+ "T ĠAHĠN</w>",
825
+ "EN T",
826
+ "TĠIHĠ D</w>",
827
+ "V ĠERĠ",
828
+ "O S</w>",
829
+ "NĠAHĠ S</w>",
830
+ "D S</w>",
831
+ "F OR",
832
+ "M AR",
833
+ "SĠIHĠ Z</w>",
834
+ "H E",
835
+ "P ER",
836
+ "U W</w>",
837
+ "K ĠIHĠ",
838
+ "M AN",
839
+ "U T",
840
+ "C H</w>",
841
+ "I S</w>",
842
+ "V ĠAHĠ",
843
+ "LĠ D</w>",
844
+ "MĠ Z</w>",
845
+ "HHĠ EHĠ",
846
+ "K ĠAHĠNĠ",
847
+ "T ER",
848
+ "C AR",
849
+ "N ĠIY</w>",
850
+ "P ĠIHĠ",
851
+ "R AN",
852
+ "HĠA EĠ",
853
+ "O T",
854
+ "T ĠEHĠ",
855
+ "Z ĠAHĠ",
856
+ "ĠAO ĠR",
857
+ "W ĠAAĠ",
858
+ "HHĠA EĠ",
859
+ "B ĠAAĠ",
860
+ "D ĠEHĠ",
861
+ "M ĠAHĠN</w>",
862
+ "W H",
863
+ "OĠ RĠ",
864
+ "ING S</w>",
865
+ "P ĠAEĠ",
866
+ "B ĠEHĠ",
867
+ "S ĠIYĠ",
868
+ "LĠ UWĠ",
869
+ "J HĠAHĠ",
870
+ "N ĠAAĠ",
871
+ "P ĠEHĠ",
872
+ "ĠA Y",
873
+ "B ĠAEĠ",
874
+ "I ES</w>",
875
+ "P ĠERĠ",
876
+ "A P",
877
+ "E X",
878
+ "T ĠAEĠ",
879
+ "Z ĠIHĠ",
880
+ "ES T</w>",
881
+ "ĠEHĠR ĠAHĠ",
882
+ "ĠIHĠNGĠ Z</w>",
883
+ "D ĠAAĠ",
884
+ "I A</w>",
885
+ "W A",
886
+ "JHĠ IHĠ",
887
+ "F ĠR",
888
+ "I Z",
889
+ "ĠIY ĠAH</w>",
890
+ "IL L",
891
+ "I V",
892
+ "N ĠAH</w>",
893
+ "O D",
894
+ "S K",
895
+ "T ĠERĠ",
896
+ "AN T",
897
+ "D ĠR",
898
+ "E ST",
899
+ "O G",
900
+ "U W",
901
+ "ĠE Y</w>",
902
+ "AN S</w>",
903
+ "EN T</w>",
904
+ "MĠ PĠ",
905
+ "AAĠ RĠ",
906
+ "E C",
907
+ "M ĠEYĠ",
908
+ "ET T",
909
+ "SHĠ IHĠ",
910
+ "G H",
911
+ "P ĠAAĠ",
912
+ "T ON</w>",
913
+ "SĠ TĠ",
914
+ "D IS",
915
+ "M P",
916
+ "S ĠAYĠ",
917
+ "HĠA AĠ",
918
+ "B E",
919
+ "G U",
920
+ "P AR",
921
+ "R ĠEHĠ",
922
+ "SĠT ĠR",
923
+ "CHĠ IHĠ",
924
+ "B L",
925
+ "H AR",
926
+ "N ĠEHĠ",
927
+ "P RO",
928
+ "F ĠEHĠ",
929
+ "L ĠAHĠNĠ",
930
+ "M ĠAAĠRĠ",
931
+ "R ĠAAĠ",
932
+ "T ĠEYĠ",
933
+ "B UR",
934
+ "D ĠAEĠ",
935
+ "J H</w>",
936
+ "IN S</w>",
937
+ "ATI ON</w>",
938
+ "A K",
939
+ "D ĠERĠ",
940
+ "M ON",
941
+ "P O",
942
+ "P RE",
943
+ "R ĠAEĠ",
944
+ "KĠ OWĠ",
945
+ "JHĠ EHĠ",
946
+ "O R</w>",
947
+ "S I",
948
+ "T ĠAAĠ",
949
+ "W ĠERĠ",
950
+ "F ĠERĠ",
951
+ "N I",
952
+ "W ĠEHĠ",
953
+ "EN D",
954
+ "C HĠAHĠ",
955
+ "F ĠAOĠRĠ",
956
+ "M ĠIYĠ",
957
+ "S ĠAAĠ",
958
+ "T ĠER</w>",
959
+ "B AR",
960
+ "E G",
961
+ "E V",
962
+ "H HĠAHĠ",
963
+ "L AN",
964
+ "T ĠIHĠNG</w>",
965
+ "LĠ OWĠ",
966
+ "SĠT ĠAHĠ",
967
+ "C I",
968
+ "C OR",
969
+ "D ĠER</w>",
970
+ "G ĠAAĠ",
971
+ "L ĠAOĠ",
972
+ "M O",
973
+ "T ĠERĠZ</w>",
974
+ "U D",
975
+ "SĠT ĠIHĠ",
976
+ "OU R",
977
+ "BĠAHĠ L</w>",
978
+ "D HĠ",
979
+ "H A",
980
+ "M ĠAHĠNĠ",
981
+ "ĠEYĠSHĠAHĠ N</w>",
982
+ "D ĠEYĠ",
983
+ "F I",
984
+ "K ĠAAĠRĠ",
985
+ "L ĠER</w>",
986
+ "S ĠIY</w>",
987
+ "T ĠIYĠ",
988
+ "OU S",
989
+ "ES S",
990
+ "A ST",
991
+ "B R",
992
+ "D ER",
993
+ "E L</w>",
994
+ "K ĠIY</w>",
995
+ "K ĠAAĠNĠ",
996
+ "T ĠAYĠ",
997
+ "W ĠEYĠ",
998
+ "EN S</w>",
999
+ "AT H",
1000
+ "IT Y</w>",
1001
+ "E P",
1002
+ "I ST",
1003
+ "K S</w>",
1004
+ "L ĠIHĠNG</w>",
1005
+ "N ĠAYĠ",
1006
+ "WĠ UHĠ",
1007
+ "KĠAHĠ L</w>",
1008
+ "HHĠAAĠ RĠ",
1009
+ "I Y</w>",
1010
+ "K ĠAH</w>",
1011
+ "L ĠAH</w>",
1012
+ "L ĠIYĠZ</w>",
1013
+ "N G</w>",
1014
+ "N ĠAEĠ",
1015
+ "S ĠAHĠN</w>",
1016
+ "IN E</w>",
1017
+ "ĠER ĠAHĠ",
1018
+ "G ĠIHĠ",
1019
+ "K ĠAOĠRĠ",
1020
+ "S ĠAEĠ",
1021
+ "ĠIY ĠAHĠN</w>",
1022
+ "IS H",
1023
+ "G ĠEYĠ",
1024
+ "K ĠERĠ",
1025
+ "M ĠAAĠNĠ",
1026
+ "T A",
1027
+ "W ĠAOĠ",
1028
+ "ĠA YĠAHĠ",
1029
+ "MĠ P",
1030
+ "PĠ S</w>",
1031
+ "MA N</w>",
1032
+ "B U",
1033
+ "E Y</w>",
1034
+ "K ĠEYĠ",
1035
+ "L ĠAEĠNĠ",
1036
+ "M OR",
1037
+ "T ĠAH</w>",
1038
+ "T ĠRĠAHĠ",
1039
+ "RĠ Z</w>"
1040
+ ]
1041
+ }
1042
+ }
G2P_lexicon/my_tokenizer/my_dict_256.json ADDED
@@ -0,0 +1,90 @@
1
+ {
2
+ "0": "<sos>",
3
+ "1": "<eos>",
4
+ "2": "<unk>",
5
+ "3": "<pad>",
6
+ "4": "AA1",
7
+ "5": "UW",
8
+ "6": "ER0",
9
+ "7": "F",
10
+ "8": "CH",
11
+ "9": "S",
12
+ "10": "AO1",
13
+ "11": "DH",
14
+ "12": "TH",
15
+ "13": "IY",
16
+ "14": "OW",
17
+ "15": "AH2",
18
+ "16": "W",
19
+ "17": "AH1",
20
+ "18": "AO",
21
+ "19": "D",
22
+ "20": "AW1",
23
+ "21": "OY2",
24
+ "22": "AO0",
25
+ "23": "EY0",
26
+ "24": "AH",
27
+ "25": "AE",
28
+ "26": "UH2",
29
+ "27": "OW2",
30
+ "28": "UW0",
31
+ "29": "UW1",
32
+ "30": "UH1",
33
+ "31": "ER",
34
+ "32": "EH2",
35
+ "33": "UW2",
36
+ "34": "ER2",
37
+ "35": "OY",
38
+ "36": "AE0",
39
+ "37": "AY",
40
+ "38": "K",
41
+ "39": "AA0",
42
+ "40": "T",
43
+ "41": "EH0",
44
+ "42": "SH",
45
+ "43": "ER1",
46
+ "44": "G",
47
+ "45": "EY",
48
+ "46": "AH0",
49
+ "47": "IH0",
50
+ "48": "L",
51
+ "49": "AE2",
52
+ "50": "B",
53
+ "51": "OY0",
54
+ "52": "EH",
55
+ "53": "AA2",
56
+ "54": "IH",
57
+ "55": "M",
58
+ "56": "AY0",
59
+ "57": "UH",
60
+ "58": "EY2",
61
+ "59": "IY2",
62
+ "60": "EY1",
63
+ "61": "HH",
64
+ "62": "P",
65
+ "63": "AE1",
66
+ "64": "OW1",
67
+ "65": "R",
68
+ "66": "IH1",
69
+ "67": "Z",
70
+ "68": "IH2",
71
+ "69": "IY0",
72
+ "70": "V",
73
+ "71": "JH",
74
+ "72": "OY1",
75
+ "73": "Y",
76
+ "74": "N",
77
+ "75": "AO2",
78
+ "76": "AW",
79
+ "77": "UH0",
80
+ "78": "IY1",
81
+ "79": "AW0",
82
+ "80": "AA",
83
+ "81": "NG",
84
+ "82": "AY1",
85
+ "83": "EH1",
86
+ "84": "AY2",
87
+ "85": "OW0",
88
+ "86": "AW2",
89
+ "87": "ZH"
90
+ }
G2P_lexicon/sp_tokenizer.py ADDED
@@ -0,0 +1,87 @@
+ import json
+
+
+ class Tokenizer_sp:
+     def __init__(self, config: dict = None, srs: bool = True, dict_path=None, text=None):
+         if config is None:
+             config = {}
+
+         self.sos = config.get('BOS_TOKEN', '<sos>')
+         self.eos = config.get('EOS_TOKEN', '<eos>')
+         self.unk = config.get('UNK_TOKEN', '<unk>')
+         self.pad = config.get('PAD_TOKEN', '<pad>')
+         self.tokens = []
+         self.srs = srs
+
+         if dict_path:
+             self.load_dict_from_file(dict_path)
+         elif text:
+             self.create_tokenizer(text)
+         else:
+             raise ValueError("No texts or dictionary path provided")
+
+     def create_tokenizer(self, texts):
+         tokens = []
+
+         for phonemes_list in texts:
+             for phoneme in phonemes_list:
+                 tokens.append(phoneme)
+
+         self.tokens = [self.sos, self.eos, self.unk, self.pad] + list(set(tokens))
+
+         self.token2idx = {token: int(i) for i, token in enumerate(self.tokens)}
+         self.idx2token = {int(i): token for i, token in enumerate(self.tokens)}
+
+         self.unk_idx = self.token2idx[self.unk]
+         self.sos_idx = self.token2idx[self.sos]
+         self.eos_idx = self.token2idx[self.eos]
+         self.pad_idx = self.token2idx[self.pad]
+
+     def load_dict_from_file(self, file_path):
+         with open(file_path, 'r') as file:
+             data = json.load(file)
+
+         # The JSON maps index strings to phoneme tokens, e.g. "4": "AA1".
+         self.idx2token = {int(idx): token for idx, token in data.items()}
+         self.token2idx = {token: idx for idx, token in self.idx2token.items()}
+
+         self.unk_idx = self.token2idx.get(self.unk)
+         self.sos_idx = self.token2idx.get(self.sos)
+         self.eos_idx = self.token2idx.get(self.eos)
+         self.pad_idx = self.token2idx.get(self.pad)
+
+     def tokenize(self, text):
+         if not self.srs:
+             tokens = []
+             for tok in text:
+                 if tok in self.token2idx:
+                     tokens.append(tok)
+                 else:
+                     tokens.append(self.unk)
+             return [self.sos] + tokens + [self.eos]
+         else:
+             return [self.sos] + list(text) + [self.eos]
+
+     def convert_tokens_to_idx(self, tokens):
+         idx_list = [self.token2idx.get(tok, self.unk_idx) for tok in tokens]
+         return idx_list
+
+     def encode(self, text, seq_len=None):
+         tokens = self.tokenize(text)[:seq_len]
+         return self.convert_tokens_to_idx(tokens)
+
+     def decode(self, idx_list):
+         ans = []
+         for idx in idx_list:
+             try:
+                 ans.append(self.idx2token[int(idx)])
+             except KeyError:
+                 ans.append(self.idx2token[self.unk_idx])
+         return ans
+
+     def get_vocab_size(self):
+         return len(self.token2idx)
+
+
+ if __name__ == "__main__":
+     tokenizer_sp = Tokenizer_sp(dict_path='./my_tokenizer/my_dict_256.json')
+     print(tokenizer_sp.idx2token)
G2P_lexicon/transformer.py ADDED
@@ -0,0 +1,167 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import math
+
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_seq_length):
+         super(PositionalEncoding, self).__init__()
+
+         pe = torch.zeros(max_seq_length, d_model)
+         position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+
+         self.register_buffer('pe', pe.unsqueeze(0))
+
+     def forward(self, x):
+         return x + self.pe[:, :x.size(1)]
+
+
+ class MultiHeadSelfAttention(nn.Module):
+     def __init__(self, d_model, num_heads):
+         super(MultiHeadSelfAttention, self).__init__()
+         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.depth = d_model // num_heads
+
+         self.wq = nn.Linear(d_model, d_model)
+         self.wk = nn.Linear(d_model, d_model)
+         self.wv = nn.Linear(d_model, d_model)
+
+         self.fc = nn.Linear(d_model, d_model)
+
+     def split_heads(self, x, batch_size):
+         x = x.view(batch_size, -1, self.num_heads, self.depth)
+         return x.permute(0, 2, 1, 3)
+
+     def forward(self, q, k, v, mask=None):
+         batch_size = q.size(0)
+
+         q = self.split_heads(self.wq(q), batch_size)
+         k = self.split_heads(self.wk(k), batch_size)
+         v = self.split_heads(self.wv(v), batch_size)
+
+         scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.depth, dtype=torch.float32))
+         if mask is not None:
+             scores = scores.masked_fill(mask == 0, -1e9)
+         attn = F.softmax(scores, dim=-1)
+
+         out = torch.matmul(attn, v)
+         out = out.permute(0, 2, 1, 3).contiguous()
+         out = out.view(batch_size, -1, self.d_model)
+
+         out = self.fc(out)
+         return out
+
+
+ class FeedForwardNetwork(nn.Module):
+     def __init__(self, d_model, d_ff, dropout=0.1):
+         super(FeedForwardNetwork, self).__init__()
+         self.fc1 = nn.Linear(d_model, d_ff)
+         self.fc2 = nn.Linear(d_ff, d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         x = self.fc1(x)
+         x = F.relu(x)
+         x = self.dropout(x)
+         x = self.fc2(x)
+         return x
+
+
+ class EncoderLayer(nn.Module):
+     def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+         super(EncoderLayer, self).__init__()
+         self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
+         self.ffn = FeedForwardNetwork(d_model, d_ff, dropout)
+
+         self.layernorm1 = nn.LayerNorm(d_model)
+         self.layernorm2 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, mask=None):
+         attn_output = self.self_attn(x, x, x, mask)
+         x = self.layernorm1(x + self.dropout(attn_output))
+
+         ffn_output = self.ffn(x)
+         x = self.layernorm2(x + self.dropout(ffn_output))
+         return x
+
+
+ class DecoderLayer(nn.Module):
+     def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+         super(DecoderLayer, self).__init__()
+         self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
+         self.cross_attn = MultiHeadSelfAttention(d_model, num_heads)
+         self.ffn = FeedForwardNetwork(d_model, d_ff, dropout)
+
+         self.layernorm1 = nn.LayerNorm(d_model)
+         self.layernorm2 = nn.LayerNorm(d_model)
+         self.layernorm3 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
+         self_attn_output = self.self_attn(q=x, k=x, v=x, mask=tgt_mask)
+         x = self.layernorm1(x + self.dropout(self_attn_output))
+
+         cross_attn_output = self.cross_attn(q=x, k=enc_output, v=enc_output, mask=src_mask)
+         x = self.layernorm2(x + self.dropout(cross_attn_output))
+
+         ffn_output = self.ffn(x)
+         x = self.layernorm3(x + self.dropout(ffn_output))
+         return x
+
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, tokenizer=None, config=None, stress=False):
+         super(TransformerBlock, self).__init__()
+
+         self.config = config
+         self.tokenizer = tokenizer
+         self.input_vocab_size = tokenizer.get_vocab_size()
+         self.target_vocab_size = tokenizer.get_vocab_size()
+         self.d_model = config.get('D_MODEL', 512)
+         self.num_heads = config.get('NUM_HEADS', 8)
+         self.num_encoder_layers = config.get('NUM', 6)
+         self.num_decoder_layers = config.get('NUM', 6)
+         self.d_ff = config.get('D_FF', 2048)
+         self.dropout = config.get('DROPOUT', 0.1)
+         self.stress = stress
+
+         self.encoder_embedding = nn.Embedding(self.input_vocab_size, self.d_model)
+         self.decoder_embedding = nn.Embedding(self.target_vocab_size, self.d_model)
+
+         self.pos_embedding = PositionalEncoding(self.d_model, config.get('MAX_LEN', 32))
+
+         self.encoder_layers = nn.ModuleList(
+             [EncoderLayer(self.d_model, self.num_heads, self.d_ff, self.dropout) for _ in
+              range(self.num_encoder_layers)])
+         self.decoder_layers = nn.ModuleList(
+             [DecoderLayer(self.d_model, self.num_heads, self.d_ff, self.dropout) for _ in
+              range(self.num_decoder_layers)])
+
+         self.fc_out = nn.Linear(self.d_model, self.target_vocab_size)
+
+     def encode(self, src, src_mask):
+         src = self.pos_embedding(self.encoder_embedding(src))
+         for layer in self.encoder_layers:
+             src = layer(src, src_mask)
+         return src
+
+     def decode(self, memory, src_mask, tgt, tgt_mask):
+         tgt = self.pos_embedding(self.decoder_embedding(tgt))
+         for layer in self.decoder_layers:
+             tgt = layer(tgt, memory, src_mask, tgt_mask)
+         return tgt
+
+     def forward(self, src, tgt, src_mask, tgt_mask):
+         memory = self.encode(src, src_mask)
+         output = self.decode(memory, src_mask, tgt, tgt_mask)
+         output = self.fc_out(output)
+         return output
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 NikiPshg
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,45 @@
- ---
- license: mit
- ---
+ # Grapheme to Phoneme (G2P) with Stress
+
+ This project provides a Grapheme to Phoneme (G2P) conversion tool that first checks the CMU Pronouncing Dictionary for phoneme translations. If a word is not found in the dictionary, it uses two Transformer-based models to generate the phonemes and add stress markers. The output is in ARPAbet format, and the tool can also convert graphemes into phoneme integer indices.
+
+ ## Features
+
+ 1. **CMU Pronouncing Dictionary Integration**: First checks the CMU dictionary for phoneme translations.
+ 2. **Transformer-Based Conversion**:
+    - **Phoneme Generation**: The first Transformer model converts graphemes into phonemes.
+    - **Stress Addition**: The second Transformer model adds stress markers to the phonemes.
+ 3. **ARPAbet Output**: Outputs phonemes in ARPAbet format.
+ 4. **Phoneme Integer Indices**: Converts graphemes to phoneme integer indices.
+
+ ## Installation
+
+ 1. Clone the repository:
+    ```sh
+    git clone https://github.com/NikiPshg/G2P_en_lex.git
+    cd G2P_en_lex
+    ```
+
+ 2. Install the required dependencies:
+    ```sh
+    pip install -r requiremenst.txt
+    ```
+
+ ### Example
+
+ ```python
+ from G2P_lexicon import g2p_en_lexicon
+
+ # Initialize the G2P converter
+ g2p_converter = g2p_en_lexicon()
+
+ # Convert a text to phonemes (ARPAbet, without stress markers)
+ text = "text, numbers, and some strange symbols !№;% 21"
+ phonemes = g2p_converter(text, with_stress=False)
+ # ['T', 'EH', 'K', 'S', 'T', ' ', ',', ' ', 'N', 'AH', 'M', 'B', 'ER', 'Z', ' ', ',', ' ',
+ #  'AH', 'N', 'D', ' ', 'S', 'AH', 'M', ' ', 'S', 'T', 'R', 'EY', 'N', 'JH', ' ',
+ #  'S', 'IH', 'M', 'B', 'AH', 'L', 'Z', ' ',
+ #  'T', 'W', 'EH', 'N', 'IY', 'W', 'AH', 'N']
+ ```
+
+
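The README example only shows the unstressed output. Below is a minimal sketch of the stressed output and of the integer indices mentioned in feature 4; it assumes the word is not in the bundled CMU dictionary (so it falls through to the two models) and reuses `Tokenizer_sp` with the `my_dict_256.json` file shipped under `G2P_lexicon/my_tokenizer/`, with the path given relative to the repository root.

```python
from G2P_lexicon import g2p_en_lexicon
from G2P_lexicon.sp_tokenizer import Tokenizer_sp

g2p = g2p_en_lexicon()

# Stressed ARPAbet output: 0 = unstressed, 1 = primary stress, 2 = secondary stress
stressed = g2p("NIKITA", with_stress=True)
# expected, based on the in-repo examples: ['N', 'IH2', 'K', 'IY1', 'T', 'AH0']

# Map the phonemes to the integer indices used by the stress model's tokenizer
tokenizer = Tokenizer_sp(dict_path="G2P_lexicon/my_tokenizer/my_dict_256.json")
indices = tokenizer.convert_tokens_to_idx(stressed)
print(stressed, indices)
```

The exact integers depend on `my_dict_256.json`, since that file defines the index-to-phoneme mapping.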
requiremenst.txt ADDED
Binary file (884 Bytes). View file
 
test.py ADDED
@@ -0,0 +1,5 @@
+ from G2P_lexicon import g2p_en_lexicon
+ text = "text, numbers, and some strange symbols !№;% 21"
+ g2p = g2p_en_lexicon()
+ phonemes = g2p(text, with_stress=False)
+ print(phonemes)