NikiPshg committed on
Commit 9ba7d3b · verified · 1 Parent(s): 673d6e3

Upload 27 files

.gitignore ADDED
@@ -0,0 +1,3 @@
+ venv
+ .idea
+ **/__pycache__/
G2P_lexicon/G2P.py ADDED
@@ -0,0 +1,87 @@
+ import torch
+ from G2P_lexicon.transformer import TransformerBlock
+ from tokenizers import Tokenizer
+ from G2P_lexicon.config_models import config_g2p
+ import os
+
+ dirname = os.path.dirname(__file__)
+
+
+ def decode_form_G(tokens: str):
+     """
+     Converts model output to a readable format.
+     Args:
+         tokens: 'NĠAH1ĠMĠBĠER0ĠZ'
+     Returns:
+         ['N', 'AH1', 'M', 'B', 'ER0', 'Z']
+     """
+     return ''.join(tokens).split('Ġ')
+
+
+ class GraphemeToPhoneme:
+     def __init__(self,
+                  model,
+                  tokenizer):
+
+         self.g2p_model = model
+         self.tokenizer = tokenizer
+
+         self.g2p_model.eval()
+
+     def greedy_decode_grapheme(self, model,
+                                src,
+                                src_mask,
+                                max_len,
+                                start_token):
+         src = src.unsqueeze(0)
+         src_mask = src_mask.unsqueeze(0)
+         input_decoder = model.encode(src, src_mask)
+         label = torch.zeros(1, 1).fill_(start_token).type_as(src.data)
+
+         for _ in range(max_len - 1):
+             tgt_mask = (torch.tril(torch.ones((label.size(1), label.size(1)))).type_as(src.data)).unsqueeze(0)
+             out = model.decode(input_decoder, src_mask, label, tgt_mask)
+             prob = model.fc_out(out[:, -1])
+             _, next_word = torch.max(prob, dim=1)
+             next_word = next_word.item()
+             label = torch.cat([label, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
+             if next_word == self.tokenizer.encode("<eos>").ids[0]:
+                 break
+
+         pred = decode_form_G(self.tokenizer.decode(label[0].tolist()))
+         return pred
+
+     def __call__(self, srs):
+         with torch.no_grad():
+             enc_input_tokens = self.tokenizer.encode(srs).ids
+             pad_id = self.tokenizer.encode("<pad>").ids[0]
+             enc_num_padding_tokens = 32 - len(enc_input_tokens) - 2
+             encoder_input = torch.cat([
+                 torch.tensor([self.tokenizer.encode("<bos>").ids[0]]),
+                 torch.tensor(enc_input_tokens),
+                 torch.tensor([self.tokenizer.encode("<eos>").ids[0]]),
+                 torch.tensor([pad_id] * enc_num_padding_tokens)
+             ], dim=0)
+
+             encoder_mask = (encoder_input != pad_id).unsqueeze(0).unsqueeze(0).int()
+             pred = self.greedy_decode_grapheme(
+                 model=self.g2p_model,
+                 src=encoder_input,
+                 src_mask=encoder_mask,
+                 max_len=32,
+                 start_token=self.tokenizer.encode("<bos>").ids[0]
+             )
+             return pred
+
+
+ dict_path = os.path.join(dirname, "my_tokenizer/bpe_512_lex.json")
+ model_path = os.path.join(dirname, "models/model0.07.pt")
+
+ tokenizer_g2p = Tokenizer.from_file(dict_path)
+ g2p_model = TransformerBlock(config=config_g2p, tokenizer=tokenizer_g2p)
+ g2p_model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
+
+ G2P = GraphemeToPhoneme(g2p_model, tokenizer_g2p)
+
+ if __name__ == '__main__':
+     print(G2P('NIKITA'))  # Expected output: ['N', 'IH', 'K', 'IY', 'T', 'AH']
G2P_lexicon/G2P_en_lexicon.py ADDED
@@ -0,0 +1,70 @@
+ from G2P_lexicon.G2P import G2P
+ from G2P_lexicon.SP import SP
+ from G2P_lexicon.data_preparation import preprocess_text
+ import string
+ import json
+ import time
+ import os
+
+
+ dirname = os.path.dirname(__file__)
+ json_path = os.path.join(dirname, "data/word2phoneme.json")
+
+ with open(json_path) as json_file:
+     phoneme2grapheme_dict = json.load(json_file)
+
+
+ class g2p_en_lexicon:
+     def __init__(self):
+         self.G2P = G2P
+         self.SP = SP
+
+     def cleaan_stress(self, seq: list):
+         return [phoneme[:-1] if phoneme[-1].isdigit() else phoneme for phoneme in seq]
+
+     def pred_with_stress(self, seq):
+         return self.SP(self.G2P(seq))
+
+     def check_punctuation(self, word):
+         return any(char in string.punctuation for char in word)
+
+     def __call__(self, seq, with_stress=True):
+         seq_list = preprocess_text(seq)
+         result = []
+         count_from_dict = 0
+         count_from_model = 0
+         for word in seq_list:
+             phonemes_from_dict = phoneme2grapheme_dict.get(word)
+             if phonemes_from_dict is None:
+                 if self.check_punctuation(word):
+                     result.extend([word] + [' '])
+                 else:
+                     count_from_model += 1
+                     if with_stress:
+                         pred_stress = self.pred_with_stress(word)
+                         # print(f"{word} -- {pred_stress}")
+                         result.extend(pred_stress + [' '])
+                     else:
+                         pred_without = self.G2P(word)
+                         # print(f"{word} -- {pred_without}")
+                         result.extend(pred_without + [' '])
+             else:
+                 count_from_dict += 1
+                 result.extend(phonemes_from_dict + [' '])
+
+         # print(f"{count_from_dict} -- from json\n"
+         #       f"{count_from_model} -- from model")
+         result = result[:-1] if result and result[-1] == ' ' else result
+
+         if not with_stress:
+             return self.cleaan_stress(result)
+         return result
+
+
+ if __name__ == '__main__':
+     G2P_en_lexicon = g2p_en_lexicon()
+     text = """mtusi is the worst option for a programmer or a student"""
+     start_time = time.time()
+     print(G2P_en_lexicon(text))
+     end_time = time.time()
+     print(f"{(end_time - start_time) * 1000} ms -- time taken")
G2P_lexicon/SP.py ADDED
@@ -0,0 +1,88 @@
+ from G2P_lexicon.config_models import config_sp
+ from G2P_lexicon.transformer import TransformerBlock
+ from G2P_lexicon.sp_tokenizer import Tokenizer_sp
+ import torch
+ import os
+
+ dirname = os.path.dirname(__file__)
+
+
+ class Stress_Pred:
+     def __init__(self,
+                  model,
+                  tokenizer):
+
+         self.SP = model
+         self.tokenizer = tokenizer
+
+         self.SP.eval()
+
+     def __call__(self, srs):
+         with torch.no_grad():
+             enc_input_tokens = self.tokenizer.encode(srs)
+             pad_id = torch.tensor(self.tokenizer.pad_idx)
+             enc_num_padding_tokens = 32 - len(enc_input_tokens)
+             encoder_input = torch.cat(
+                 [
+                     torch.tensor(enc_input_tokens),
+                     pad_id.repeat(enc_num_padding_tokens)
+                 ],
+                 dim=0)
+
+             encoder_mask = (encoder_input != pad_id).unsqueeze(0).unsqueeze(0).int()
+             label = self.greedy_decode_stress(
+                 src=encoder_input,
+                 src_mask=encoder_mask,
+                 start_token=self.tokenizer.sos_idx,
+             )
+             return label
+
+     def greedy_decode_stress(self,
+                              src,
+                              src_mask,
+                              start_token):
+         len_src = (src != 3).int().sum().item()
+         # Positions whose tokens are not bare vowel symbols (consonants, <sos>, <eos>)
+         # are copied through unchanged; a stressed variant is only predicted for vowels.
+         index_vowels = torch.tensor([idx for (idx, i) in enumerate(src) if not (i in list_tokens_without_stress)])[
+                        :len_src]
+         src = src.unsqueeze(0)
+         src_mask = src_mask.unsqueeze(0)
+         input_decoder = self.SP.encode(src, src_mask)
+         label = torch.tensor([]).type_as(src.data)
+
+         for idx in range(len_src):
+             if idx in index_vowels:
+                 label = torch.cat([label, torch.ones(1, 1).type_as(src.data).fill_(src[0][idx])], dim=1)
+             else:
+                 tgt_mask = (torch.tril(torch.ones((label.size(1), label.size(1)))).type_as(src.data)).unsqueeze(0)
+                 out = self.SP.decode(input_decoder, src_mask, label, tgt_mask)
+                 prob = self.SP.fc_out(out[:, -1])
+
+                 _, next_word = torch.max(prob, dim=1)
+                 next_word = next_word.data[0]
+                 label = torch.cat([label, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)
+
+         pred = self.tokenizer.decode(label[0].tolist())[1:-1]
+         return pred
+
+
+ dict_path = os.path.join(dirname, "my_tokenizer/my_dict_256.json")
+ model_path = os.path.join(dirname, "models/model_0.159.pt")
+
+ tokenizer_sp = Tokenizer_sp(dict_path=dict_path)
+
+ set_tokens_without_stress = set()
+ for token, phoneme in tokenizer_sp.idx2token.items():
+     if phoneme[-1].isdigit():
+         set_tokens_without_stress.add(tokenizer_sp.token2idx[phoneme[:-1]])
+ list_tokens_without_stress = list(set_tokens_without_stress)
+
+ sp_model = TransformerBlock(config=config_sp,
+                             tokenizer=tokenizer_sp)
+ sp_model.load_state_dict(
+     torch.load(model_path, map_location=torch.device('cpu')))
+
+ SP = Stress_Pred(model=sp_model,
+                  tokenizer=tokenizer_sp)
+
+ if __name__ == '__main__':
+     print(SP(['N', 'IH', 'K', 'IY', 'T', 'AH']))  # ['N', 'IH2', 'K', 'IY1', 'T', 'AH0']
G2P_lexicon/__init__.py ADDED
@@ -0,0 +1 @@
+ from G2P_lexicon.G2P_en_lexicon import g2p_en_lexicon
G2P_lexicon/__pycache__/G2P.cpython-311.pyc ADDED
Binary file (5.97 kB). View file
 
G2P_lexicon/__pycache__/G2P_en_lexicon.cpython-311.pyc ADDED
Binary file (4.38 kB). View file
 
G2P_lexicon/__pycache__/SP.cpython-311.pyc ADDED
Binary file (6.28 kB). View file
 
G2P_lexicon/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (233 Bytes). View file
 
G2P_lexicon/__pycache__/config_models.cpython-311.pyc ADDED
Binary file (343 Bytes). View file
 
G2P_lexicon/__pycache__/data_preparation.cpython-311.pyc ADDED
Binary file (3.31 kB). View file
 
G2P_lexicon/__pycache__/sp_tokenizer.cpython-311.pyc ADDED
Binary file (6.76 kB). View file
 
G2P_lexicon/__pycache__/transformer.cpython-311.pyc ADDED
Binary file (13.7 kB). View file
 
G2P_lexicon/config_models.py ADDED
@@ -0,0 +1,15 @@
+ config_sp = {
+     "D_MODEL": 256,
+     "D_FF": 1024,
+     "NUM": 3,
+     "NUM_HEADS": 4,
+     "MAX_LEN": 32,
+ }
+
+ config_g2p = {
+     "D_MODEL": 512,
+     "D_FF": 2048,
+     "NUM": 6,
+     "NUM_HEADS": 8,
+     "MAX_LEN": 32,
+ }
G2P_lexicon/data/word2phoneme.json ADDED
The diff for this file is too large to render. See raw diff
 
G2P_lexicon/data_preparation.py ADDED
@@ -0,0 +1,75 @@
+ import re
+
+
+ def intToWord(number):
+     ones = ("", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine")
+     tens = ("", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety")
+     teens = (
+         "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen")
+     levels = (
+         "", "thousand", "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion", "septillion",
+         "octillion", "nonillion")
+
+     word = ""
+     num = reversed(str(number))
+     number = ""
+     for x in num:
+         number += x
+     del num
+     if len(number) % 3 == 1: number += "0"
+     x = 0
+     for digit in number:
+         if x % 3 == 0:
+             word = levels[x // 3] + " " + word
+             n = int(digit)
+         elif x % 3 == 1:
+             if digit == "1":
+                 num = teens[n]
+             else:
+                 num = tens[int(digit)]
+                 if n:
+                     if num:
+                         num += ones[n]
+                     else:
+                         num = ones[n]
+             word = num + " " + word
+         elif x % 3 == 2:
+             if digit != "0":
+                 word = ones[int(digit)] + " hundred " + word
+         x += 1
+     return word.strip(" ")
+
+
+ def preprocess_text(text):
+     """
+     Normalizes the text, separating periods and commas into their own tokens.
+     srs:
+         Hello, World! This is a sample text with numbers 12345 and symbols #$%.
+     return:
+         ['HELLO', ',', 'WORLD', 'THIS', 'IS', 'A', 'SAMPLE', 'TEXT', 'WITH', 'NUMBERS', 'AND', 'SYMBOLS', '.']
+     """
+     if text and not text.isspace():
+
+         text = text.upper()
+         text = re.sub(r'([.,])', r' \1 ', text)
+
+         text = re.sub(r'[^A-Z .,0-9]', '', text)
+         text = re.sub(r'\s+', ' ', text).strip()
+
+         text = text.split()
+         result = []
+         for word in text:
+             if word.isdigit():
+                 result = result + (intToWord(word).upper()).split()
+             else:
+                 result.append(word)
+     else:
+         result = ['please enter some text :(']
+
+     return result
+
+
+ if __name__ == "__main__":
+     sample_text = "Hello, World! This is a sample text with numbers 12345 and symbols #$%."
+     processed_text = preprocess_text(sample_text)
+     print("Processed text:", processed_text)
G2P_lexicon/models/.gitignore ADDED
@@ -0,0 +1,2 @@
+ model_0.159.pt
+ model0.07.pt
G2P_lexicon/models/model0.07.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f7bb918136264dac82f564a0fe07964607e9494701a8018eaa5c2be16c6bd89a
+ size 179866303
G2P_lexicon/models/model_0.159.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ce2f8269e96abaf00086f4c61043046656deb8cf397ce7f1501d2f354dd6bea7
+ size 22471914
G2P_lexicon/my_tokenizer/bpe_512_lex.json ADDED
@@ -0,0 +1,1042 @@
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<unk>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 512,
17
+ "content": "<pad>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 513,
26
+ "content": "<bos>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 514,
35
+ "content": "<eos>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ }
42
+ ],
43
+ "normalizer": {
44
+ "type": "BertNormalizer",
45
+ "clean_text": true,
46
+ "handle_chinese_chars": true,
47
+ "strip_accents": null,
48
+ "lowercase": false
49
+ },
50
+ "pre_tokenizer": {
51
+ "type": "BertPreTokenizer"
52
+ },
53
+ "post_processor": null,
54
+ "decoder": {
55
+ "type": "BPEDecoder",
56
+ "suffix": "</w>"
57
+ },
58
+ "model": {
59
+ "type": "BPE",
60
+ "dropout": null,
61
+ "unk_token": "<unk>",
62
+ "continuing_subword_prefix": null,
63
+ "end_of_word_suffix": "</w>",
64
+ "fuse_unk": false,
65
+ "byte_fallback": false,
66
+ "ignore_merges": false,
67
+ "vocab": {
68
+ "<unk>": 0,
69
+ "A": 1,
70
+ "B": 2,
71
+ "C": 3,
72
+ "D": 4,
73
+ "E": 5,
74
+ "F": 6,
75
+ "G": 7,
76
+ "H": 8,
77
+ "I": 9,
78
+ "J": 10,
79
+ "K": 11,
80
+ "L": 12,
81
+ "M": 13,
82
+ "N": 14,
83
+ "O": 15,
84
+ "P": 16,
85
+ "Q": 17,
86
+ "R": 18,
87
+ "S": 19,
88
+ "T": 20,
89
+ "U": 21,
90
+ "V": 22,
91
+ "W": 23,
92
+ "X": 24,
93
+ "Y": 25,
94
+ "Z": 26,
95
+ "Ġ": 27,
96
+ "G</w>": 28,
97
+ "E</w>": 29,
98
+ "A</w>": 30,
99
+ "D</w>": 31,
100
+ "S</w>": 32,
101
+ "L</w>": 33,
102
+ "Z</w>": 34,
103
+ "N</w>": 35,
104
+ "Y</w>": 36,
105
+ "T</w>": 37,
106
+ "H</w>": 38,
107
+ "R</w>": 39,
108
+ "F</w>": 40,
109
+ "W</w>": 41,
110
+ "P</w>": 42,
111
+ "V</w>": 43,
112
+ "M</w>": 44,
113
+ "K</w>": 45,
114
+ "U</w>": 46,
115
+ "B</w>": 47,
116
+ "I</w>": 48,
117
+ "C</w>": 49,
118
+ "O</w>": 50,
119
+ "X</w>": 51,
120
+ "J</w>": 52,
121
+ "Q</w>": 53,
122
+ "HĠ": 54,
123
+ "ĠA": 55,
124
+ "ĠI": 56,
125
+ "ĠAHĠ": 57,
126
+ "ĠE": 58,
127
+ "ĠIHĠ": 59,
128
+ "YĠ": 60,
129
+ "RĠ": 61,
130
+ "NĠ": 62,
131
+ "AĠ": 63,
132
+ "WĠ": 64,
133
+ "EĠ": 65,
134
+ "ĠAAĠ": 66,
135
+ "ĠEHĠ": 67,
136
+ "SĠ": 68,
137
+ "ĠAEĠ": 69,
138
+ "IN": 70,
139
+ "ĠR": 71,
140
+ "ER": 72,
141
+ "ĠIYĠ": 73,
142
+ "LĠ": 74,
143
+ "KĠ": 75,
144
+ "ĠERĠ": 76,
145
+ "HĠA": 77,
146
+ "ĠIY</w>": 78,
147
+ "OWĠ": 79,
148
+ "AN": 80,
149
+ "AR": 81,
150
+ "ĠEYĠ": 82,
151
+ "OĠ": 83,
152
+ "TĠ": 84,
153
+ "EN": 85,
154
+ "ON": 86,
155
+ "GĠ": 87,
156
+ "ĠAHĠNĠ": 88,
157
+ "ES</w>": 89,
158
+ "ĠAOĠ": 90,
159
+ "AT": 91,
160
+ "UWĠ": 92,
161
+ "ĠIHĠN": 93,
162
+ "OR": 94,
163
+ "MĠ": 95,
164
+ "IHĠ": 96,
165
+ "AL": 97,
166
+ "ĠAYĠ": 98,
167
+ "EL": 99,
168
+ "ĠAH</w>": 100,
169
+ "ST": 101,
170
+ "DĠ": 102,
171
+ "ED</w>": 103,
172
+ "CH": 104,
173
+ "ĠER</w>": 105,
174
+ "HĠAHĠ": 106,
175
+ "ĠAHĠN</w>": 107,
176
+ "RE": 108,
177
+ "SĠT": 109,
178
+ "ĠIY": 110,
179
+ "ER</w>": 111,
180
+ "IL": 112,
181
+ "EHĠ": 113,
182
+ "IS": 114,
183
+ "PĠ": 115,
184
+ "IT": 116,
185
+ "BĠ": 117,
186
+ "OU": 118,
187
+ "ES": 119,
188
+ "ING</w>": 120,
189
+ "ĠIHĠNG</w>": 121,
190
+ "LĠAHĠ": 122,
191
+ "HHĠ": 123,
192
+ "AHĠ": 124,
193
+ "ET": 125,
194
+ "OW</w>": 126,
195
+ "OL": 127,
196
+ "NĠAHĠ": 128,
197
+ "RA": 129,
198
+ "IC": 130,
199
+ "JHĠ": 131,
200
+ "TĠIHĠ": 132,
201
+ "ĠER": 133,
202
+ "LĠIY</w>": 134,
203
+ "LĠIHĠ": 135,
204
+ "UN": 136,
205
+ "ĠAAĠRĠ": 137,
206
+ "RO": 138,
207
+ "KĠAHĠ": 139,
208
+ "SĠIHĠ": 140,
209
+ "TĠS</w>": 141,
210
+ "ĠERĠZ</w>": 142,
211
+ "ON</w>": 143,
212
+ "HHĠA": 144,
213
+ "LA": 145,
214
+ "ĠAOĠRĠ": 146,
215
+ "DĠIHĠ": 147,
216
+ "SHĠ": 148,
217
+ "ĠIYĠZ</w>": 149,
218
+ "SĠAHĠ": 150,
219
+ "MA": 151,
220
+ "RI": 152,
221
+ "CHĠ": 153,
222
+ "ERS</w>": 154,
223
+ "LE": 155,
224
+ "NĠZ</w>": 156,
225
+ "TĠAHĠ": 157,
226
+ "SHĠAHĠ": 158,
227
+ "UR": 159,
228
+ "FĠ": 160,
229
+ "UHĠ": 161,
230
+ "AM": 162,
231
+ "ĠAEĠNĠ": 163,
232
+ "SH": 164,
233
+ "AEĠ": 165,
234
+ "ĠEHĠR": 166,
235
+ "ING": 167,
236
+ "ĠAAĠNĠ": 168,
237
+ "LI": 169,
238
+ "ĠAHĠNĠZ</w>": 170,
239
+ "ĠRĠAHĠ": 171,
240
+ "LĠZ</w>": 172,
241
+ "TH": 173,
242
+ "TI": 174,
243
+ "DĠAHĠ": 175,
244
+ "ĠEHĠNĠ": 176,
245
+ "BĠAHĠ": 177,
246
+ "CO": 178,
247
+ "MĠAHĠ": 179,
248
+ "LY</w>": 180,
249
+ "ĠIHĠNGĠ": 181,
250
+ "DE": 182,
251
+ "ĠRĠIHĠ": 183,
252
+ "LO": 184,
253
+ "MĠIHĠ": 185,
254
+ "NG��": 186,
255
+ "UL": 187,
256
+ "NĠIHĠ": 188,
257
+ "ONS</w>": 189,
258
+ "AB": 190,
259
+ "THĠ": 191,
260
+ "WĠIHĠ": 192,
261
+ "AD": 193,
262
+ "IN</w>": 194,
263
+ "QU": 195,
264
+ "AAĠ": 196,
265
+ "AS": 197,
266
+ "KĠAAĠ": 198,
267
+ "ZĠ": 199,
268
+ "DĠZ</w>": 200,
269
+ "IR": 201,
270
+ "PĠAHĠ": 202,
271
+ "SĠT</w>": 203,
272
+ "AC": 204,
273
+ "RU": 205,
274
+ "ATI": 206,
275
+ "EM": 207,
276
+ "SĠK": 208,
277
+ "ĠAWĠ": 209,
278
+ "KĠS</w>": 210,
279
+ "ED": 211,
280
+ "LE</w>": 212,
281
+ "LĠAEĠ": 213,
282
+ "TS</w>": 214,
283
+ "YS</w>": 215,
284
+ "IYĠ": 216,
285
+ "VĠIHĠ": 217,
286
+ "EN</w>": 218,
287
+ "OW": 219,
288
+ "PĠR": 220,
289
+ "VĠ": 221,
290
+ "YĠUWĠ": 222,
291
+ "ERĠ": 223,
292
+ "OO": 224,
293
+ "AG": 225,
294
+ "US</w>": 226,
295
+ "KĠAEĠ": 227,
296
+ "RĠAHĠ": 228,
297
+ "CON": 229,
298
+ "ID": 230,
299
+ "BĠERĠ": 231,
300
+ "MĠAEĠ": 232,
301
+ "ĠAO": 233,
302
+ "IG": 234,
303
+ "LĠIYĠ": 235,
304
+ "YĠAHĠ": 236,
305
+ "SĠEHĠ": 237,
306
+ "OWĠZ</w>": 238,
307
+ "BĠIHĠ": 239,
308
+ "ĠIYĠAHĠ": 240,
309
+ "IM": 241,
310
+ "GĠR": 242,
311
+ "OM": 243,
312
+ "SE": 244,
313
+ "AND": 245,
314
+ "AS</w>": 246,
315
+ "LĠEHĠ": 247,
316
+ "AU": 248,
317
+ "IHĠNĠ": 249,
318
+ "CK": 250,
319
+ "TĠR": 251,
320
+ "PP": 252,
321
+ "ESS</w>": 253,
322
+ "LĠEYĠ": 254,
323
+ "VER": 255,
324
+ "GĠAHĠ": 256,
325
+ "TĠIY</w>": 257,
326
+ "AHĠNĠ": 258,
327
+ "FĠAHĠ": 259,
328
+ "LĠAYĠ": 260,
329
+ "LĠAAĠ": 261,
330
+ "TH</w>": 262,
331
+ "RĠIYĠ": 263,
332
+ "SU": 264,
333
+ "AN</w>": 265,
334
+ "RĠIHĠ": 266,
335
+ "SC": 267,
336
+ "RĠOWĠ": 268,
337
+ "KĠW": 269,
338
+ "BO": 270,
339
+ "FĠIHĠ": 271,
340
+ "SĠP": 272,
341
+ "PH": 273,
342
+ "SP": 274,
343
+ "AV": 275,
344
+ "FF": 276,
345
+ "BĠR": 277,
346
+ "ĠEYĠSHĠAHĠ": 278,
347
+ "OWĠLĠ": 279,
348
+ "AL</w>": 280,
349
+ "ĠAHĠNĠT</w>": 281,
350
+ "KĠR": 282,
351
+ "MĠEHĠ": 283,
352
+ "HHĠAAĠ": 284,
353
+ "DI": 285,
354
+ "OYĠ": 286,
355
+ "SH</w>": 287,
356
+ "MĠAAĠ": 288,
357
+ "UM": 289,
358
+ "RĠUWĠ": 290,
359
+ "ELL": 291,
360
+ "BER": 292,
361
+ "EYĠ": 293,
362
+ "NE": 294,
363
+ "SS": 295,
364
+ "TĠAHĠN</w>": 296,
365
+ "ENT": 297,
366
+ "TĠIHĠD</w>": 298,
367
+ "VĠERĠ": 299,
368
+ "OS</w>": 300,
369
+ "NĠAHĠS</w>": 301,
370
+ "DS</w>": 302,
371
+ "FOR": 303,
372
+ "MAR": 304,
373
+ "SĠIHĠZ</w>": 305,
374
+ "HE": 306,
375
+ "PER": 307,
376
+ "UW</w>": 308,
377
+ "KĠIHĠ": 309,
378
+ "MAN": 310,
379
+ "UT": 311,
380
+ "CH</w>": 312,
381
+ "IS</w>": 313,
382
+ "VĠAHĠ": 314,
383
+ "LĠD</w>": 315,
384
+ "MĠZ</w>": 316,
385
+ "HHĠEHĠ": 317,
386
+ "KĠAHĠNĠ": 318,
387
+ "TER": 319,
388
+ "CAR": 320,
389
+ "NĠIY</w>": 321,
390
+ "PĠIHĠ": 322,
391
+ "RAN": 323,
392
+ "HĠAEĠ": 324,
393
+ "OT": 325,
394
+ "TĠEHĠ": 326,
395
+ "ZĠAHĠ": 327,
396
+ "ĠAOĠR": 328,
397
+ "WĠAAĠ": 329,
398
+ "HHĠAEĠ": 330,
399
+ "BĠAAĠ": 331,
400
+ "DĠEHĠ": 332,
401
+ "MĠAHĠN</w>": 333,
402
+ "WH": 334,
403
+ "OĠRĠ": 335,
404
+ "INGS</w>": 336,
405
+ "PĠAEĠ": 337,
406
+ "BĠEHĠ": 338,
407
+ "SĠIYĠ": 339,
408
+ "LĠUWĠ": 340,
409
+ "JHĠAHĠ": 341,
410
+ "NĠAAĠ": 342,
411
+ "PĠEHĠ": 343,
412
+ "ĠAY": 344,
413
+ "BĠAEĠ": 345,
414
+ "IES</w>": 346,
415
+ "PĠERĠ": 347,
416
+ "AP": 348,
417
+ "EX": 349,
418
+ "TĠAEĠ": 350,
419
+ "ZĠIHĠ": 351,
420
+ "EST</w>": 352,
421
+ "ĠEHĠRĠAHĠ": 353,
422
+ "ĠIHĠNGĠZ</w>": 354,
423
+ "DĠAAĠ": 355,
424
+ "IA</w>": 356,
425
+ "WA": 357,
426
+ "JHĠIHĠ": 358,
427
+ "FĠR": 359,
428
+ "IZ": 360,
429
+ "ĠIYĠAH</w>": 361,
430
+ "ILL": 362,
431
+ "IV": 363,
432
+ "NĠAH</w>": 364,
433
+ "OD": 365,
434
+ "SK": 366,
435
+ "TĠERĠ": 367,
436
+ "ANT": 368,
437
+ "DĠR": 369,
438
+ "EST": 370,
439
+ "OG": 371,
440
+ "UW": 372,
441
+ "ĠEY</w>": 373,
442
+ "ANS</w>": 374,
443
+ "ENT</w>": 375,
444
+ "MĠPĠ": 376,
445
+ "AAĠRĠ": 377,
446
+ "EC": 378,
447
+ "MĠEYĠ": 379,
448
+ "ETT": 380,
449
+ "SHĠIHĠ": 381,
450
+ "GH": 382,
451
+ "PĠAAĠ": 383,
452
+ "TON</w>": 384,
453
+ "SĠTĠ": 385,
454
+ "DIS": 386,
455
+ "MP": 387,
456
+ "SĠAYĠ": 388,
457
+ "HĠAAĠ": 389,
458
+ "BE": 390,
459
+ "GU": 391,
460
+ "PAR": 392,
461
+ "RĠEHĠ": 393,
462
+ "SĠTĠR": 394,
463
+ "CHĠIHĠ": 395,
464
+ "BL": 396,
465
+ "HAR": 397,
466
+ "NĠEHĠ": 398,
467
+ "PRO": 399,
468
+ "FĠEHĠ": 400,
469
+ "LĠAHĠNĠ": 401,
470
+ "MĠAAĠRĠ": 402,
471
+ "RĠAAĠ": 403,
472
+ "TĠEYĠ": 404,
473
+ "BUR": 405,
474
+ "DĠAEĠ": 406,
475
+ "JH</w>": 407,
476
+ "INS</w>": 408,
477
+ "ATION</w>": 409,
478
+ "AK": 410,
479
+ "DĠERĠ": 411,
480
+ "MON": 412,
481
+ "PO": 413,
482
+ "PRE": 414,
483
+ "RĠAEĠ": 415,
484
+ "KĠOWĠ": 416,
485
+ "JHĠEHĠ": 417,
486
+ "OR</w>": 418,
487
+ "SI": 419,
488
+ "TĠAAĠ": 420,
489
+ "WĠERĠ": 421,
490
+ "FĠERĠ": 422,
491
+ "NI": 423,
492
+ "WĠEHĠ": 424,
493
+ "END": 425,
494
+ "CHĠAHĠ": 426,
495
+ "FĠAOĠRĠ": 427,
496
+ "MĠIYĠ": 428,
497
+ "SĠAAĠ": 429,
498
+ "TĠER</w>": 430,
499
+ "BAR": 431,
500
+ "EG": 432,
501
+ "EV": 433,
502
+ "HHĠAHĠ": 434,
503
+ "LAN": 435,
504
+ "TĠIHĠNG</w>": 436,
505
+ "LĠOWĠ": 437,
506
+ "SĠTĠAHĠ": 438,
507
+ "CI": 439,
508
+ "COR": 440,
509
+ "DĠER</w>": 441,
510
+ "GĠAAĠ": 442,
511
+ "LĠAOĠ": 443,
512
+ "MO": 444,
513
+ "TĠERĠZ</w>": 445,
514
+ "UD": 446,
515
+ "SĠTĠIHĠ": 447,
516
+ "OUR": 448,
517
+ "BĠAHĠL</w>": 449,
518
+ "DHĠ": 450,
519
+ "HA": 451,
520
+ "MĠAHĠNĠ": 452,
521
+ "ĠEYĠSHĠAHĠN</w>": 453,
522
+ "DĠEYĠ": 454,
523
+ "FI": 455,
524
+ "KĠAAĠRĠ": 456,
525
+ "LĠER</w>": 457,
526
+ "SĠIY</w>": 458,
527
+ "TĠIYĠ": 459,
528
+ "OUS": 460,
529
+ "ESS": 461,
530
+ "AST": 462,
531
+ "BR": 463,
532
+ "DER": 464,
533
+ "EL</w>": 465,
534
+ "KĠIY</w>": 466,
535
+ "KĠAAĠNĠ": 467,
536
+ "TĠAYĠ": 468,
537
+ "WĠEYĠ": 469,
538
+ "ENS</w>": 470,
539
+ "ATH": 471,
540
+ "ITY</w>": 472,
541
+ "EP": 473,
542
+ "IST": 474,
543
+ "KS</w>": 475,
544
+ "LĠIHĠNG</w>": 476,
545
+ "NĠAYĠ": 477,
546
+ "WĠUHĠ": 478,
547
+ "KĠAHĠL</w>": 479,
548
+ "HHĠAAĠRĠ": 480,
549
+ "IY</w>": 481,
550
+ "KĠAH</w>": 482,
551
+ "LĠAH</w>": 483,
552
+ "LĠIYĠZ</w>": 484,
553
+ "NG</w>": 485,
554
+ "NĠAEĠ": 486,
555
+ "SĠAHĠN</w>": 487,
556
+ "INE</w>": 488,
557
+ "ĠERĠAHĠ": 489,
558
+ "GĠIHĠ": 490,
559
+ "KĠAOĠRĠ": 491,
560
+ "SĠAEĠ": 492,
561
+ "ĠIYĠAHĠN</w>": 493,
562
+ "ISH": 494,
563
+ "GĠEYĠ": 495,
564
+ "KĠERĠ": 496,
565
+ "MĠAAĠNĠ": 497,
566
+ "TA": 498,
567
+ "WĠAOĠ": 499,
568
+ "ĠAYĠAHĠ": 500,
569
+ "MĠP": 501,
570
+ "PĠS</w>": 502,
571
+ "MAN</w>": 503,
572
+ "BU": 504,
573
+ "EY</w>": 505,
574
+ "KĠEYĠ": 506,
575
+ "LĠAEĠNĠ": 507,
576
+ "MOR": 508,
577
+ "TĠAH</w>": 509,
578
+ "TĠRĠAHĠ": 510,
579
+ "RĠZ</w>": 511
580
+ },
581
+ "merges": [
582
+ "H Ġ",
583
+ "Ġ A",
584
+ "Ġ I",
585
+ "ĠA HĠ",
586
+ "Ġ E",
587
+ "ĠI HĠ",
588
+ "Y Ġ",
589
+ "R Ġ",
590
+ "N Ġ",
591
+ "A Ġ",
592
+ "W Ġ",
593
+ "E Ġ",
594
+ "ĠA AĠ",
595
+ "ĠE HĠ",
596
+ "S Ġ",
597
+ "ĠA EĠ",
598
+ "I N",
599
+ "Ġ R",
600
+ "E R",
601
+ "ĠI YĠ",
602
+ "L Ġ",
603
+ "K Ġ",
604
+ "ĠE RĠ",
605
+ "HĠ A",
606
+ "ĠI Y</w>",
607
+ "O WĠ",
608
+ "A N",
609
+ "A R",
610
+ "ĠE YĠ",
611
+ "O Ġ",
612
+ "T Ġ",
613
+ "E N",
614
+ "O N",
615
+ "G Ġ",
616
+ "ĠAHĠ NĠ",
617
+ "E S</w>",
618
+ "ĠA OĠ",
619
+ "A T",
620
+ "U WĠ",
621
+ "ĠIHĠ N",
622
+ "O R",
623
+ "M Ġ",
624
+ "I HĠ",
625
+ "A L",
626
+ "ĠA YĠ",
627
+ "E L",
628
+ "ĠA H</w>",
629
+ "S T",
630
+ "D Ġ",
631
+ "E D</w>",
632
+ "C H",
633
+ "ĠE R</w>",
634
+ "HĠA HĠ",
635
+ "ĠAHĠ N</w>",
636
+ "R E",
637
+ "SĠ T",
638
+ "ĠI Y",
639
+ "E R</w>",
640
+ "I L",
641
+ "E HĠ",
642
+ "I S",
643
+ "P Ġ",
644
+ "I T",
645
+ "B Ġ",
646
+ "O U",
647
+ "E S",
648
+ "IN G</w>",
649
+ "ĠIHĠN G</w>",
650
+ "L ĠAHĠ",
651
+ "H HĠ",
652
+ "A HĠ",
653
+ "E T",
654
+ "O W</w>",
655
+ "O L",
656
+ "N ĠAHĠ",
657
+ "R A",
658
+ "I C",
659
+ "J HĠ",
660
+ "T ĠIHĠ",
661
+ "ĠE R",
662
+ "L ĠIY</w>",
663
+ "L ĠIHĠ",
664
+ "U N",
665
+ "ĠAAĠ RĠ",
666
+ "R O",
667
+ "K ĠAHĠ",
668
+ "S ĠIHĠ",
669
+ "TĠ S</w>",
670
+ "ĠERĠ Z</w>",
671
+ "O N</w>",
672
+ "H HĠA",
673
+ "L A",
674
+ "ĠAOĠ RĠ",
675
+ "D ĠIHĠ",
676
+ "S HĠ",
677
+ "ĠIYĠ Z</w>",
678
+ "S ĠAHĠ",
679
+ "M A",
680
+ "R I",
681
+ "C HĠ",
682
+ "ER S</w>",
683
+ "L E",
684
+ "NĠ Z</w>",
685
+ "T ĠAHĠ",
686
+ "S HĠAHĠ",
687
+ "U R",
688
+ "F Ġ",
689
+ "U HĠ",
690
+ "A M",
691
+ "ĠAEĠ NĠ",
692
+ "S H",
693
+ "A EĠ",
694
+ "ĠEHĠ R",
695
+ "IN G",
696
+ "ĠAAĠ NĠ",
697
+ "L I",
698
+ "ĠAHĠNĠ Z</w>",
699
+ "ĠR ĠAHĠ",
700
+ "LĠ Z</w>",
701
+ "T H",
702
+ "T I",
703
+ "D ĠAHĠ",
704
+ "ĠEHĠ NĠ",
705
+ "B ĠAHĠ",
706
+ "C O",
707
+ "M ĠAHĠ",
708
+ "L Y</w>",
709
+ "ĠIHĠN GĠ",
710
+ "D E",
711
+ "ĠR ĠIHĠ",
712
+ "L O",
713
+ "M ĠIHĠ",
714
+ "N GĠ",
715
+ "U L",
716
+ "N ĠIHĠ",
717
+ "ON S</w>",
718
+ "A B",
719
+ "T HĠ",
720
+ "W ĠIHĠ",
721
+ "A D",
722
+ "I N</w>",
723
+ "Q U",
724
+ "A AĠ",
725
+ "A S",
726
+ "K ĠAAĠ",
727
+ "Z Ġ",
728
+ "DĠ Z</w>",
729
+ "I R",
730
+ "P ĠAHĠ",
731
+ "SĠ T</w>",
732
+ "A C",
733
+ "R U",
734
+ "AT I",
735
+ "E M",
736
+ "SĠ K",
737
+ "ĠA WĠ",
738
+ "KĠ S</w>",
739
+ "E D",
740
+ "L E</w>",
741
+ "L ĠAEĠ",
742
+ "T S</w>",
743
+ "Y S</w>",
744
+ "I YĠ",
745
+ "V ĠIHĠ",
746
+ "E N</w>",
747
+ "O W",
748
+ "P ĠR",
749
+ "V Ġ",
750
+ "YĠ UWĠ",
751
+ "E RĠ",
752
+ "O O",
753
+ "A G",
754
+ "U S</w>",
755
+ "K ĠAEĠ",
756
+ "R ĠAHĠ",
757
+ "C ON",
758
+ "I D",
759
+ "B ĠERĠ",
760
+ "M ĠAEĠ",
761
+ "ĠA O",
762
+ "I G",
763
+ "L ĠIYĠ",
764
+ "Y ĠAHĠ",
765
+ "S ĠEHĠ",
766
+ "OWĠ Z</w>",
767
+ "B ĠIHĠ",
768
+ "ĠIY ĠAHĠ",
769
+ "I M",
770
+ "G ĠR",
771
+ "O M",
772
+ "S E",
773
+ "AN D",
774
+ "A S</w>",
775
+ "L ĠEHĠ",
776
+ "A U",
777
+ "IHĠ NĠ",
778
+ "C K",
779
+ "T ĠR",
780
+ "P P",
781
+ "ES S</w>",
782
+ "L ĠEYĠ",
783
+ "V ER",
784
+ "G ĠAHĠ",
785
+ "T ĠIY</w>",
786
+ "AHĠ NĠ",
787
+ "F ĠAHĠ",
788
+ "L ĠAYĠ",
789
+ "L ĠAAĠ",
790
+ "T H</w>",
791
+ "R ĠIYĠ",
792
+ "S U",
793
+ "A N</w>",
794
+ "R ĠIHĠ",
795
+ "S C",
796
+ "RĠ OWĠ",
797
+ "KĠ W",
798
+ "B O",
799
+ "F ĠIHĠ",
800
+ "SĠ P",
801
+ "P H",
802
+ "S P",
803
+ "A V",
804
+ "F F",
805
+ "B ĠR",
806
+ "ĠEYĠ SHĠAHĠ",
807
+ "OWĠ LĠ",
808
+ "A L</w>",
809
+ "ĠAHĠNĠ T</w>",
810
+ "K ĠR",
811
+ "M ĠEHĠ",
812
+ "HHĠA AĠ",
813
+ "D I",
814
+ "O YĠ",
815
+ "S H</w>",
816
+ "M ĠAAĠ",
817
+ "U M",
818
+ "RĠ UWĠ",
819
+ "EL L",
820
+ "B ER",
821
+ "E YĠ",
822
+ "N E",
823
+ "S S",
824
+ "T ĠAHĠN</w>",
825
+ "EN T",
826
+ "TĠIHĠ D</w>",
827
+ "V ĠERĠ",
828
+ "O S</w>",
829
+ "NĠAHĠ S</w>",
830
+ "D S</w>",
831
+ "F OR",
832
+ "M AR",
833
+ "SĠIHĠ Z</w>",
834
+ "H E",
835
+ "P ER",
836
+ "U W</w>",
837
+ "K ĠIHĠ",
838
+ "M AN",
839
+ "U T",
840
+ "C H</w>",
841
+ "I S</w>",
842
+ "V ĠAHĠ",
843
+ "LĠ D</w>",
844
+ "MĠ Z</w>",
845
+ "HHĠ EHĠ",
846
+ "K ĠAHĠNĠ",
847
+ "T ER",
848
+ "C AR",
849
+ "N ĠIY</w>",
850
+ "P ĠIHĠ",
851
+ "R AN",
852
+ "HĠA EĠ",
853
+ "O T",
854
+ "T ĠEHĠ",
855
+ "Z ĠAHĠ",
856
+ "ĠAO ĠR",
857
+ "W ĠAAĠ",
858
+ "HHĠA EĠ",
859
+ "B ĠAAĠ",
860
+ "D ĠEHĠ",
861
+ "M ĠAHĠN</w>",
862
+ "W H",
863
+ "OĠ RĠ",
864
+ "ING S</w>",
865
+ "P ĠAEĠ",
866
+ "B ĠEHĠ",
867
+ "S ĠIYĠ",
868
+ "LĠ UWĠ",
869
+ "J HĠAHĠ",
870
+ "N ĠAAĠ",
871
+ "P ĠEHĠ",
872
+ "ĠA Y",
873
+ "B ĠAEĠ",
874
+ "I ES</w>",
875
+ "P ĠERĠ",
876
+ "A P",
877
+ "E X",
878
+ "T ĠAEĠ",
879
+ "Z ĠIHĠ",
880
+ "ES T</w>",
881
+ "ĠEHĠR ĠAHĠ",
882
+ "ĠIHĠNGĠ Z</w>",
883
+ "D ĠAAĠ",
884
+ "I A</w>",
885
+ "W A",
886
+ "JHĠ IHĠ",
887
+ "F ĠR",
888
+ "I Z",
889
+ "ĠIY ĠAH</w>",
890
+ "IL L",
891
+ "I V",
892
+ "N ĠAH</w>",
893
+ "O D",
894
+ "S K",
895
+ "T ĠERĠ",
896
+ "AN T",
897
+ "D ĠR",
898
+ "E ST",
899
+ "O G",
900
+ "U W",
901
+ "ĠE Y</w>",
902
+ "AN S</w>",
903
+ "EN T</w>",
904
+ "MĠ PĠ",
905
+ "AAĠ RĠ",
906
+ "E C",
907
+ "M ĠEYĠ",
908
+ "ET T",
909
+ "SHĠ IHĠ",
910
+ "G H",
911
+ "P ĠAAĠ",
912
+ "T ON</w>",
913
+ "SĠ TĠ",
914
+ "D IS",
915
+ "M P",
916
+ "S ĠAYĠ",
917
+ "HĠA AĠ",
918
+ "B E",
919
+ "G U",
920
+ "P AR",
921
+ "R ĠEHĠ",
922
+ "SĠT ĠR",
923
+ "CHĠ IHĠ",
924
+ "B L",
925
+ "H AR",
926
+ "N ĠEHĠ",
927
+ "P RO",
928
+ "F ĠEHĠ",
929
+ "L ĠAHĠNĠ",
930
+ "M ĠAAĠRĠ",
931
+ "R ĠAAĠ",
932
+ "T ĠEYĠ",
933
+ "B UR",
934
+ "D ĠAEĠ",
935
+ "J H</w>",
936
+ "IN S</w>",
937
+ "ATI ON</w>",
938
+ "A K",
939
+ "D ĠERĠ",
940
+ "M ON",
941
+ "P O",
942
+ "P RE",
943
+ "R ĠAEĠ",
944
+ "KĠ OWĠ",
945
+ "JHĠ EHĠ",
946
+ "O R</w>",
947
+ "S I",
948
+ "T ĠAAĠ",
949
+ "W ĠERĠ",
950
+ "F ĠERĠ",
951
+ "N I",
952
+ "W ĠEHĠ",
953
+ "EN D",
954
+ "C HĠAHĠ",
955
+ "F ĠAOĠRĠ",
956
+ "M ĠIYĠ",
957
+ "S ĠAAĠ",
958
+ "T ĠER</w>",
959
+ "B AR",
960
+ "E G",
961
+ "E V",
962
+ "H HĠAHĠ",
963
+ "L AN",
964
+ "T ĠIHĠNG</w>",
965
+ "LĠ OWĠ",
966
+ "SĠT ĠAHĠ",
967
+ "C I",
968
+ "C OR",
969
+ "D ĠER</w>",
970
+ "G ĠAAĠ",
971
+ "L ĠAOĠ",
972
+ "M O",
973
+ "T ĠERĠZ</w>",
974
+ "U D",
975
+ "SĠT ĠIHĠ",
976
+ "OU R",
977
+ "BĠAHĠ L</w>",
978
+ "D HĠ",
979
+ "H A",
980
+ "M ĠAHĠNĠ",
981
+ "ĠEYĠSHĠAHĠ N</w>",
982
+ "D ĠEYĠ",
983
+ "F I",
984
+ "K ĠAAĠRĠ",
985
+ "L ĠER</w>",
986
+ "S ĠIY</w>",
987
+ "T ĠIYĠ",
988
+ "OU S",
989
+ "ES S",
990
+ "A ST",
991
+ "B R",
992
+ "D ER",
993
+ "E L</w>",
994
+ "K ĠIY</w>",
995
+ "K ĠAAĠNĠ",
996
+ "T ĠAYĠ",
997
+ "W ĠEYĠ",
998
+ "EN S</w>",
999
+ "AT H",
1000
+ "IT Y</w>",
1001
+ "E P",
1002
+ "I ST",
1003
+ "K S</w>",
1004
+ "L ĠIHĠNG</w>",
1005
+ "N ĠAYĠ",
1006
+ "WĠ UHĠ",
1007
+ "KĠAHĠ L</w>",
1008
+ "HHĠAAĠ RĠ",
1009
+ "I Y</w>",
1010
+ "K ĠAH</w>",
1011
+ "L ĠAH</w>",
1012
+ "L ĠIYĠZ</w>",
1013
+ "N G</w>",
1014
+ "N ĠAEĠ",
1015
+ "S ĠAHĠN</w>",
1016
+ "IN E</w>",
1017
+ "ĠER ĠAHĠ",
1018
+ "G ĠIHĠ",
1019
+ "K ĠAOĠRĠ",
1020
+ "S ĠAEĠ",
1021
+ "ĠIY ĠAHĠN</w>",
1022
+ "IS H",
1023
+ "G ĠEYĠ",
1024
+ "K ĠERĠ",
1025
+ "M ĠAAĠNĠ",
1026
+ "T A",
1027
+ "W ĠAOĠ",
1028
+ "ĠA YĠAHĠ",
1029
+ "MĠ P",
1030
+ "PĠ S</w>",
1031
+ "MA N</w>",
1032
+ "B U",
1033
+ "E Y</w>",
1034
+ "K ĠEYĠ",
1035
+ "L ĠAEĠNĠ",
1036
+ "M OR",
1037
+ "T ĠAH</w>",
1038
+ "T ĠRĠAHĠ",
1039
+ "RĠ Z</w>"
1040
+ ]
1041
+ }
1042
+ }
G2P_lexicon/my_tokenizer/my_dict_256.json ADDED
@@ -0,0 +1,90 @@
1
+ {
2
+ "0": "<sos>",
3
+ "1": "<eos>",
4
+ "2": "<unk>",
5
+ "3": "<pad>",
6
+ "4": "AA1",
7
+ "5": "UW",
8
+ "6": "ER0",
9
+ "7": "F",
10
+ "8": "CH",
11
+ "9": "S",
12
+ "10": "AO1",
13
+ "11": "DH",
14
+ "12": "TH",
15
+ "13": "IY",
16
+ "14": "OW",
17
+ "15": "AH2",
18
+ "16": "W",
19
+ "17": "AH1",
20
+ "18": "AO",
21
+ "19": "D",
22
+ "20": "AW1",
23
+ "21": "OY2",
24
+ "22": "AO0",
25
+ "23": "EY0",
26
+ "24": "AH",
27
+ "25": "AE",
28
+ "26": "UH2",
29
+ "27": "OW2",
30
+ "28": "UW0",
31
+ "29": "UW1",
32
+ "30": "UH1",
33
+ "31": "ER",
34
+ "32": "EH2",
35
+ "33": "UW2",
36
+ "34": "ER2",
37
+ "35": "OY",
38
+ "36": "AE0",
39
+ "37": "AY",
40
+ "38": "K",
41
+ "39": "AA0",
42
+ "40": "T",
43
+ "41": "EH0",
44
+ "42": "SH",
45
+ "43": "ER1",
46
+ "44": "G",
47
+ "45": "EY",
48
+ "46": "AH0",
49
+ "47": "IH0",
50
+ "48": "L",
51
+ "49": "AE2",
52
+ "50": "B",
53
+ "51": "OY0",
54
+ "52": "EH",
55
+ "53": "AA2",
56
+ "54": "IH",
57
+ "55": "M",
58
+ "56": "AY0",
59
+ "57": "UH",
60
+ "58": "EY2",
61
+ "59": "IY2",
62
+ "60": "EY1",
63
+ "61": "HH",
64
+ "62": "P",
65
+ "63": "AE1",
66
+ "64": "OW1",
67
+ "65": "R",
68
+ "66": "IH1",
69
+ "67": "Z",
70
+ "68": "IH2",
71
+ "69": "IY0",
72
+ "70": "V",
73
+ "71": "JH",
74
+ "72": "OY1",
75
+ "73": "Y",
76
+ "74": "N",
77
+ "75": "AO2",
78
+ "76": "AW",
79
+ "77": "UH0",
80
+ "78": "IY1",
81
+ "79": "AW0",
82
+ "80": "AA",
83
+ "81": "NG",
84
+ "82": "AY1",
85
+ "83": "EH1",
86
+ "84": "AY2",
87
+ "85": "OW0",
88
+ "86": "AW2",
89
+ "87": "ZH"
90
+ }
G2P_lexicon/sp_tokenizer.py ADDED
@@ -0,0 +1,87 @@
+ import json
+
+
+ class Tokenizer_sp:
+     def __init__(self, config: dict = None, srs: bool = True, dict_path=None, text=None):
+         if config is None:
+             config = {}
+
+         self.sos = config.get('BOS_TOKEN', '<sos>')
+         self.eos = config.get('EOS_TOKEN', '<eos>')
+         self.unk = config.get('UNK_TOKEN', '<unk>')
+         self.pad = config.get('PAD_TOKEN', '<pad>')
+         self.tokens = []
+         self.srs = srs
+
+         if dict_path:
+             self.load_dict_from_file(dict_path)
+         elif text:
+             self.create_tokenizer(text)
+         else:
+             raise ValueError("No texts or dictionary path provided")
+
+     def create_tokenizer(self, texts):
+         tokens = []
+
+         for phonemes_list in texts:
+             for phoneme in phonemes_list:
+                 tokens.append(phoneme)
+
+         self.tokens = [self.sos, self.eos, self.unk, self.pad] + list(set(tokens))
+
+         self.token2idx = {token: int(i) for i, token in enumerate(self.tokens)}
+         self.idx2token = {int(i): token for i, token in enumerate(self.tokens)}
+
+         self.unk_idx = self.token2idx[self.unk]
+         self.sos_idx = self.token2idx[self.sos]
+         self.eos_idx = self.token2idx[self.eos]
+         self.pad_idx = self.token2idx[self.pad]
+
+     def load_dict_from_file(self, file_path):
+         with open(file_path, 'r') as file:
+             data = json.load(file)
+
+         # The JSON maps index strings to phoneme tokens, e.g. "4": "AA1".
+         self.idx2token = {int(idx): token for idx, token in data.items()}
+         self.token2idx = {token: idx for idx, token in self.idx2token.items()}
+
+         self.unk_idx = self.token2idx.get(self.unk)
+         self.sos_idx = self.token2idx.get(self.sos)
+         self.eos_idx = self.token2idx.get(self.eos)
+         self.pad_idx = self.token2idx.get(self.pad)
+
+     def tokenize(self, text):
+         if not self.srs:
+             tokens = []
+             for tok in text:
+                 if tok in self.token2idx:
+                     tokens.append(tok)
+                 else:
+                     tokens.append(self.unk)
+             return [self.sos] + tokens + [self.eos]
+         else:
+             return [self.sos] + list(text) + [self.eos]
+
+     def convert_tokens_to_idx(self, tokens):
+         idx_list = [self.token2idx.get(tok, self.unk_idx) for tok in tokens]
+         return idx_list
+
+     def encode(self, text, seq_len=None):
+         tokens = self.tokenize(text)[:seq_len]
+         return self.convert_tokens_to_idx(tokens)
+
+     def decode(self, idx_list):
+         ans = []
+         for idx in idx_list:
+             try:
+                 ans.append(self.idx2token[int(idx)])
+             except KeyError:
+                 ans.append(self.idx2token[self.unk_idx])
+         return ans
+
+     def get_vocab_size(self):
+         return len(self.token2idx)
+
+
+ if __name__ == "__main__":
+     tokenizer_sp = Tokenizer_sp(dict_path='./my_tokenizer/my_dict_256.json')
+     print(tokenizer_sp.idx2token)
G2P_lexicon/transformer.py ADDED
@@ -0,0 +1,167 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import math
+
+
+ class PositionalEncoding(nn.Module):
+     def __init__(self, d_model, max_seq_length):
+         super(PositionalEncoding, self).__init__()
+
+         pe = torch.zeros(max_seq_length, d_model)
+         position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
+         div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
+
+         pe[:, 0::2] = torch.sin(position * div_term)
+         pe[:, 1::2] = torch.cos(position * div_term)
+
+         self.register_buffer('pe', pe.unsqueeze(0))
+
+     def forward(self, x):
+         return x + self.pe[:, :x.size(1)]
+
+
+ class MultiHeadSelfAttention(nn.Module):
+     def __init__(self, d_model, num_heads):
+         super(MultiHeadSelfAttention, self).__init__()
+         assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
+
+         self.d_model = d_model
+         self.num_heads = num_heads
+         self.depth = d_model // num_heads
+
+         self.wq = nn.Linear(d_model, d_model)
+         self.wk = nn.Linear(d_model, d_model)
+         self.wv = nn.Linear(d_model, d_model)
+
+         self.fc = nn.Linear(d_model, d_model)
+
+     def split_heads(self, x, batch_size):
+         x = x.view(batch_size, -1, self.num_heads, self.depth)
+         return x.permute(0, 2, 1, 3)
+
+     def forward(self, q, k, v, mask=None):
+         batch_size = q.size(0)
+
+         q = self.split_heads(self.wq(q), batch_size)
+         k = self.split_heads(self.wk(k), batch_size)
+         v = self.split_heads(self.wv(v), batch_size)
+
+         scores = torch.matmul(q, k.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.depth, dtype=torch.float32))
+         if mask is not None:
+             scores = scores.masked_fill(mask == 0, -1e9)
+         attn = F.softmax(scores, dim=-1)
+
+         out = torch.matmul(attn, v)
+         out = out.permute(0, 2, 1, 3).contiguous()
+         out = out.view(batch_size, -1, self.d_model)
+
+         out = self.fc(out)
+         return out
+
+
+ class FeedForwardNetwork(nn.Module):
+     def __init__(self, d_model, d_ff, dropout=0.1):
+         super(FeedForwardNetwork, self).__init__()
+         self.fc1 = nn.Linear(d_model, d_ff)
+         self.fc2 = nn.Linear(d_ff, d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x):
+         x = self.fc1(x)
+         x = F.relu(x)
+         x = self.dropout(x)
+         x = self.fc2(x)
+         return x
+
+
+ class EncoderLayer(nn.Module):
+     def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+         super(EncoderLayer, self).__init__()
+         self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
+         self.ffn = FeedForwardNetwork(d_model, d_ff, dropout)
+
+         self.layernorm1 = nn.LayerNorm(d_model)
+         self.layernorm2 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, mask=None):
+         attn_output = self.self_attn(x, x, x, mask)
+         x = self.layernorm1(x + self.dropout(attn_output))
+
+         ffn_output = self.ffn(x)
+         x = self.layernorm2(x + self.dropout(ffn_output))
+         return x
+
+
+ class DecoderLayer(nn.Module):
+     def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
+         super(DecoderLayer, self).__init__()
+         self.self_attn = MultiHeadSelfAttention(d_model, num_heads)
+         self.cross_attn = MultiHeadSelfAttention(d_model, num_heads)
+         self.ffn = FeedForwardNetwork(d_model, d_ff, dropout)
+
+         self.layernorm1 = nn.LayerNorm(d_model)
+         self.layernorm2 = nn.LayerNorm(d_model)
+         self.layernorm3 = nn.LayerNorm(d_model)
+         self.dropout = nn.Dropout(dropout)
+
+     def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
+         self_attn_output = self.self_attn(q=x, k=x, v=x, mask=tgt_mask)
+         x = self.layernorm1(x + self.dropout(self_attn_output))
+
+         cross_attn_output = self.cross_attn(q=x, k=enc_output, v=enc_output, mask=src_mask)
+         x = self.layernorm2(x + self.dropout(cross_attn_output))
+
+         ffn_output = self.ffn(x)
+         x = self.layernorm3(x + self.dropout(ffn_output))
+         return x
+
+
+ class TransformerBlock(nn.Module):
+     def __init__(self, tokenizer=None, config=None, stress=False):
+         super(TransformerBlock, self).__init__()
+
+         self.config = config
+         self.tokenizer = tokenizer
+         self.input_vocab_size = tokenizer.get_vocab_size()
+         self.target_vocab_size = tokenizer.get_vocab_size()
+         self.d_model = config.get('D_MODEL', 512)
+         self.num_heads = config.get('NUM_HEADS', 8)
+         self.num_encoder_layers = config.get('NUM', 6)
+         self.num_decoder_layers = config.get('NUM', 6)
+         self.d_ff = config.get('D_FF', 2048)
+         self.dropout = config.get('DROPOUT', 0.1)
+         self.stress = stress
+
+         self.encoder_embedding = nn.Embedding(self.input_vocab_size, self.d_model)
+         self.decoder_embedding = nn.Embedding(self.target_vocab_size, self.d_model)
+
+         self.pos_embedding = PositionalEncoding(self.d_model, config.get('MAX_LEN', 32))
+
+         self.encoder_layers = nn.ModuleList(
+             [EncoderLayer(self.d_model, self.num_heads, self.d_ff, self.dropout) for _ in
+              range(self.num_encoder_layers)])
+         self.decoder_layers = nn.ModuleList(
+             [DecoderLayer(self.d_model, self.num_heads, self.d_ff, self.dropout) for _ in
+              range(self.num_decoder_layers)])
+
+         self.fc_out = nn.Linear(self.d_model, self.target_vocab_size)
+
+     def encode(self, src, src_mask):
+         src = self.pos_embedding(self.encoder_embedding(src))
+         for layer in self.encoder_layers:
+             src = layer(src, src_mask)
+         return src
+
+     def decode(self, memory, src_mask, tgt, tgt_mask):
+         tgt = self.pos_embedding(self.decoder_embedding(tgt))
+         for layer in self.decoder_layers:
+             tgt = layer(tgt, memory, src_mask, tgt_mask)
+         return tgt
+
+     def forward(self, src, tgt, src_mask, tgt_mask):
+         memory = self.encode(src, src_mask)
+         output = self.decode(memory, src_mask, tgt, tgt_mask)
+         output = self.fc_out(output)
+         return output
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 NikiPshg
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,45 @@
- ---
- license: mit
- ---
+ # Grapheme to Phoneme (G2P) with Stress
+
+ This project provides a Grapheme to Phoneme (G2P) conversion tool that first checks the CMU Pronouncing Dictionary for phoneme translations. If a word is not found in the dictionary, it uses two Transformer-based models to generate the phonemes and add stress markers. The output is in ARPAbet format, and the tool can also convert graphemes into phoneme integer indices.
+
+ ## Features
+
+ 1. **CMU Pronouncing Dictionary Integration**: First checks the CMU dictionary for phoneme translations.
+ 2. **Transformer-Based Conversion**:
+    - **Phoneme Generation**: The first Transformer model converts graphemes into phonemes.
+    - **Stress Addition**: The second Transformer model adds stress markers to the phonemes.
+ 3. **ARPAbet Output**: Outputs phonemes in ARPAbet format.
+ 4. **Phoneme Integer Indices**: Converts graphemes to phoneme integer indices.
+
+ ## Installation
+
+ 1. Clone the repository:
+    ```sh
+    git clone https://github.com/NikiPshg/G2P_en_lex.git
+    cd G2P_en_lex
+    ```
+
+ 2. Install the required dependencies:
+    ```sh
+    pip install -r requiremenst.txt
+    ```
+
+ ### Example
+
+ ```python
+ from G2P_lexicon import g2p_en_lexicon
+
+ # Initialize the G2P converter
+ g2p_converter = g2p_en_lexicon()
+
+ # Convert a text to phonemes (ARPAbet, without stress markers)
+ text = "text, numbers, and some strange symbols !№;% 21"
+ phonemes = g2p_converter(text, with_stress=False)
+ # ['T', 'EH', 'K', 'S', 'T', ' ', ',', ' ', 'N', 'AH', 'M', 'B', 'ER', 'Z', ' ', ',', ' ',
+ #  'AH', 'N', 'D', ' ', 'S', 'AH', 'M', ' ', 'S', 'T', 'R', 'EY', 'N', 'JH', ' ',
+ #  'S', 'IH', 'M', 'B', 'AH', 'L', 'Z', ' ',
+ #  'T', 'W', 'EH', 'N', 'IY', 'W', 'AH', 'N']
+ ```
+
+
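The README example only shows the unstressed output. Below is a minimal sketch of the stressed output and of the integer indices mentioned in feature 4; it assumes the word is not in the bundled CMU dictionary (so it falls through to the two models) and reuses `Tokenizer_sp` with the `my_dict_256.json` file shipped under `G2P_lexicon/my_tokenizer/`, with the path given relative to the repository root.

```python
from G2P_lexicon import g2p_en_lexicon
from G2P_lexicon.sp_tokenizer import Tokenizer_sp

g2p = g2p_en_lexicon()

# Stressed ARPAbet output: 0 = unstressed, 1 = primary stress, 2 = secondary stress
stressed = g2p("NIKITA", with_stress=True)
# expected, based on the in-repo examples: ['N', 'IH2', 'K', 'IY1', 'T', 'AH0']

# Map the phonemes to the integer indices used by the stress model's tokenizer
tokenizer = Tokenizer_sp(dict_path="G2P_lexicon/my_tokenizer/my_dict_256.json")
indices = tokenizer.convert_tokens_to_idx(stressed)
print(stressed, indices)
```

The exact integers depend on `my_dict_256.json`, since that file defines the index-to-phoneme mapping.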
requiremenst.txt ADDED
Binary file (884 Bytes). View file
 
test.py ADDED
@@ -0,0 +1,5 @@
+ from G2P_lexicon import g2p_en_lexicon
+ text = "text, numbers, and some strange symbols !№;% 21"
+ g2p = g2p_en_lexicon()
+ phonemes = g2p(text, with_stress=False)
+ print(phonemes)