File size: 2,405 Bytes
9ba7d3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from G2P_lexicon.G2P import G2P
from G2P_lexicon.SP import SP
from G2P_lexicon.data_preparation import preprocess_text
import string
import json
import time
import os


# Locate the lexicon relative to this module so loading does not depend on
# the current working directory.
dirname = os.path.dirname(__file__)
json_path = os.path.join(dirname, "data/word2phoneme.json")

# Read the JSON explicitly as UTF-8 instead of the platform's locale
# encoding, so the file decodes the same way on every OS.
# NOTE: despite its name, this mapping is looked up by word in the class
# below (``phoneme2grapheme_dict.get(word)``).
with open(json_path, encoding="utf-8") as json_file:
    phoneme2grapheme_dict = json.load(json_file)


class g2p_en_lexicon:
    """Convert text to a flat phoneme list, preferring the JSON lexicon.

    Each word produced by ``preprocess_text`` is looked up in
    ``phoneme2grapheme_dict``. Words that are missing from it and contain
    punctuation are passed through unchanged; all other missing words are
    sent to the imported ``G2P`` callable (and, when stress is requested,
    its output is passed on to ``SP``).
    """

    def __init__(self):
        # Keep references to the imported callables on the instance.
        self.G2P = G2P
        self.SP = SP

    def cleaan_stress(self, seq: list):
        """Return a copy of ``seq`` with one trailing digit removed per item.

        Items that do not end in a digit (including empty strings and the
        ``' '`` word separator) are returned unchanged.
        """
        # Slicing with [-1:] yields '' for an empty item, and ''.isdigit() is
        # False, so empty strings no longer raise IndexError.
        return [
            phoneme[:-1] if phoneme[-1:].isdigit() else phoneme
            for phoneme in seq
        ]

    def pred_with_stress(self, seq):
        """Run ``G2P`` on ``seq`` and feed its output through ``SP``."""
        return self.SP(self.G2P(seq))

    def check_punctuation(self, word):
        """Return True if any character of ``word`` is in ``string.punctuation``."""
        return any(char in string.punctuation for char in word)

    def _word_to_phonemes(self, word, with_stress):
        """Return the token list for a single word (without the separator)."""
        phonemes_from_dict = phoneme2grapheme_dict.get(word)
        if phonemes_from_dict is not None:
            # Returned as-is; the caller concatenates rather than mutates, so
            # the shared lexicon entry is never modified.
            return phonemes_from_dict
        if self.check_punctuation(word):
            # Unknown tokens containing punctuation are emitted verbatim.
            return [word]
        if with_stress:
            return self.pred_with_stress(word)
        return self.G2P(word)

    def __call__(self, seq, with_stress=True):
        """Convert ``seq`` to a list of tokens with ``' '`` between words.

        Args:
            seq: Input handed to ``preprocess_text``, whose result is
                iterated word by word.
            with_stress: When False, a trailing digit is stripped from every
                item of the result via ``cleaan_stress``.

        Returns:
            A flat list of tokens; an empty list when preprocessing yields no
            words.
        """
        result = []
        for word in preprocess_text(seq):
            result.extend(self._word_to_phonemes(word, with_stress) + [' '])

        # Drop the separator that follows the last word. The emptiness check
        # avoids an IndexError when there were no words at all.
        if result and result[-1] == ' ':
            result.pop()

        if not with_stress:
            # NOTE(review): this also strips a trailing digit from
            # passed-through punctuation tokens (e.g. one ending in a
            # number) - confirm whether preprocess_text can produce those.
            return self.cleaan_stress(result)
        return result


if __name__ == '__main__':
    # Manual smoke run: convert one sample sentence and report how long the
    # conversion took, in milliseconds.
    G2P_en_lexicon = g2p_en_lexicon()
    text = """mtusi is the worst option for a programmer or a student"""
    # perf_counter() is a monotonic, high-resolution clock meant for
    # measuring short durations; time.time() can jump if the system clock
    # is adjusted mid-run.
    start_time = time.perf_counter()
    print(G2P_en_lexicon(text))
    end_time = time.perf_counter()
    print(f"{(end_time - start_time) * 1000} мc -- за это была выполнена ")