hexgrad committed on
Commit
d82d4c2
·
verified ·
1 Parent(s): be929c2

Delete katsu.py

Browse files
Files changed (1) hide show
  1. katsu.py +0 -431
katsu.py DELETED
@@ -1,431 +0,0 @@
1
- # https://github.com/polm/cutlet/blob/master/cutlet/cutlet.py
2
- from dataclasses import dataclass
3
- from fugashi import Tagger
4
- from num2kana import Convert
5
- import mojimoji
6
- import re
7
- import unicodedata
8
-
9
# Kana-to-IPA mapping table (Hepburn-inspired, using IPA symbols).
# Built in stages: katakana monographs, hiragana mirrors, the rare V-row,
# small-kana phonetic extensions, digraphs (plus hiragana mirrors), and
# finally punctuation / stray combining marks.
HEPBURN = {
    'ァ': 'a',   'ア': 'a',
    'ィ': 'i',   'イ': 'i',
    'ゥ': 'ɯ',   'ウ': 'ɯ',
    'ェ': 'e',   'エ': 'e',
    'ォ': 'o',   'オ': 'o',
    'カ': 'ka',  'ガ': 'ɡa',
    'キ': 'ki',  'ギ': 'ɡi',
    'ク': 'kɯ',  'グ': 'ɡɯ',
    'ケ': 'ke',  'ゲ': 'ɡe',
    'コ': 'ko',  'ゴ': 'ɡo',
    'サ': 'sa',  'ザ': 'za',
    'シ': 'ɕi',  'ジ': 'dʑi',
    'ス': 'sɨ',  'ズ': 'zɨ',
    'セ': 'se',  'ゼ': 'ze',
    'ソ': 'so',  'ゾ': 'zo',
    'タ': 'ta',  'ダ': 'da',
    'チ': 'tɕi', 'ヂ': 'dʑi',
    # 'ッ' (sokuon) is intentionally absent: handled contextually
    'ツ': 'tsɨ', 'ヅ': 'zɨ',
    'テ': 'te',  'デ': 'de',
    'ト': 'to',  'ド': 'do',
    'ナ': 'na',  'ニ': 'ɲi',  'ヌ': 'nɯ',  'ネ': 'ne',  'ノ': 'no',
    'ハ': 'ha',  'バ': 'ba',  'パ': 'pa',
    'ヒ': 'çi',  'ビ': 'bi',  'ピ': 'pi',
    'フ': 'ɸɯ',  'ブ': 'bɯ',  'プ': 'pɯ',
    'ヘ': 'he',  'ベ': 'be',  'ペ': 'pe',
    'ホ': 'ho',  'ボ': 'bo',  'ポ': 'po',
    'マ': 'ma',  'ミ': 'mi',  'ム': 'mɯ',  'メ': 'me',  'モ': 'mo',
    'ャ': 'ja',  'ヤ': 'ja',
    'ュ': 'jɯ',  'ユ': 'jɯ',
    'ョ': 'jo',  'ヨ': 'jo',
    'ラ': 'ra',  'リ': 'ri',  'ル': 'rɯ',  'レ': 're',  'ロ': 'ro',
    'ヮ': 'wa',  'ワ': 'wa',
    'ヰ': 'i',   'ヱ': 'e',   'ヲ': 'o',
    # 'ン' (moraic nasal) is intentionally absent: handled contextually
    'ヴ': 'vɯ',
    'ヵ': 'ka',  'ヶ': 'ke',
}
# 84 entries covering every katakana in U+30A1..U+30F6 except ッ and ン.
assert len(HEPBURN) == 84 and all(i in {12483, 12531} or chr(i) in HEPBURN for i in range(12449, 12535))

# Mirror every katakana entry onto its hiragana counterpart (codepoint - 96).
for k, v in list(HEPBURN.items()):
    HEPBURN[chr(ord(k) - 96)] = v
assert len(HEPBURN) == 84 * 2

# Rare voiced W-row katakana (no hiragana counterparts exist).
HEPBURN.update({
    'ヷ': 'va',
    'ヸ': 'vi',
    'ヹ': 've',
    'ヺ': 'vo',
})
assert len(HEPBURN) == 84 * 2 + 4 and all(chr(i) in HEPBURN for i in range(12535, 12539))

# Small-kana phonetic extensions, U+31F0..U+31FF (used mainly for Ainu).
HEPBURN.update({
    'ㇰ': 'kɯ', 'ㇱ': 'ɕi', 'ㇲ': 'sɨ', 'ㇳ': 'to',
    'ㇴ': 'nɯ', 'ㇵ': 'ha', 'ㇶ': 'çi', 'ㇷ': 'ɸɯ',
    'ㇸ': 'he', 'ㇹ': 'ho', 'ㇺ': 'mɯ', 'ㇻ': 'ra',
    'ㇼ': 'ri', 'ㇽ': 'rɯ', 'ㇾ': 're', 'ㇿ': 'ro',
})
assert len(HEPBURN) == 84 * 2 + 4 + 16 and all(chr(i) in HEPBURN for i in range(12784, 12800))

# Digraphs: a full-size kana followed by a small vowel / ya-row kana.
HEPBURN.update({
    'イェ': 'je',
    'ウィ': 'wi',  'ウェ': 'we',  'ウォ': 'wo',
    'キェ': 'kʲe', 'キャ': 'kʲa', 'キュ': 'kʲɨ', 'キョ': 'kʲo',
    'ギャ': 'ɡʲa', 'ギュ': 'ɡʲɨ', 'ギョ': 'ɡʲo',
    'クァ': 'kʷa', 'クィ': 'kʷi', 'クェ': 'kʷe', 'クォ': 'kʷo',
    'グァ': 'ɡʷa', 'グィ': 'ɡʷi', 'グェ': 'ɡʷe', 'グォ': 'ɡʷo',
    'シェ': 'ɕe',  'シャ': 'ɕa',  'シュ': 'ɕɨ',  'ショ': 'ɕo',
    'ジェ': 'dʑe', 'ジャ': 'dʑa', 'ジュ': 'dʑɨ', 'ジョ': 'dʑo',
    'チェ': 'tɕe', 'チャ': 'tɕa', 'チュ': 'tɕɨ', 'チョ': 'tɕo',
    'ヂャ': 'dʑa', 'ヂュ': 'dʑɨ', 'ヂョ': 'dʑo',
    'ツァ': 'tsa', 'ツィ': 'tsi', 'ツェ': 'tse', 'ツォ': 'tso',
    'ティ': 'ti',  'テュ': 'tʲɨ',
    'ディ': 'di',  'デュ': 'dʲɨ',
    'トゥ': 'tɯ',  'ドゥ': 'dɯ',
    'ニェ': 'ɲe',  'ニャ': 'ɲa',  'ニュ': 'ɲɨ',  'ニョ': 'ɲo',
    'ヒェ': 'çe',  'ヒャ': 'ça',  'ヒュ': 'çɨ',  'ヒョ': 'ço',
    'ビャ': 'bʲa', 'ビュ': 'bʲɨ', 'ビョ': 'bʲo',
    'ピャ': 'pʲa', 'ピュ': 'pʲɨ', 'ピョ': 'pʲo',
    'ファ': 'ɸa',  'フィ': 'ɸi',  'フェ': 'ɸe',  'フォ': 'ɸo',
    'フュ': 'ɸʲɨ', 'フョ': 'ɸʲo',
    'ミャ': 'mʲa', 'ミュ': 'mʲɨ', 'ミョ': 'mʲo',
    'リャ': 'rʲa', 'リュ': 'rʲɨ', 'リョ': 'rʲo',
    'ヴァ': 'va',  'ヴィ': 'vi',  'ヴェ': 've',  'ヴォ': 'vo',
    'ヴュ': 'vʲɨ', 'ヴョ': 'vʲo',
})
assert len(HEPBURN) == 84 * 2 + 4 + 16 + 76

# Mirror each katakana digraph onto its hiragana spelling as well.
for k, v in list(HEPBURN.items()):
    if len(k) != 2:
        continue
    a, b = k
    assert a in HEPBURN and b in HEPBURN, (a, b)
    a = chr(ord(a) - 96)
    b = chr(ord(b) - 96)
    assert a in HEPBURN and b in HEPBURN, (a, b)
    HEPBURN[a + b] = v
assert len(HEPBURN) == 84 * 2 + 4 + 16 + 76 * 2

# Japanese punctuation mapped to ASCII-ish equivalents, plus stray
# combining voicing marks which are simply discarded.
HEPBURN.update({
    # 'ー': '-', # 長音符, only used when repeated
    '。': '.',
    '、': ',',
    '?': '?',
    '!': '!',
    '「': '"',
    '」': '"',
    '『': '"',
    '』': '"',
    ':': ':',
    ';': ';',
    '(': '(',
    ')': ')',
    '《': '(',
    '》': ')',
    '【': '[',
    '】': ']',
    '・': ' ',  # '/',
    ',': ',',
    '~': '—',
    '〜': '—',
    '—': '—',
    '«': '«',
    '»': '»',

    # other
    '゚': '',  # combining handakuten by itself, just discard
    '゙': '',  # combining dakuten by itself
})
253
-
254
def add_dakuten(kk):
    """Given a kana (single-character string), return its voiced
    (dakuten) counterpart, or None if the kana cannot take a dakuten."""
    plain = 'カキクケコサシスセソタチツテトハヒフヘホ'
    voiced = 'ガギグゲゴザジズゼゾダヂヅデドバビブベボ'
    pos = plain.find(kk)
    if pos < 0:
        # not a voiceable kana; this is normal for nonsense input
        return None
    return voiced[pos]
264
-
265
# Small kana that fuse with the preceding kana to form digraphs.
SUTEGANA = 'ャュョァィゥェォ' #'ゃゅょぁぃぅぇぉ'
# ASCII punctuation recognized as punctuation output.
PUNCT = '\'".!?(),;:-'
# Iteration marks (odoriji): 々 〃 ゝ ゞ ヽ ヾ.
# BUGFIX: the last character was a duplicated hiragana ゞ instead of the
# katakana voiced iteration mark ヾ, so ヾ never entered the odoriji branch
# of _get_single_mapping (whose `kk in 'ゞヾ'` test was unreachable for it).
ODORI = '々〃ゝゞヽヾ'
268
-
269
@dataclass
class Token:
    """One romanized output chunk and its trailing-space flag."""
    surface: str
    space: bool  # if a space should follow

    def __str__(self):
        return self.surface + (" " if self.space else "")
276
-
277
class Katsu:
    def __init__(self):
        """Create a Katsu object, which holds configuration as well as
        tokenizer state.

        Typical usage:

        ```python
        katsu = Katsu()
        roma = katsu.romaji("カツカレーを食べた")
        # "Cutlet curry wo tabeta"
        ```
        """
        self.tagger = Tagger()
        # Private copy of the global table so instances can customize it.
        self.table = dict(HEPBURN)
        # surface-form overrides consulted before any kana conversion
        self.exceptions = {}

    def romaji(self, text):
        """Build a complete string from input text."""
        if not text:
            return ''
        normalized = self._normalize_text(text)
        tokens = self._romaji_tokens(self.tagger(normalized))
        joined = ''.join(str(tok) for tok in tokens)
        # collapse runs of whitespace introduced by token spacing
        return re.sub(r'\s+', ' ', joined.strip())

    def phonemize(self, texts):
        # espeak-ng API
        return [self.romaji(text) for text in texts]

    def _normalize_text(self, text):
        """Given text, normalize variations in Japanese.

        This specifically removes variations that are meaningless for romaji
        conversion using the following steps:

        - Unicode NFKC normalization
        - Full-width Latin to half-width
        - Half-width katakana to full-width
        """
        # a wave dash directly before a digit marks a range: read it から
        text = re.sub(r'[〜~](?=\d)', 'から', text)
        text = unicodedata.normalize('NFKC', text)
        # full-width alphanumerics -> half-width, since they pass through as-is
        text = mojimoji.zen_to_han(text, kana=False)
        # half-width katakana -> full-width
        text = mojimoji.han_to_zen(text, digit=False, ascii=False)
        # spell out each run of digits in kana, prefixed by a space
        pieces = []
        for chunk in re.findall(r'\d+|\D+', text):
            pieces.append((' ' + Convert(chunk)) if chunk.isdigit() else chunk)
        return ''.join(pieces)

    def _romaji_tokens(self, words):
        """Build a list of tokens from input nodes."""
        tokens = []
        for word in words:
            prev_tok = tokens[-1] if tokens else None
            roma = self._romaji_word(word)
            tok = Token(roma, False)
            surface = word.surface
            # punctuation gets atypical spacing
            if surface in '「『' or roma in '([':
                # opening quote/bracket: space before it, none after
                if prev_tok:
                    prev_tok.space = True
            elif surface in '」』' or roma in ']).,?!:':
                # closing punctuation: no space before, one after
                if prev_tok:
                    prev_tok.space = False
                tok.space = True
            elif roma == ' ':
                tok.space = False
            else:
                tok.space = True
            tokens.append(tok)
        # remove any sokuon that never attached to a following consonant
        for tok in tokens:
            tok.surface = tok.surface.replace('ッ', '')
        return tokens

    def _romaji_word(self, word):
        """Return the romaji for a single word (node)."""
        surface = word.surface
        if surface in self.exceptions:
            return self.exceptions[surface]
        # digits should have been converted by _normalize_text already
        assert not surface.isdigit(), surface
        if surface.isascii():
            return surface
        # prefer the pronunciation field, then the kana reading, then surface
        kana = word.feature.pron or word.feature.kana or surface
        if word.is_unk:
            if word.char_type == 7:  # katakana: fall through to kana mapping
                pass
            elif word.char_type == 3:  # symbol: map each char via the table
                return ''.join(self.table.get(ch, ch) for ch in surface)
            else:
                return ''  # TODO: silently fail
        result = ''
        for idx, ch in enumerate(kana):
            nxt = kana[idx + 1] if idx < len(kana) - 1 else None
            prv = kana[idx - 1] if idx > 0 else None
            result += self._get_single_mapping(prv, ch, nxt)
        return result

    def _get_single_mapping(self, pk, kk, nk):
        """Given a single kana and its neighbors, return the mapped romaji."""
        # Iteration marks (odoriji). Rarely useful at present because
        # odoriji are not left in readings for dictionary words, and kana
        # cannot be followed across word boundaries.
        if kk in ODORI:
            if kk in 'ゝヽ':  # plain repeat
                return pk if pk else ''  # no predecessor is invalid; be nice
            if kk in 'ゞヾ':  # repeat with voicing
                if not pk:
                    return ''
                voiced = add_dakuten(pk)
                return self.table[voiced] if voiced else ''
            # remaining are 々 (kanji) and 〃 (symbols); their span cannot
            # be inferred reliably here (nor can rendaku be handled)
            return ''
        # digraphs take priority over single-kana lookups
        if pk and pk + kk in self.table:
            return self.table[pk + kk]
        if nk and kk + nk in self.table:
            return ''  # consumed by the digraph starting at this kana
        if nk and nk in SUTEGANA:
            if kk == 'ッ':
                return ''  # never valid, just ignore
            # swap kk's vowel for the small kana's reading
            return self.table[kk][:-1] + self.table[nk]
        if kk in SUTEGANA:
            return ''
        if kk == 'ー':  # 長音符: lengthen the preceding vowel
            return 'ː'
        if kk in 'っッ':  # sokuon geminates the following consonant
            tnk = self.table.get(nk)
            if tnk and tnk[0] in 'bdɸɡhçijkmnɲoprstɯvwz':
                return tnk[0]
            return kk
        if kk in 'んン':  # moraic nasal assimilates to what follows
            # https://en.wikipedia.org/wiki/N_(kana)
            # m before m,p,b
            # ŋ before k,g
            # ɲ before ɲ,tɕ,dʑ
            # n before n,t,d,r,z
            # ɴ otherwise
            tnk = self.table.get(nk)
            if tnk:
                if tnk[0] in 'mpb':
                    return 'm'
                if tnk[0] in 'kɡ':
                    return 'ŋ'
                if any(tnk.startswith(p) for p in ('ɲ', 'tɕ', 'dʑ')):
                    return 'ɲ'
                if tnk[0] in 'ntdrz':
                    return 'n'
            return 'ɴ'
        return self.table.get(kk, '')