Sunday01's picture
up
9dce458
import os
import ctranslate2
import sentencepiece as spm
from typing import List
from .common import OfflineTranslator
# Adapted from:
# https://gist.github.com/ymoslem/a414a0ead0d3e50f4d7ff7110b1d1c0d
# https://github.com/ymoslem/DesktopTranslator
class M2M100Translator(OfflineTranslator):
# Refer to https://github.com/ymoslem/DesktopTranslator/blob/main/utils/m2m_languages.json
# other languages can be added as well
_LANGUAGE_CODE_MAP = {
'CHS': '__zh__',
'CHT': '__zh__',
'CSY': '__cs__',
'NLD': '__nl__',
'ENG': '__en__',
'FRA': '__fr__',
'DEU': '__de__',
'HUN': '__hu__',
'ITA': '__it__',
'JPN': '__ja__',
'KOR': '__ko__',
'PLK': '__pl__',
'PTB': '__pt__',
'ROM': '__ro__',
'RUS': '__ru__',
'ESP': '__es__',
'TRK': '__tr__',
'UKR': '__uk__',
'VIN': '__vi__',
'ARA': '__ar__',
'SRP': '__sr__',
'HRV': '__hr__',
'THA': '__th__',
'IND': '__id__'
}
_MODEL_SUB_DIR = os.path.join(OfflineTranslator._MODEL_SUB_DIR, 'm2m_100')
_CT2_MODEL_DIR = 'm2m100_418m'
_MODEL_MAPPING = {
'models': {
'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/m2m100_418m_ct2.zip',
'hash': '8a9cd0e00505a7879f26e5a1b396b447bc29967783a1e17e8df5eecb0c13d1c3',
'archive': {
'm2m100_418m/': '.',
},
},
}
async def _load(self, from_lang: str, to_lang: str, device: str):
self.load_params = {
'from_lang': from_lang,
'to_lang': to_lang,
'device': device,
}
self.model = ctranslate2.Translator(
model_path=self._get_file_path(self._CT2_MODEL_DIR),
device=device,
device_index=0,
)
self.model.load_model()
self.sentence_piece_processor = spm.SentencePieceProcessor(model_file=self._get_file_path(self._CT2_MODEL_DIR, 'sentencepiece.model'))
async def _unload(self):
self.model.unload_model()
del self.model
del self.sentence_piece_processor
async def _infer(self, from_lang: str, to_lang: str, queries: List[str]) -> List[str]:
queries_tokenized = self.tokenize(queries, from_lang)
translated_tokenized = self.model.translate_batch(
source=queries_tokenized,
target_prefix=[[to_lang]] * len(queries),
beam_size=5,
max_batch_size=1024,
return_alternatives=False,
disable_unk=True,
replace_unknowns=True,
repetition_penalty=3,
)
translated = self.detokenize(list(map(lambda t: t[0]['tokens'], translated_tokenized)), to_lang)
return translated
def tokenize(self, queries, lang):
sp = self.sentence_piece_processor
if isinstance(queries, list):
return sp.encode(queries, out_type=str)
else:
return [sp.encode(queries, out_type=str)]
def detokenize(self, queries, lang):
sp = self.sentence_piece_processor
translation = sp.decode(queries)
prefix_len = len(lang) + 1
translation = [''.join(query)[prefix_len:] for query in translation]
return translation
class M2M100BigTranslator(M2M100Translator):
_MODEL_SUB_DIR = os.path.join(OfflineTranslator._MODEL_SUB_DIR, 'm2m_100')
_CT2_MODEL_DIR = 'm2m100_12b'
_MODEL_MAPPING = {
'models': {
'url': 'https://github.com/zyddnys/manga-image-translator/releases/download/beta-0.3/m2m100_12b_ct2.zip',
'hash': '742d5380c2837affd3680339145d37fc78f537ad633958347b76e9be9c577662',
'archive': {
'm2m100_12b/': '.',
},
},
}