# indic-seamless / lang_list.py
# Initial commit e9706fe by AshwinSankar
# Language dict
# Maps each three-letter language code used by this module to its English
# display name.
language_code_to_name = {
    "asm": "Assamese",
    "ben": "Bengali",
    "guj": "Gujarati",
    "hin": "Hindi",
    "kan": "Kannada",
    "mal": "Malayalam",
    "mar": "Marathi",
    "ory": "Odia",
    "pan": "Punjabi",
    "tam": "Tamil",
    "tel": "Telugu",
    "urd": "Urdu",
    "eng": "English",
}
# Reverse lookup: display name -> language code. Display names must stay
# unique, otherwise later entries would overwrite earlier ones here.
LANGUAGE_NAME_TO_CODE = {name: code for code, name in language_code_to_name.items()}

# NOTE(review): the task abbreviations below presumably follow the usual
# speech/text translation naming (S2ST = speech-to-speech, S2TT =
# speech-to-text, T2ST = text-to-speech, T2TT = text-to-text translation,
# ASR = speech recognition) -- confirm against the consuming application.

# Source langs: S2ST / S2TT / ASR don't need source lang
# T2TT / T2ST use this
text_source_language_codes = [
    "asm",
    "ben",
    "guj",
    "hin",
    "kan",
    "mal",
    "mar",
    "ory",
    "pan",
    "tam",
    "tel",
    "urd",
    "eng",
]
# Display names of the text source languages, sorted alphabetically.
TEXT_SOURCE_LANGUAGE_NAMES = sorted(
    language_code_to_name[code] for code in text_source_language_codes
)

# Target langs:
# S2ST / T2ST
s2st_target_language_codes = [
    "asm",
    "ben",
    "guj",
    "hin",
    "kan",
    "mal",
    "mar",
    "ory",
    "pan",
    "tam",
    "tel",
    "urd",
    "eng",
]
# Display names of the S2ST target languages, sorted alphabetically.
S2ST_TARGET_LANGUAGE_NAMES = sorted(
    language_code_to_name[code] for code in s2st_target_language_codes
)
# Each per-task list below is its own copy rather than an alias, so an
# in-place change to one task's list cannot silently alter another's.
T2ST_TARGET_LANGUAGE_NAMES = list(S2ST_TARGET_LANGUAGE_NAMES)

# S2TT / T2TT / ASR
S2TT_TARGET_LANGUAGE_NAMES = list(TEXT_SOURCE_LANGUAGE_NAMES)
T2TT_TARGET_LANGUAGE_NAMES = list(TEXT_SOURCE_LANGUAGE_NAMES)
ASR_TARGET_LANGUAGE_NAMES = list(TEXT_SOURCE_LANGUAGE_NAMES)