File size: 4,756 Bytes
38742d7
 
71ae380
38742d7
 
71ae380
5e1003d
 
aac03a6
38742d7
74a629f
 
 
38742d7
6bcde50
38742d7
 
 
74a629f
38742d7
 
 
86d577a
3f23d73
38742d7
 
3f23d73
 
 
 
 
 
 
 
38742d7
 
 
5e1003d
3f23d73
 
e72a9c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aac03a6
e72a9c0
 
 
 
 
 
 
 
 
 
 
 
15ccfd9
 
10a52bd
38742d7
10a52bd
 
a47c01b
7dc20b3
10a52bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1473813
38742d7
 
 
 
 
74a629f
38742d7
 
 
 
 
 
 
 
 
 
 
10a52bd
 
aac03a6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flores import code_mapping
import platform
import torch
import nltk

# Sentence tokenizer models used by nltk.sent_tokenize in translate().
nltk.download("punkt_tab")

# Languages present in the FLORES mapping but excluded as translation targets.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}


# macOS has no CUDA; everywhere else assume a CUDA device is available.
device = "cpu" if platform.system() == "Darwin" else "cuda"
MODEL_NAME = "facebook/nllb-200-3.3B"

# Re-sort the language-name -> FLORES-code mapping by code so the dropdown
# ordering is stable, then derive the source and target language lists.
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
flores_codes = list(code_mapping.keys())
# Idiomatic membership test (was `not language in ...`, flagged by E713).
target_languages = [language for language in flores_codes if language not in REMOVED_TARGET_LANGUAGES]

def load_model():
    """Load the NLLB seq2seq model and move it onto the selected device."""
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    seq2seq = seq2seq.to(device)
    print(f"Model loaded in {device}")
    return seq2seq


# Load once at import time so every request reuses the same weights.
model = load_model()


def load_tokenizer(src_lang, tgt_lang):
    """Return a tokenizer configured for the given source/target language pair.

    Language names are mapped to their FLORES codes before being handed to
    the tokenizer.
    """
    src_code = code_mapping[src_lang]
    tgt_code = code_mapping[tgt_lang]
    return AutoTokenizer.from_pretrained(MODEL_NAME, src_lang=src_code, tgt_lang=tgt_code)


@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate *text* from ``src_lang`` to ``tgt_lang``.

    The input is split into paragraphs on newlines, each paragraph is split
    into sentences with NLTK, and each sentence is translated independently.
    Paragraph/newline structure is preserved in the output.

    Args:
        text: Source text; may contain multiple newline-separated paragraphs.
        src_lang: Source language name (a key of ``code_mapping``).
        tgt_lang: Target language name (a key of ``code_mapping``).

    Returns:
        The translated text with paragraphs rejoined by newlines.
    """
    tokenizer = load_tokenizer(src_lang, tgt_lang)

    # Loop-invariant: the BOS token that forces the decoder into the target
    # language. Computed once instead of once per sentence.
    tgt_token_id = tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang])

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        translated_sentences = []
        for sentence in nltk.sent_tokenize(paragraph):
            # Tokenize straight to a device tensor. The original code
            # round-tripped the ids through .cpu().numpy().tolist() and then
            # rebuilt a tensor with torch.tensor([...]) — a pointless copy.
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
            output_ids = model.generate(
                input_ids=input_ids,
                forced_bos_token_id=tgt_token_id,
                # Same budget as before: input length plus 50 tokens.
                max_length=input_ids.shape[1] + 50,
                num_return_sequences=1,
            )
            translated_sentences.append(
                tokenizer.decode(output_ids[0], skip_special_tokens=True)
            )
        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)



# Markdown rendered at the top of the Gradio page (runtime text — unchanged).
description = """
# UNESCO Language Translator, powered by Meta and Hugging Face

UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages. 

This is made possible through an open approach to AI innovation using Meta's open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces. """

# Legal/accuracy disclaimer rendered below the translator UI (runtime text — unchanged).
disclaimer = """
## Disclaimer

This translation interface, developed as part of UNESCO's work on Multilingualism and supported by Meta's No Language Left Behind AI model and Hugging Face, is designed to assist with language translation using open-source AI technologies. However, translations generated by the tool may not be accurate or perfect. While we strive to provide accurate translations, the tool may produce inaccuracies due to the complexity and nuances of different languages.

- The tool may not fully capture the context, cultural nuances, idiomatic expressions, or specific terminologies.
- Manual review and adjustment are recommended for important translations.
- The translations are provided "as is" without any warranties of any kind, either expressed or implied.
- Users should not rely solely on the tool for critical or sensitive translations and are responsible for verifying the accuracy and appropriateness of the translations for their specific needs.
- We recommend consulting with professional translators for official, legal, medical, or other critical translations.
- We shall not be liable for any direct, indirect, incidental, special, or consequential damages arising out of or in connection with the use or inability to use the translation tool, including but not limited to errors or omissions in translations.

By using this translation tool, you agree to these terms and acknowledge that the use of the tool is at your own risk.

For any feedback or support, please contact UNESCO World Atlas of Languages Team: [email protected].
"""

# Build and launch the Gradio UI. Component nesting/order defines the rendered
# layout, so statements are kept exactly as-is; only comments are added.
with gr.Blocks() as demo:
    gr.Markdown(description)
    # Row 1: source and target language selectors side by side.
    with gr.Row():
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        # Targets exclude REMOVED_TARGET_LANGUAGES; sources do not.
        target_lang = gr.Dropdown(label="Target Language", choices=target_languages)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)
    with gr.Row():
        btn = gr.Button("Translate text")
    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)
    # Wire the button to translate(text, src, tgt) -> output textbox.
    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )
    with gr.Row():
        gr.Markdown(disclaimer)
demo.launch()