# Hugging Face Space: UNESCO NLLB translation demo (runs on ZeroGPU hardware).
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from flores import code_mapping
import platform
import torch
import nltk
# Sentence tokenizer data used by translate() to split paragraphs into sentences.
nltk.download("punkt_tab")

# Languages the model nominally supports but that are excluded as translation targets.
REMOVED_TARGET_LANGUAGES = {"Ligurian", "Lombard", "Sicilian"}

# Run on CPU when developing on macOS (Darwin); otherwise assume a CUDA GPU.
device = "cpu" if platform.system() == "Darwin" else "cuda"

MODEL_NAME = "facebook/nllb-200-3.3B"

# Re-sort the {language name -> FLORES code} mapping by code so dropdowns are stable.
code_mapping = dict(sorted(code_mapping.items(), key=lambda item: item[1]))
flores_codes = list(code_mapping.keys())
# Targets are all source languages minus the removed set.
target_languages = [language for language in flores_codes if language not in REMOVED_TARGET_LANGUAGES]
def load_model():
    """Download the NLLB checkpoint and move it to the active device."""
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
    seq2seq = seq2seq.to(device)
    print(f"Model loaded in {device}")
    return seq2seq


# Load once at import time so every request reuses the same model instance.
model = load_model()
def load_tokenizer(src_lang, tgt_lang):
    """Return a tokenizer configured for the given source/target language pair.

    Language display names are mapped to their FLORES-200 codes before being
    handed to the tokenizer.
    """
    return AutoTokenizer.from_pretrained(
        MODEL_NAME,
        src_lang=code_mapping[src_lang],
        tgt_lang=code_mapping[tgt_lang],
    )
@spaces.GPU
def translate(text: str, src_lang: str, tgt_lang: str) -> str:
    """Translate *text* from *src_lang* to *tgt_lang* (language display names).

    Paragraph structure (newlines) is preserved; each paragraph is split into
    sentences with NLTK and translated sentence by sentence so inputs stay
    within the model's context.

    Returns the translated text with the original paragraph breaks.
    """
    tokenizer = load_tokenizer(src_lang, tgt_lang)

    translated_paragraphs = []
    for paragraph in text.split("\n"):
        translated_sentences = []
        for sentence in nltk.sent_tokenize(paragraph):
            # Keep the encoded tensor on-device directly; the original code did a
            # wasteful tensor -> cpu -> numpy -> list -> tensor roundtrip here.
            input_ids = tokenizer(sentence, return_tensors="pt").input_ids.to(device)
            output_ids = model.generate(
                input_ids=input_ids,
                # Force decoding to start with the target-language token.
                forced_bos_token_id=tokenizer.convert_tokens_to_ids(code_mapping[tgt_lang]),
                # Allow some expansion over the source length; 50 is a heuristic cap.
                max_length=input_ids.shape[1] + 50,
                num_return_sequences=1,
            )
            translated_sentences.append(
                tokenizer.decode(output_ids[0], skip_special_tokens=True)
            )
        translated_paragraphs.append(" ".join(translated_sentences))

    return "\n".join(translated_paragraphs)
# Intro text rendered as Markdown at the top of the demo page.
description = """
UNESCO, Meta, and Hugging Face have come together to create an accessible, high-quality translation experience in 200 languages.
This is made possible through an open approach to AI innovation using Meta’s open-sourced No Language Left Behind (NLLB) AI model, hosted on Hugging Face Spaces.
"""
# Build and launch the Gradio UI: language pickers, input/output boxes, and a
# button wired to translate().
with gr.Blocks() as demo:
    gr.Markdown("# UNESCO Language Translator, powered by Meta and Hugging Face")
    gr.Markdown(description)
    with gr.Row():
        src_lang = gr.Dropdown(label="Source Language", choices=flores_codes)
        # Target choices exclude REMOVED_TARGET_LANGUAGES.
        target_lang = gr.Dropdown(label="Target Language", choices=target_languages)
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=6)
    with gr.Row():
        btn = gr.Button("Translate text")
    with gr.Row():
        output = gr.Textbox(label="Output Text", lines=6)
    btn.click(
        translate,
        inputs=[input_text, src_lang, target_lang],
        outputs=output,
    )

# Note: the original line ended with a stray " |" scrape artifact, which was a
# syntax error; it has been removed.
demo.launch()