Spaces:

hci-lab-dcug
/

UG_Maternal_Health_Translate

Sleeping

App Files Files Community

nyarkssss committed on Aug 31

Commit

ab00c83

1 Parent(s): 5adcf83

initial commit

Browse files

Files changed (4) hide show

app.py +37 -0
flores200_codes.py +12 -0
nllb.py +63 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import gradio as gr
+from nllb import translation, NLLB_EXAMPLES
+from flores200_codes import flores_codes
+# Dropdown choices are the human-readable language names, i.e. the keys of
+# flores_codes; translation() maps the chosen name back to its code.
+lang_codes = list(flores_codes)
+# Input widgets, built up front so the Interface call below stays short.
+model_dropdown = gr.Dropdown(
+    ["twi_en_matgsmol", "nllb-200-distilled-600M"],
+    label="Model",
+    value="twi_en_matgsmol",
+)
+source_dropdown = gr.Dropdown(lang_codes, label="Source language", value="English")
+target_dropdown = gr.Dropdown(lang_codes, label="Target language", value="Akan")
+text_input = gr.Textbox(lines=5, label="Input text")
+# The inputs list follows the parameter order of translation():
+# model name, source language, target language, text.
+nllb_translate = gr.Interface(
+    fn=translation,
+    inputs=[model_dropdown, source_dropdown, target_dropdown, text_input],
+    outputs="json",
+    examples=NLLB_EXAMPLES,
+    title="NLLB Translation Demo",
+    description="Translate text from one language to another.",
+    allow_flagging="never",
+)
+# Render the interface inside a Blocks container and start the app.
+with gr.Blocks() as demo:
+    nllb_translate.render()
+demo.launch()

flores200_codes.py ADDED Viewed

	@@ -0,0 +1,12 @@

+# One "<language name><TAB><FLORES-200 code>" entry per line.
+codes_as_string = '''Acehnese (Arabic script)	ace_Arab
+Akan	aka_Latn
+English	eng_Latn
+Twi	twi_Latn
+'''
+# Keep only non-blank lines: the literal ends with a newline, and splitting on
+# '\n' used to leave a trailing empty entry that broke the unpacking below.
+codes_as_string = [line for line in codes_as_string.splitlines() if line.strip()]
+# Maps human-readable language name -> language code (e.g. "Akan" -> "aka_Latn").
+flores_codes = {}
+for code in codes_as_string:
+    # Split on the last whitespace run: names may contain spaces, codes do not.
+    lang, lang_code = code.strip().rsplit(None, 1)
+    flores_codes[lang] = lang_code

nllb.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import os
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+from flores200_codes import flores_codes
+# Hub credential: the HF_TOKEN environment variable when it is set and
+# non-empty, otherwise True.
+# NOTE(review): True presumably makes transformers fall back to the locally
+# stored login token; `use_auth_token` is also reported as deprecated in favour
+# of `token` in newer transformers releases -- confirm against the installed
+# version before switching.
+hf_token = auth_token = os.environ.get("HF_TOKEN") or True
+# Process-wide cache of loaded objects, keyed "<model_name>_model" and
+# "<model_name>_tokenizer".
+model_dict = {}
+def load_models(model_name: str):
+    """Ensure the model and tokenizer for ``model_name`` are loaded.
+
+    Args:
+        model_name: Short model name as offered in the UI; must be one of the
+            keys of the repository table below.
+
+    Returns:
+        The shared ``model_dict`` cache, containing at least the entries
+        ``model_name + "_model"`` and ``model_name + "_tokenizer"``.
+
+    Raises:
+        KeyError: If ``model_name`` is not a known short name.
+    """
+    model_key = model_name + "_model"
+    tokenizer_key = model_name + "_tokenizer"
+    # Reuse already-loaded objects; previously every call reloaded both the
+    # model and the tokenizer even though they were stored in model_dict.
+    if model_key in model_dict and tokenizer_key in model_dict:
+        return model_dict
+    # Short UI name -> Hugging Face Hub repository id.
+    repo_id = {
+        "twi_en_matgsmol": "nyarkssss/twi_en_matgsmol",
+        "nllb-200-distilled-600M": "facebook/nllb-200-distilled-600M",
+    }[model_name]
+    print("\tLoading model: %s" % model_name)
+    model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, use_auth_token=auth_token)
+    tokenizer = AutoTokenizer.from_pretrained(repo_id, use_auth_token=auth_token)
+    model_dict[model_key] = model
+    model_dict[tokenizer_key] = tokenizer
+    return model_dict
+def translation(model_name: str, source, target, text: str):
+    """Translate ``text`` with the chosen model and describe the result.
+
+    Args:
+        model_name: Short model name understood by ``load_models``.
+        source: Source language name; must be a key of ``flores_codes``.
+        target: Target language name; must be a key of ``flores_codes``.
+        text: Text to translate.
+
+    Returns:
+        A dict with the source language code under "source", the target
+        language code under "target" and the translated text under "result".
+    """
+    loaded = load_models(model_name)
+    # The UI passes language names; the pipeline is given the mapped codes.
+    src_code, tgt_code = flores_codes[source], flores_codes[target]
+    translator = pipeline(
+        "translation",
+        model=loaded[model_name + "_model"],
+        tokenizer=loaded[model_name + "_tokenizer"],
+        src_lang=src_code,
+        tgt_lang=tgt_code,
+    )
+    # Only the first returned item is used.
+    first_item = translator(text, max_length=400)[0]
+    return {
+        "source": src_code,
+        "target": tgt_code,
+        "result": first_item["translation_text"],
+    }
+# Example rows for the demo UI. Columns, in order: model name, source language
+# name, target language name, input text.
+NLLB_EXAMPLES = [
+    [
+        "nllb-200-distilled-600M",
+        "English",
+        "Akan",
+        "Hello, how are you today?",
+    ],
+    [
+        "nllb-200-distilled-600M",
+        "Akan",
+        "English",
+        "Me adwuma anopa yi.",
+    ],
+    [
+        "nllb-200-distilled-600M",
+        "English",
+        "Akan",
+        "The government needs to invest more in education to secure the country's future.",
+    ],
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+#git+https://github.com/huggingface/transformers
+#gradio
+#torch
+gradio
+transformers
+torch
+torchaudio