nyarkssss committed on
Commit
ab00c83
·
1 Parent(s): 5adcf83

initial commit

Browse files
Files changed (4) hide show
  1. app.py +37 -0
  2. flores200_codes.py +12 -0
  3. nllb.py +63 -0
  4. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from nllb import translation, NLLB_EXAMPLES
3
+ from flores200_codes import flores_codes
4
+
5
# Language names (the keys of the FLORES-200 table) offered in both
# language dropdowns.
lang_codes = list(flores_codes)

# Input widgets, listed in the same order as the parameters of
# `translation`: model name, source language, target language, text.
translation_inputs = [
    gr.Dropdown(
        choices=["twi_en_matgsmol", "nllb-200-distilled-600M"],
        value="twi_en_matgsmol",
        label="Model",
    ),
    gr.Dropdown(choices=lang_codes, value="English", label="Source language"),
    gr.Dropdown(choices=lang_codes, value="Akan", label="Target language"),
    gr.Textbox(label="Input text", lines=5),
]

# The translation result (a dict) is displayed through the "json" output.
nllb_translate = gr.Interface(
    fn=translation,
    inputs=translation_inputs,
    outputs="json",
    examples=NLLB_EXAMPLES,
    title="NLLB Translation Demo",
    description="Translate text from one language to another.",
    allow_flagging="never",
)

# Render the interface inside a Blocks container and launch it when this
# module is executed.
with gr.Blocks() as demo:
    nllb_translate.render()

demo.launch()
flores200_codes.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Language table: one entry per line, written as the human-readable language
# name followed by its language code, separated by whitespace.
codes_as_string = '''Acehnese (Arabic script) ace_Arab
Akan aka_Latn
English eng_Latn
Twi twi_Latn
'''

# Keep only non-blank lines. The literal ends with a newline, so a plain
# split('\n') produces a trailing empty entry that cannot be unpacked into
# a (name, code) pair.
codes_as_string = [line for line in codes_as_string.split('\n') if line.strip()]

# Maps language name -> language code, e.g. "English" -> "eng_Latn".
flores_codes = {}
for code in codes_as_string:
    # The code is always the last whitespace-free token; the name may itself
    # contain spaces ("Acehnese (Arabic script)"), so split from the right.
    # This works whether the separator is a tab or a space.
    lang, lang_code = code.rsplit(maxsplit=1)
    flores_codes[lang] = lang_code
nllb.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
3
+ from flores200_codes import flores_codes
4
+
5
# Hub credential: the HF_TOKEN environment variable when it is set and
# non-empty, otherwise the literal True.
# NOTE(review): True is forwarded to `use_auth_token`; presumably that asks
# the Hub client to use a locally stored login token -- confirm.
auth_token = os.environ.get("HF_TOKEN") or True
hf_token = auth_token

# Module-level store for loaded objects, keyed "<model_name>_model" and
# "<model_name>_tokenizer".
model_dict = {}
7
+
8
+
9
def load_models(model_name: str) -> dict:
    """Return the shared model store with ``model_name`` loaded into it.

    The model and tokenizer are loaded with ``from_pretrained`` the first
    time a given ``model_name`` is requested and stored in the module-level
    ``model_dict`` under ``"<model_name>_model"`` and
    ``"<model_name>_tokenizer"``. Later calls for the same name reuse the
    stored objects instead of loading them again.

    Args:
        model_name: Short model name; one of ``"twi_en_matgsmol"`` or
            ``"nllb-200-distilled-600M"``.

    Returns:
        The module-level ``model_dict`` (the same object on every call).

    Raises:
        KeyError: If ``model_name`` is not one of the known short names.
    """
    # Resolve the repository id first so an unknown name fails with KeyError
    # before any loading work is attempted.
    repo_id = {
        "twi_en_matgsmol": "nyarkssss/twi_en_matgsmol",
        "nllb-200-distilled-600M": "facebook/nllb-200-distilled-600M",
    }[model_name]

    model_key = model_name + "_model"
    tokenizer_key = model_name + "_tokenizer"

    # Only load when either half is missing; otherwise every call would
    # rebuild the model and tokenizer from scratch.
    if model_key not in model_dict or tokenizer_key not in model_dict:
        print("\tLoading model: %s" % model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, use_auth_token=auth_token)
        tokenizer = AutoTokenizer.from_pretrained(repo_id, use_auth_token=auth_token)
        # Store both only after both loads succeeded, so a failed load never
        # leaves a half-populated entry behind.
        model_dict[model_key] = model
        model_dict[tokenizer_key] = tokenizer

    return model_dict
23
+
24
+
25
def translation(model_name: str, source, target, text: str):
    """Translate ``text`` with the selected model and report the result.

    Args:
        model_name: Short model name understood by ``load_models``.
        source: Source language name; must be a key of ``flores_codes``.
        target: Target language name; must be a key of ``flores_codes``.
        text: The text to translate.

    Returns:
        A dict with the source language code (``"source"``), the target
        language code (``"target"``) and the translated text (``"result"``).
    """
    loaded = load_models(model_name)

    # Convert the human-readable language names to their language codes.
    src_code = flores_codes[source]
    tgt_code = flores_codes[target]

    translator = pipeline(
        "translation",
        model=loaded[model_name + "_model"],
        tokenizer=loaded[model_name + "_tokenizer"],
        src_lang=src_code,
        tgt_lang=tgt_code,
    )

    # Only the first returned entry's "translation_text" is used.
    predictions = translator(text, max_length=400)
    translated = predictions[0]["translation_text"]

    return {
        "source": src_code,
        "target": tgt_code,
        "result": translated,
    }
52
+
53
+
54
# Every example row uses the same model.
_EXAMPLE_MODEL = "nllb-200-distilled-600M"

# Example rows in the argument order of `translation`:
# [model name, source language, target language, input text].
NLLB_EXAMPLES = [
    [_EXAMPLE_MODEL, "English", "Akan", "Hello, how are you today?"],
    [_EXAMPLE_MODEL, "Akan", "English", "Me adwuma anopa yi."],
    [
        _EXAMPLE_MODEL,
        "English",
        "Akan",
        "The government needs to invest more in education to secure the country's future.",
    ],
]
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ #git+https://github.com/huggingface/transformers
2
+ #gradio
3
+ #torch
4
+ gradio
5
+ transformers
6
+ torch
7
+ torchaudio