English to Tagin translator v0.1.0 - initial commit
Browse files- .gitattributes +1 -0
- app.py +120 -0
- models/config.json +56 -0
- models/generation_config.json +16 -0
- models/model.safetensors +3 -0
- models/source.spm +0 -0
- models/special_tokens_map.json +5 -0
- models/target.spm +3 -0
- models/tokenizer_config.json +38 -0
- models/training_args.bin +3 -0
- models/vocab.json +0 -0
- requirements.txt +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
models/target.spm filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from transformers import pipeline
|
2 |
+
from datasets import Dataset
|
3 |
+
import streamlit as st
|
4 |
+
|
5 |
+
# Page-wide settings: browser-tab title, tab icon, and wide layout.
_page_name = "English to Tagin Translator"
st.set_page_config(page_title=_page_name, page_icon=":repeat:", layout="wide")

# Main heading followed by the one-line welcome message.
st.title(":repeat: " + _page_name)
_welcome_text = (
    "Welcome to the English to Tagin Translator. :sparkles: "
    "Simply enter your text in English, and get the translation in Tagin instantly! :thumbsup:"
)
st.markdown(_welcome_text)
|
15 |
+
|
16 |
+
# Directory (relative to the working directory) that holds the fine-tuned
# model weights and tokenizer files.
model_directory = "models"


@st.cache_resource
def _load_translation_pipeline(model_dir):
    """Build the translation pipeline once and reuse it across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the model would be reloaded from disk each time.

    Args:
        model_dir: Path of the directory containing the model and tokenizer.

    Returns:
        A ``transformers`` translation pipeline, placed on the first CUDA
        device when one is usable and on the CPU otherwise.
    """
    try:
        import torch

        device = 0 if torch.cuda.is_available() else -1
    except ImportError:
        # PyTorch is not installed here; fall back to CPU placement.
        device = -1
    return pipeline(
        task="translation",
        model=model_dir,
        tokenizer=model_dir,
        device=device,
    )


def _capitalize_first(text):
    """Upper-case only the first character and leave the rest untouched.

    ``str.capitalize()`` would also lower-case every following character,
    which mangles proper nouns and acronyms in the translation.
    """
    return text[:1].upper() + text[1:]


def _result_html(text):
    """Wrap the translation in the styled heading, escaping HTML metacharacters."""
    import html

    return f'<h2 class="result-text">{html.escape(text)}</h2>'


def _clear_input():
    """Button callback: empty the text area through its session-state key."""
    st.session_state.text_input = ""


# Text input. The widget is bound to ``st.session_state.text_input`` through
# its key, which is what lets the "Clear Input" callback reset it.
text_input = st.text_area(
    "Enter English text to translate",
    height=150,
    key="text_input",
)

# Initialize the pipeline for translation (cached after the first run).
translation_pipeline = _load_translation_pipeline(model_directory)

# Translate button
if st.button("Translate", key="translate_button"):
    if text_input.strip():
        with st.spinner("Translating... Please wait"):
            try:
                # The pipeline returns one dict per input sentence.
                outputs = translation_pipeline(text_input)
                result = _capitalize_first(outputs[0]["translation_text"])
            except Exception as e:
                # UI boundary: show the failure to the user instead of crashing.
                st.error(f"Translation error: {e}")
            else:
                st.markdown("#### Translated text:")
                st.markdown(_result_html(result), unsafe_allow_html=True)
    else:
        st.warning("Please enter text to translate.")

# Clear input button. The reset happens in a callback because widget-bound
# session state may not be modified after the widget has been instantiated.
st.button("Clear Input", on_click=_clear_input)
|
61 |
+
|
62 |
+
st.markdown("<br>" * 3, unsafe_allow_html=True)
|
63 |
+
st.markdown("""
|
64 |
+
❗❗❗ **Please note:** The English-to-Tagin translator is still in its initial phase, so it may provide incorrect translations at times. Your understanding is appreciated!
|
65 |
+
|
66 |
+
🤝 **For contributions or inquiries, feel free to contact me!**
|
67 |
+
|
68 |
+
""")
|
69 |
+
|
70 |
+
st.markdown("""
|
71 |
+
### Tagin Language
|
72 |
+
|
73 |
+
Tagin is a beautiful language spoken by the Tagin tribe and belongs to the Tani group of Sino-Tibetan languages. You'll mainly find Tagin speakers in the Upper Subansiri, Shiyomi, Kara Dadi, Kurung Kumey, and Papum Pare districts of Arunachal Pradesh, India. While about 63,000 people speak Tagin as their mother tongue (according to the 2011 Census of India), UNESCO has marked it as 'definitely endangered', which means it's at risk of disappearing. Unfortunately, very few written materials exist in Tagin, which makes it hard to study and preserve the language.
|
74 |
+
|
75 |
+
As a small contribution to preserving this rich cultural heritage, I've developed this English-Tagin translator using the GinLish Corpus v0.1.0. By creating this digital tool, I hope to help keep the Tagin language alive and make it more accessible to both the Tagin community and language enthusiasts. This project is my way of giving back to society and helping protect an important piece of our cultural diversity.
|
76 |
+
|
77 |
+
|
78 |
+
""")
|
79 |
+
|
80 |
+
st.markdown("""
|
81 |
+
### GinLish Corpus v0.1.0 (2024)
|
82 |
+
|
83 |
+
I'm excited :satisfied: to share that I created the GinLish Corpus v0.1.0, which is actually the first-ever collection of matched Tagin and English sentences. The corpus contains 60,000 carefully paired sentences that capture how these languages relate to each other. To build this, I translated English sentences from the Tatoeba website into Tagin and included traditional Tagin folk stories too.
|
84 |
+
What makes this special is that I made sure to keep the true essence of the Tagin language alive in the translations. This means including Tagin sayings, cultural elements, and the unique way Tagin people express themselves. All this careful attention to detail makes the corpus really valuable for building translation tools, studying the language, and helping people learn Tagin. It's not just a simple word-for-word translation - it's a bridge between these two languages that respects and preserves Tagin's cultural identity.
|
85 |
+
|
86 |
+
Good news:smiley: for researchers and language enthusiasts - I plan to release this dataset for non-commercial use once I complete my PhD!🎓 This way, others can also contribute to preserving and studying this beautiful language.
|
87 |
+
|
88 |
+
""")
|
89 |
+
# Sidebar for About and Contact information
st.sidebar.header("About the Developer")

st.sidebar.markdown("""
Hey there! 👋

I’m **Tungon Dugi**.

Right now, I’m doing my PhD in Computer Science and Engineering at NIT Arunachal Pradesh. 🎓

I’ve got a keen interest in Natural Language Processing (NLP), Machine Translation (MT), Deep Learning, and Linguistics.

💻✨ I love exploring how tech can help preserve and promote low-resource languages, especially my own language, **Tagin**! 🌍💬""")

# Vertical gap separating the bio from the footer.
st.sidebar.markdown("<br>" * 5, unsafe_allow_html=True)

# Footer: attribution and contact lines under a horizontal rule.
st.sidebar.markdown("---")
st.sidebar.caption("Made with ❤️ by Tungon Dugi")
st.sidebar.caption("Contact: [email protected]")

# Copyright year and app version side by side.
col1, col2 = st.sidebar.columns(2)
with col1:
    st.caption("© 2024")
with col2:
    st.caption("v0.1.0")

# The trailing ten-line spacer and the literal "Bottom Footer" caption were
# placeholder text visible to users and have been removed.
|
models/config.json
ADDED
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Helsinki-NLP/opus-mt-en-hi",
|
3 |
+
"activation_dropout": 0.0,
|
4 |
+
"activation_function": "swish",
|
5 |
+
"add_bias_logits": false,
|
6 |
+
"add_final_layer_norm": false,
|
7 |
+
"architectures": [
|
8 |
+
"MarianMTModel"
|
9 |
+
],
|
10 |
+
"attention_dropout": 0.0,
|
11 |
+
"bos_token_id": 0,
|
12 |
+
"classif_dropout": 0.0,
|
13 |
+
"classifier_dropout": 0.0,
|
14 |
+
"d_model": 512,
|
15 |
+
"decoder_attention_heads": 8,
|
16 |
+
"decoder_ffn_dim": 2048,
|
17 |
+
"decoder_layerdrop": 0.0,
|
18 |
+
"decoder_layers": 6,
|
19 |
+
"decoder_start_token_id": 61949,
|
20 |
+
"decoder_vocab_size": 61950,
|
21 |
+
"dropout": 0.1,
|
22 |
+
"encoder_attention_heads": 8,
|
23 |
+
"encoder_ffn_dim": 2048,
|
24 |
+
"encoder_layerdrop": 0.0,
|
25 |
+
"encoder_layers": 6,
|
26 |
+
"eos_token_id": 0,
|
27 |
+
"extra_pos_embeddings": 61950,
|
28 |
+
"forced_eos_token_id": 0,
|
29 |
+
"id2label": {
|
30 |
+
"0": "LABEL_0",
|
31 |
+
"1": "LABEL_1",
|
32 |
+
"2": "LABEL_2"
|
33 |
+
},
|
34 |
+
"init_std": 0.02,
|
35 |
+
"is_encoder_decoder": true,
|
36 |
+
"label2id": {
|
37 |
+
"LABEL_0": 0,
|
38 |
+
"LABEL_1": 1,
|
39 |
+
"LABEL_2": 2
|
40 |
+
},
|
41 |
+
"max_length": null,
|
42 |
+
"max_position_embeddings": 512,
|
43 |
+
"model_type": "marian",
|
44 |
+
"normalize_before": false,
|
45 |
+
"normalize_embedding": false,
|
46 |
+
"num_beams": null,
|
47 |
+
"num_hidden_layers": 6,
|
48 |
+
"pad_token_id": 61949,
|
49 |
+
"scale_embedding": true,
|
50 |
+
"share_encoder_decoder_embeddings": true,
|
51 |
+
"static_position_embeddings": true,
|
52 |
+
"torch_dtype": "float32",
|
53 |
+
"transformers_version": "4.46.1",
|
54 |
+
"use_cache": true,
|
55 |
+
"vocab_size": 61950
|
56 |
+
}
|
models/generation_config.json
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bad_words_ids": [
|
3 |
+
[
|
4 |
+
61949
|
5 |
+
]
|
6 |
+
],
|
7 |
+
"bos_token_id": 0,
|
8 |
+
"decoder_start_token_id": 61949,
|
9 |
+
"eos_token_id": 0,
|
10 |
+
"forced_eos_token_id": 0,
|
11 |
+
"max_length": 512,
|
12 |
+
"num_beams": 4,
|
13 |
+
"pad_token_id": 61949,
|
14 |
+
"renormalize_logits": true,
|
15 |
+
"transformers_version": "4.46.1"
|
16 |
+
}
|
models/model.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0021e11c864a4515bbc4d35203cef19270e127811ab827bf73151b2aa673200d
|
3 |
+
size 303704440
|
models/source.spm
ADDED
Binary file (812 kB). View file
|
|
models/special_tokens_map.json
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"eos_token": "</s>",
|
3 |
+
"pad_token": "<pad>",
|
4 |
+
"unk_token": "<unk>"
|
5 |
+
}
|
models/target.spm
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f5529d3a72f8c1d5f7e357f1b6fd30e3cf58f6e1ba0401db135a118ac92f4a76
|
3 |
+
size 1067935
|
models/tokenizer_config.json
ADDED
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"0": {
|
4 |
+
"content": "</s>",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"1": {
|
12 |
+
"content": "<unk>",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"61949": {
|
20 |
+
"content": "<pad>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"clean_up_tokenization_spaces": false,
|
29 |
+
"eos_token": "</s>",
|
30 |
+
"model_max_length": 512,
|
31 |
+
"pad_token": "<pad>",
|
32 |
+
"separate_vocabs": false,
|
33 |
+
"source_lang": "eng",
|
34 |
+
"sp_model_kwargs": {},
|
35 |
+
"target_lang": "hin",
|
36 |
+
"tokenizer_class": "MarianTokenizer",
|
37 |
+
"unk_token": "<unk>"
|
38 |
+
}
|
models/training_args.bin
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7097011bb71f66fc1e7c1189e1a810f31a4375bb0f420b9cfa97c692271d5b9b
|
3 |
+
size 5368
|
models/vocab.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
Binary file (114 Bytes). View file
|
|