Update app.py
app.py CHANGED
@@ -11,42 +11,95 @@ access_token = os.getenv('token')
 # Set up device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-
-
+chat_language = 'sin_Sinh'
+
+trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
 eng_trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
-translator = pipeline('translation', model=trans_model, tokenizer=eng_trans_tokenizer, src_lang="eng_Latn", tgt_lang='sin_Sinh', max_length=400, device=device)
 
-
-
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+translator = pipeline('translation', model=trans_model, tokenizer=eng_trans_tokenizer, src_lang="eng_Latn", tgt_lang=chat_language, max_length=400, device=device)
+
+# Initialize translation pipelines
+pipe = pipeline("translation", model="thilina/mt5-sinhalese-english")
+
+trans_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
+eng_trans_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
 
-
+sin_trans_model = AutoModelForSeq2SeqLM.from_pretrained("thilina/mt5-sinhalese-english")
+si_trans_tokenizer = AutoTokenizer.from_pretrained("thilina/mt5-sinhalese-english")
+
+singlish_pipe = pipeline("text2text-generation", model="Dhahlan2000/Simple_Translation-model-for-GPT-v4")
 
 # Translation functions
 def translate_Singlish_to_sinhala(text):
+
     translated_text = singlish_pipe(f"translate Singlish to Sinhala: {text}", clean_up_tokenization_spaces=False)[0]['generated_text']
+
     return translated_text
 
 def translate_english_to_sinhala(text):
-
-
-
+    # Split the text into sentences or paragraphs
+    parts = text.split("\n")  # Split by new lines for paragraphs, adjust as needed
+    translated_parts = []
+    for part in parts:
+        translated_part = translator(part, clean_up_tokenization_spaces=False)[0]['translation_text']
+        translated_parts.append(translated_part)
+    # Join the translated parts back together
+    translated_text = "\n".join(translated_parts)
+    return translated_text.replace("ප් රභූවරුන්", "")
 
 def translate_sinhala_to_english(text):
-
+    # Split the text into sentences or paragraphs
+    parts = text.split("\n")  # Split by new lines for paragraphs, adjust as needed
     translated_parts = []
     for part in parts:
-
+        # Tokenize each part
+        inputs = si_trans_tokenizer(part.strip(), return_tensors="pt", padding=True, truncation=True, max_length=512)
+        # Generate translation
         outputs = sin_trans_model.generate(**inputs)
+        # Decode translated text while preserving formatting
         translated_part = si_trans_tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
         translated_parts.append(translated_part)
-
+    # Join the translated parts back together
+    translated_text = "\n".join(translated_parts)
+    return translated_text
 
 def transliterate_from_sinhala(text):
-
-
+    # Define the source and target scripts
+    source_script = 'Sinhala'
+    target_script = 'Velthuis'
+
+    # Perform transliteration
+    latin_text = transliterate.process(source_script, target_script, text)
+
+    # Convert to a list to allow modification
+    latin_text_list = list(latin_text)
+
+    # Strip the '.', '*' and '"' marks the Velthuis scheme inserts
+    i = 0
+    for i in range(len(latin_text_list) - 1):
+        if latin_text_list[i] == '.':
+            latin_text_list[i] = ''
+        if latin_text_list[i] == '*':
+            latin_text_list[i] = ''
+        if latin_text_list[i] == '\"':
+            latin_text_list[i] = ''
+
+    # Convert back to a string
+    latin_text = ''.join(latin_text_list)
+
+    return latin_text.lower()
 
 def transliterate_to_sinhala(text):
-
+
+    # Define the source and target scripts
+    source_script = 'Velthuis'
+    target_script = 'Sinhala'
+
+    # Perform transliteration
+    latin_text = transliterate.process(source_script, target_script, text)
+    return latin_text
 
 # Placeholder for conversation model loading and pipeline setup
 # pipe1 = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
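
The `transliterate.process(source_script, target_script, text)` calls in the diff match the signature of the aksharamukha package's transliteration module; a minimal sketch under that assumption (the import and sample string below are illustrative, not taken from app.py):

# Assumption: `transliterate` in app.py is aksharamukha's module, whose
# process(src_script, tgt_script, text) converts text between the named scripts.
from aksharamukha import transliterate

sinhala = transliterate.process('Velthuis', 'Sinhala', 'mama gedara yanavaa')
romanized = transliterate.process('Sinhala', 'Velthuis', sinhala)
print(romanized.lower())  # same lower-casing that transliterate_from_sinhala applies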
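For orientation, the new helpers compose into a round trip: romanized "Singlish" input is first rewritten in Sinhala script, can then be translated to English and back (NLLB addresses languages by FLORES-200 codes, hence "eng_Latn" and "sin_Sinh"), and is finally romanized again. A short sketch, assuming the models above have loaded; the input string is hypothetical:

# Hypothetical round trip through the helpers defined in the diff.
text = "mama gedara yanawa"                        # romanized Sinhala ("Singlish") input
sinhala = translate_Singlish_to_sinhala(text)      # Singlish -> Sinhala script (singlish_pipe)
english = translate_sinhala_to_english(sinhala)    # Sinhala -> English (mt5-sinhalese-english)
reply = translate_english_to_sinhala(english)      # English -> Sinhala (NLLB translator)
print(transliterate_from_sinhala(reply))           # Sinhala -> lower-case Velthuis romanization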