Update app.py
app.py CHANGED
@@ -11,42 +11,95 @@ access_token = os.getenv('token')
 # Set up device
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-
-
+chat_language = 'sin_Sinh'
+
+trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
 eng_trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
-translator = pipeline('translation', model=trans_model, tokenizer=eng_trans_tokenizer, src_lang="eng_Latn", tgt_lang='sin_Sinh', max_length=400, device=device)
 
-
-
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+translator = pipeline('translation', model=trans_model, tokenizer=eng_trans_tokenizer, src_lang="eng_Latn", tgt_lang=chat_language, max_length=400, device=device)
+
+# Initialize translation pipelines
+pipe = pipeline("translation", model="thilina/mt5-sinhalese-english")
+
+trans_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
+eng_trans_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")
 
-
+sin_trans_model = AutoModelForSeq2SeqLM.from_pretrained("thilina/mt5-sinhalese-english")
+si_trans_tokenizer = AutoTokenizer.from_pretrained("thilina/mt5-sinhalese-english")
+
+singlish_pipe = pipeline("text2text-generation", model="Dhahlan2000/Simple_Translation-model-for-GPT-v4")
 
 # Translation functions
 def translate_Singlish_to_sinhala(text):
+
     translated_text = singlish_pipe(f"translate Singlish to Sinhala: {text}", clean_up_tokenization_spaces=False)[0]['generated_text']
+
     return translated_text
 
 def translate_english_to_sinhala(text):
-
-
-
+    # Split the text into sentences or paragraphs
+    parts = text.split("\n")  # Split by new lines for paragraphs, adjust as needed
+    translated_parts = []
+    for part in parts:
+        translated_part = translator(part, clean_up_tokenization_spaces=False)[0]['translation_text']
+        translated_parts.append(translated_part)
+    # Join the translated parts back together
+    translated_text = "\n".join(translated_parts)
+    return translated_text.replace("ප් රභූවරුන්", "")
 
 def translate_sinhala_to_english(text):
-
+    # Split the text into sentences or paragraphs
+    parts = text.split("\n")  # Split by new lines for paragraphs, adjust as needed
     translated_parts = []
     for part in parts:
-
+        # Tokenize each part
+        inputs = si_trans_tokenizer(part.strip(), return_tensors="pt", padding=True, truncation=True, max_length=512)
+        # Generate translation
         outputs = sin_trans_model.generate(**inputs)
+        # Decode translated text while preserving formatting
         translated_part = si_trans_tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
         translated_parts.append(translated_part)
-
+    # Join the translated parts back together
+    translated_text = "\n".join(translated_parts)
+    return translated_text
 
 def transliterate_from_sinhala(text):
-
-
+    # Define the source and target scripts
+    source_script = 'Sinhala'
+    target_script = 'Velthuis'
+
+    # Perform transliteration
+    latin_text = transliterate.process(source_script, target_script, text)
+
+    # Convert to a list to allow modification
+    latin_text_list = list(latin_text)
+
+    # Strip the '.', '*' and '"' marks the Velthuis scheme inserts
+    i = 0
+    for i in range(len(latin_text_list) - 1):
+        if latin_text_list[i] == '.':
+            latin_text_list[i] = ''
+        if latin_text_list[i] == '*':
+            latin_text_list[i] = ''
+        if latin_text_list[i] == '\"':
+            latin_text_list[i] = ''
+
+    # Convert back to a string
+    latin_text = ''.join(latin_text_list)
+
+    return latin_text.lower()
 
 def transliterate_to_sinhala(text):
-
+
+    # Define the source and target scripts
+    source_script = 'Velthuis'
+    target_script = 'Sinhala'
+
+    # Perform transliteration
+    latin_text = transliterate.process(source_script, target_script, text)
+    return latin_text
 
 # Placeholder for conversation model loading and pipeline setup
 # pipe1 = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
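
The `transliterate.process(source_script, target_script, text)` calls in the diff match the signature of the aksharamukha package's transliteration module; a minimal sketch under that assumption (the import and sample string below are illustrative, not taken from app.py):

# Assumption: `transliterate` in app.py is aksharamukha's module, whose
# process(src_script, tgt_script, text) converts text between the named scripts.
from aksharamukha import transliterate

sinhala = transliterate.process('Velthuis', 'Sinhala', 'mama gedara yanavaa')
romanized = transliterate.process('Sinhala', 'Velthuis', sinhala)
print(romanized.lower())  # same lower-casing that transliterate_from_sinhala applies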
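For orientation, the new helpers compose into a round trip: romanized "Singlish" input is first rewritten in Sinhala script, can then be translated to English and back (NLLB addresses languages by FLORES-200 codes, hence "eng_Latn" and "sin_Sinh"), and is finally romanized again. A short sketch, assuming the models above have loaded; the input string is hypothetical:

# Hypothetical round trip through the helpers defined in the diff.
text = "mama gedara yanawa"                        # romanized Sinhala ("Singlish") input
sinhala = translate_Singlish_to_sinhala(text)      # Singlish -> Sinhala script (singlish_pipe)
english = translate_sinhala_to_english(sinhala)    # Sinhala -> English (mt5-sinhalese-english)
reply = translate_english_to_sinhala(english)      # English -> Sinhala (NLLB translator)
print(transliterate_from_sinhala(reply))           # Sinhala -> lower-case Velthuis romanization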