Dhahlan2000 committed on
Commit
d63d2a9
·
verified ·
1 Parent(s): 36040e4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -15
app.py CHANGED
@@ -11,42 +11,95 @@ access_token = os.getenv('token')
# Set up device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load translation models and tokenizers
# English -> Sinhala: NLLB-200 model moved to the selected device and wrapped
# in a translation pipeline.
trans_model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-600M"
).to(device)
eng_trans_tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M"
)
translator = pipeline(
    'translation',
    model=trans_model,
    tokenizer=eng_trans_tokenizer,
    src_lang="eng_Latn",
    tgt_lang='sin_Sinh',
    max_length=400,
    device=device,
)

# Sinhala -> English: mT5 model and its (slow) tokenizer, used directly by
# translate_sinhala_to_english.
sin_trans_model = AutoModelForSeq2SeqLM.from_pretrained(
    "thilina/mt5-sinhalese-english"
).to(device)
si_trans_tokenizer = AutoTokenizer.from_pretrained(
    "thilina/mt5-sinhalese-english",
    use_fast=False,
)

# Singlish -> Sinhala text2text pipeline.
singlish_pipe = pipeline(
    "text2text-generation",
    model="Dhahlan2000/Simple_Translation-model-for-GPT-v14",
)
 
 
 
23
 
24
  # Translation functions
def translate_Singlish_to_sinhala(text):
    """Translate Singlish ``text`` to Sinhala with ``singlish_pipe``.

    The input is prefixed with the task prompt and the ``generated_text`` of
    the first pipeline result is returned.
    """
    first_result = singlish_pipe(
        f"translate Singlish to Sinhala: {text}",
        clean_up_tokenization_spaces=False,
    )[0]
    return first_result['generated_text']
28
 
def translate_english_to_sinhala(text):
    """Translate English ``text`` to Sinhala one line at a time.

    The text is split on newlines, every line is passed through
    ``translator``, and the results are re-joined with newlines. The literal
    "ප් රභූවරුන්" is stripped from the joined output.
    """
    sinhala_lines = []
    for line in text.split("\n"):
        result = translator(line, clean_up_tokenization_spaces=False)
        sinhala_lines.append(result[0]['translation_text'])
    joined = "\n".join(sinhala_lines)
    return joined.replace("ප් රභූවරුන්", "")
 
 
 
 
 
 
33
 
def translate_sinhala_to_english(text):
    """Translate Sinhala ``text`` to English one line at a time.

    Each newline-separated line is stripped, tokenized (truncated to 512
    tokens), moved to ``device``, run through ``sin_trans_model.generate``
    and decoded; the decoded lines are re-joined with newlines.
    """

    def _translate_line(line):
        # Tokenize a single stripped line and place the tensors on the device.
        encoded = si_trans_tokenizer(
            line.strip(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        generated = sin_trans_model.generate(**encoded)
        return si_trans_tokenizer.decode(
            generated[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    return "\n".join([_translate_line(line) for line in text.split("\n")])
 
 
43
 
def transliterate_from_sinhala(text):
    """Transliterate Sinhala ``text`` to lowercase Velthuis-based Latin text.

    After ``transliterate.process`` runs, every '.', '*' and '"' character is
    removed and the result is lower-cased.
    """
    velthuis = transliterate.process('Sinhala', 'Velthuis', text)
    for unwanted in ('.', '*', '"'):
        velthuis = velthuis.replace(unwanted, '')
    return velthuis.lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
def transliterate_to_sinhala(text):
    """Transliterate Velthuis ``text`` to Sinhala script.

    Returns whatever ``transliterate.process`` produces for the
    'Velthuis' -> 'Sinhala' conversion.
    """
    source_script, target_script = 'Velthuis', 'Sinhala'
    sinhala_text = transliterate.process(source_script, target_script, text)
    return sinhala_text
 
 
 
 
 
 
 
50
 
51
  # Placeholder for conversation model loading and pipeline setup
52
  # pipe1 = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)
 
# Set up device
# Computed once; the original evaluated the identical expression a second time
# just before building the translator pipeline.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Target-language code passed to the English translation pipeline.
chat_language = 'sin_Sinh'

# English -> chat language: NLLB-200 model and tokenizer wrapped in a pipeline.
trans_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
eng_trans_tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")

translator = pipeline(
    'translation',
    model=trans_model,
    tokenizer=eng_trans_tokenizer,
    src_lang="eng_Latn",
    tgt_lang=chat_language,
    max_length=400,
    device=device,
)

# Initialize translation pipelines
# NOTE(review): `pipe` is not used by any function visible in this section;
# kept because other parts of the file may reference it.
pipe = pipeline("translation", model="thilina/mt5-sinhalese-english")

# NOTE(review): these two assignments rebind `trans_model` and
# `eng_trans_tokenizer` to mBART-50 objects. `translator` above was already
# constructed with the NLLB objects, so it is unaffected by the rebinding.
# Confirm whether the mBART model is actually needed before removing it.
trans_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-one-to-many-mmt")
eng_trans_tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-one-to-many-mmt", src_lang="en_XX")

# Sinhala -> English: mT5 model and tokenizer used directly by
# translate_sinhala_to_english.
sin_trans_model = AutoModelForSeq2SeqLM.from_pretrained("thilina/mt5-sinhalese-english")
si_trans_tokenizer = AutoTokenizer.from_pretrained("thilina/mt5-sinhalese-english")

# Singlish -> Sinhala text2text pipeline.
singlish_pipe = pipeline("text2text-generation", model="Dhahlan2000/Simple_Translation-model-for-GPT-v4")
33
 
34
  # Translation functions
def translate_Singlish_to_sinhala(text):
    """Translate Singlish ``text`` to Sinhala with ``singlish_pipe``.

    Builds the task prompt, runs the pipeline, and returns the
    ``generated_text`` field of the first result.
    """
    prompt = f"translate Singlish to Sinhala: {text}"
    results = singlish_pipe(prompt, clean_up_tokenization_spaces=False)
    return results[0]['generated_text']
40
 
def translate_english_to_sinhala(text):
    """Translate English ``text`` with ``translator``, line by line.

    The input is split on newlines so each line is translated separately,
    then the translated lines are joined back with newlines. The literal
    "ප් රභූවරුන්" is removed from the final string.
    """
    translated_lines = [
        translator(line, clean_up_tokenization_spaces=False)[0]['translation_text']
        for line in text.split("\n")
    ]
    return "\n".join(translated_lines).replace("ප් රභූවරුන්", "")
51
 
def translate_sinhala_to_english(text):
    """Translate Sinhala ``text`` to English, line by line.

    Every newline-separated line is stripped, tokenized (truncated to 512
    tokens), passed to ``sin_trans_model.generate`` and decoded without
    special tokens. The decoded lines are joined back with newlines.
    """

    def _translate_line(line):
        # Tokenize one stripped line, generate, and decode the first sequence.
        encoded = si_trans_tokenizer(
            line.strip(),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        )
        generated = sin_trans_model.generate(**encoded)
        return si_trans_tokenizer.decode(
            generated[0],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

    return "\n".join([_translate_line(line) for line in text.split("\n")])
67
 
def transliterate_from_sinhala(text):
    """Transliterate Sinhala ``text`` to lowercase Velthuis-based Latin text.

    The text is converted with ``transliterate.process('Sinhala',
    'Velthuis', ...)``; every '.', '*' and '"' character is then deleted and
    the result is lower-cased.

    Args:
        text: Sinhala-script input string.

    Returns:
        The transliterated string with the three marker characters removed,
        in lower case.
    """
    latin_text = transliterate.process('Sinhala', 'Velthuis', text)

    # Delete the marker characters in one pass over the WHOLE string. The
    # previous index loop iterated over range(len - 1) and therefore never
    # examined the final character, leaving a trailing '.', '*' or '"' behind.
    unwanted = str.maketrans('', '', '.*"')
    return latin_text.translate(unwanted).lower()
93
 
def transliterate_to_sinhala(text):
    """Transliterate Velthuis ``text`` to Sinhala script.

    Returns the result of ``transliterate.process`` for the
    'Velthuis' -> 'Sinhala' conversion unchanged.
    """
    return transliterate.process('Velthuis', 'Sinhala', text)
103
 
104
  # Placeholder for conversation model loading and pipeline setup
105
  # pipe1 = pipeline("text-generation", model="microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)