arsath-sm committed on
Commit
dd17995
·
verified ·
1 Parent(s): 3a03290

Upload 3 files

Browse files
Files changed (3) hide show
  1. .env +1 -0
  2. requirements.txt +2 -0
  3. translator2.py +264 -0
.env ADDED
@@ -0,0 +1 @@
 
 
1
+ GROQ_API_KEY="your_groq_api_key_here"
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ groq
2
+ streamlit
translator2.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+ from dotenv import load_dotenv
4
+ from groq import Groq
5
+ import json
6
+ from typing import List, Dict
7
+ import time
8
+
9
# Pull settings from a local .env file (if present) into the process environment.
load_dotenv()

# Shared Groq API client, authenticated with the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
14
+
15
class TranslationManager:
    """Split English text into chunks and translate each chunk to Tamil.

    Translation requests are sent through the module-level Groq ``client``.
    In contextual mode the most recent translations are remembered so each
    prompt can refer to the previous one.
    """

    # Model identifier sent with every completion request.
    MODEL = "Gemma2-9b-It"
    # Number of previous translations kept in ``context_window``.
    MAX_CONTEXT_ENTRIES = 3
    # Attempts per chunk before the last error is propagated to the caller.
    MAX_RETRIES = 3
    # Seconds to wait between failed attempts.
    RETRY_DELAY_SECONDS = 2

    def __init__(self, chunk_size: int = 1500, overlap_size: int = 200):
        """Create a manager.

        Args:
            chunk_size: Approximate chunk length in characters; every word
                counts as its length plus one separator.
            overlap_size: Number of *words* of look-ahead text attached to
                each chunk as context.
        """
        self.chunk_size = chunk_size
        self.overlap_size = overlap_size
        # Most recent translations, oldest first (used by contextual mode).
        self.context_window = []

    def chunk_text_with_context(self, text: str) -> List[Dict]:
        """Split ``text`` into consecutive, non-overlapping chunks.

        Chunks are cut on whitespace-separated word boundaries once the
        accumulated length reaches ``chunk_size``. The words that follow a
        chunk are attached as look-ahead context only; they are never part of
        ``main_text``, so no word is translated twice and joining the
        ``main_text`` values reproduces the input words exactly.

        Args:
            text: The English text to split.

        Returns:
            A list of dicts with the keys ``main_text`` (words to translate),
            ``overlap_text`` (up to ``overlap_size`` following words, or an
            empty string) and ``position`` (zero-based chunk index). Empty or
            whitespace-only input yields an empty list.
        """
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for i, word in enumerate(words):
            current_chunk.append(word)
            current_length += len(word) + 1

            if current_length >= self.chunk_size:
                # Slicing past the end of the list simply yields fewer (or
                # no) words, so no explicit bounds check is required.
                lookahead = words[i + 1:i + 1 + self.overlap_size]
                chunks.append({
                    'main_text': ' '.join(current_chunk),
                    'overlap_text': ' '.join(lookahead),
                    'position': len(chunks),
                })
                # Start the next chunk empty: re-seeding it with earlier
                # words would duplicate them in the combined translation.
                current_chunk = []
                current_length = 0

        # Whatever is left over becomes the final (shorter) chunk.
        if current_chunk:
            chunks.append({
                'main_text': ' '.join(current_chunk),
                'overlap_text': '',
                'position': len(chunks),
            })

        return chunks

    def create_translation_prompt(self, chunk: Dict, mode: str, domain: str = None) -> str:
        """Build the prompt for one chunk.

        Args:
            chunk: A chunk dict as produced by ``chunk_text_with_context``.
            mode: ``"normal"`` for a plain translation prompt; any other
                value selects the contextual prompt.
            domain: Optional subject domain mentioned in contextual prompts.

        Returns:
            The prompt text to send as the user message.
        """
        if mode == "normal":
            return "\n".join([
                "Translate the following English text to Tamil.",
                "Provide only the Tamil translation without any other text.",
                "",
                f"English text: {chunk['main_text']}",
            ])

        # Contextual mode: refer to the most recent translation, if any.
        previous_context = self.context_window[-1] if self.context_window else ""

        lines = [
            "Perform a contextual translation from English to Tamil.",
            "Consider the following aspects:",
        ]
        if domain:
            lines.append(f"Domain: {domain}")
        lines += [
            f"Previous context: {previous_context}",
            "",
            "Maintain the following in your translation:",
            "- Preserve domain-specific terminology",
            "- Maintain consistent style and tone",
            "- Ensure contextual coherence with previous translations",
            "- Adapt idiomatic expressions appropriately",
            "",
            f"Text to translate: {chunk['main_text']}",
            "",
        ]
        overlap_text = chunk.get('overlap_text', '')
        if overlap_text:
            # The look-ahead belongs to the next chunk; say so explicitly so
            # it is not translated here as well.
            lines += [
                f"Upcoming text (for context only, do not translate): {overlap_text}",
                "",
            ]
        lines.append("Provide only the Tamil translation without any explanations.")

        return "\n".join(lines)

    def translate_chunk(self, chunk: Dict, mode: str, domain: str = None) -> str:
        """Translate a single chunk, retrying on failure.

        Args:
            chunk: A chunk dict as produced by ``chunk_text_with_context``.
            mode: ``"normal"`` or ``"contextual"``.
            domain: Optional subject domain for contextual prompts.

        Returns:
            The translated text assembled from the streamed response.

        Raises:
            Exception: Whatever the final attempt raised, once
                ``MAX_RETRIES`` attempts have failed.
        """
        prompt = self.create_translation_prompt(chunk, mode, domain)

        for attempt in range(self.MAX_RETRIES):
            try:
                completion = client.chat.completions.create(
                    model=self.MODEL,
                    messages=[
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    temperature=0.3 if mode == "normal" else 0.4,
                    max_tokens=2048,
                    top_p=1,
                    stream=True,
                    stop=None,
                )
                # The response is streamed; a piece may carry no content.
                translation = "".join(
                    piece.choices[0].delta.content or "" for piece in completion
                )
            except Exception:
                # Broad on purpose: any failure of the remote call is retried.
                if attempt == self.MAX_RETRIES - 1:
                    raise
                time.sleep(self.RETRY_DELAY_SECONDS)  # Wait before retry
                continue

            # Remember successful contextual translations, newest last.
            if mode == "contextual":
                self.context_window.append(translation)
                if len(self.context_window) > self.MAX_CONTEXT_ENTRIES:
                    self.context_window.pop(0)

            return translation

        # Only reachable if MAX_RETRIES is set to zero or less.
        return ""
127
+
128
def _preview(text, limit=100):
    """Return ``text`` cut to ``limit`` characters, with "..." if shortened."""
    return text[:limit] + "..." if len(text) > limit else text


def _translate_text(manager, text, mode, domain):
    """Translate ``text`` chunk by chunk while showing progress widgets.

    The progress widgets are created before the ``try`` so the cleanup in
    ``finally`` can never refer to a name that was not bound.

    Returns the chunk translations joined with single spaces. Exceptions
    raised while translating propagate to the caller.
    """
    progress_bar = st.progress(0)
    status_text = st.empty()
    try:
        # Reset context window for new translation
        manager.context_window = []

        chunks = manager.chunk_text_with_context(text)
        translated_chunks = []
        for i, chunk in enumerate(chunks):
            status_text.text(f"Translating part {i+1} of {len(chunks)}...")
            translated_chunks.append(
                manager.translate_chunk(chunk, mode=mode.lower(), domain=domain)
            )
            progress_bar.progress((i + 1) / len(chunks))

        return ' '.join(translated_chunks)
    finally:
        progress_bar.empty()
        status_text.empty()


def _render_result(entry):
    """Show one translation entry together with its download options."""
    col1, col2 = st.columns(2)

    with col1:
        st.subheader("Original Text")
        st.write(entry['english'])
        st.info(f"Word count: {len(entry['english'].split())}")

    with col2:
        st.subheader("Tamil Translation")
        st.write(entry['tamil'])

    # Download options
    col1, col2 = st.columns(2)
    with col1:
        st.download_button(
            "Download Translation",
            entry['tamil'],
            file_name=f"tamil_translation_{entry['mode'].lower()}.txt",
            mime="text/plain"
        )

    with col2:
        # Export translation with metadata
        export_data = {
            'original': entry['english'],
            'translation': entry['tamil'],
            'mode': entry['mode'],
            'domain': entry['domain'],
            'timestamp': entry['timestamp']
        }
        st.download_button(
            "Export with Metadata",
            # ensure_ascii=False keeps the Tamil text readable in the file
            # instead of writing it as \uXXXX escape sequences.
            json.dumps(export_data, indent=2, ensure_ascii=False),
            file_name="translation_with_metadata.json",
            mime="application/json"
        )


def _render_history(history):
    """Show the five most recent history entries, newest first."""
    if not history:
        return

    with st.expander("Translation History"):
        for i, entry in enumerate(reversed(history[-5:])):
            st.write(f"Translation {len(history)-i}")
            st.write(f"Mode: {entry['mode']}")
            if entry['domain'] != "N/A":
                st.write(f"Domain: {entry['domain']}")
            st.write(f"Timestamp: {entry['timestamp']}")
            st.write("English:", _preview(entry['english']))
            st.write("Tamil:", _preview(entry['tamil']))
            st.markdown("---")


def main():
    """Render the Streamlit page: settings, input, latest result and history."""
    st.set_page_config(page_title="Advanced Tamil Translator", layout="wide")

    # Per-session state: the translation manager, the history of finished
    # translations and the result currently shown on the page.
    if 'translation_manager' not in st.session_state:
        st.session_state.translation_manager = TranslationManager()

    if 'translation_history' not in st.session_state:
        st.session_state.translation_history = []

    if 'last_result' not in st.session_state:
        st.session_state.last_result = None

    st.title("Advanced English to Tamil Translator")

    # Translation settings
    domain = None  # stays None unless contextual mode offers a choice
    with st.expander("Translation Settings", expanded=True):
        col1, col2 = st.columns(2)
        with col1:
            translation_mode = st.radio(
                "Translation Mode",
                ["Normal", "Contextual"],
                help="Normal: Direct translation\nContextual: Context-aware translation with domain specificity"
            )

        with col2:
            if translation_mode == "Contextual":
                domain = st.selectbox(
                    "Select Domain",
                    ["General", "Technical", "Medical", "Legal", "Literary", "Business", "Academic"],
                    help="Select the domain to improve translation accuracy"
                )

    # Input area
    st.subheader("Enter Text")
    english_input = st.text_area("Enter English text of any length:", height=200)

    # Translation button
    if st.button("Translate"):
        # A new request replaces whatever result was shown before.
        st.session_state.last_result = None

        if not english_input.strip():
            st.error("Please enter some text to translate.")
        else:
            try:
                final_translation = _translate_text(
                    st.session_state.translation_manager,
                    english_input,
                    translation_mode,
                    domain,
                )
            except Exception as e:
                st.error(f"An error occurred: {str(e)}")
            else:
                entry = {
                    'english': english_input,
                    'tamil': final_translation,
                    'mode': translation_mode,
                    'domain': domain if domain is not None else "N/A",
                    'timestamp': time.strftime("%Y-%m-%d %H:%M:%S")
                }
                st.session_state.translation_history.append(entry)
                st.session_state.last_result = entry

    # The result is rendered from session state rather than inside the button
    # branch: clicking a download button reruns the script with the Translate
    # button reporting False, which would otherwise make the result vanish.
    if st.session_state.last_result is not None:
        _render_result(st.session_state.last_result)

    # Translation History
    _render_history(st.session_state.translation_history)


if __name__ == "__main__":
    main()