# Spaces: Sleeping (page-capture residue; not part of the program)
import os | |
import streamlit as st | |
from dotenv import load_dotenv | |
from groq import Groq | |
import json | |
from typing import List, Dict | |
import time | |
# Read settings from the .env file into the process environment.
load_dotenv()

# Module-wide Groq client, keyed by the GROQ_API_KEY environment variable.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
class TranslationManager:
    """Chunk English text and translate each chunk to Tamil through Groq.

    The manager keeps a short rolling window of its own recent translations
    so that "contextual" mode can show the model what came before.
    """

    def __init__(self):
        # Soft size limit of one chunk, in characters. Each word is counted
        # as len(word) + 1 to account for the joining space.
        self.chunk_size = 1500
        # Size of the look-ahead exposed as 'overlap_text', in WORDS
        # (note: a different unit than chunk_size).
        self.overlap_size = 200
        # Most recent translations, oldest first; trimmed to three entries
        # by translate_chunk().
        self.context_window = []

    def chunk_text_with_context(self, text: str) -> List[Dict]:
        """Split text into consecutive, non-overlapping chunks.

        The text is split on whitespace and words are accumulated until the
        running character count reaches ``chunk_size``; runs of whitespace
        (including newlines) are therefore normalised to single spaces.

        Chunks are disjoint on purpose: the caller joins the per-chunk
        translations, so any word present in two chunks' ``main_text`` would
        be translated twice and duplicated in the final output.

        Args:
            text: The English text to split.

        Returns:
            A list of dicts, in document order, each with:
              - 'main_text': the words to translate, space-joined;
              - 'overlap_text': up to ``overlap_size`` words that follow the
                chunk (look-ahead context only), or '' for the last chunk;
              - 'position': the chunk's 0-based index.
            Empty or whitespace-only input yields an empty list.
        """
        words = text.split()
        chunks = []
        current_chunk = []
        current_length = 0

        for i, word in enumerate(words):
            current_chunk.append(word)
            current_length += len(word) + 1

            if current_length >= self.chunk_size:
                # Slicing past the end simply yields fewer (or no) words.
                overlap_words = words[i + 1:i + 1 + self.overlap_size]
                chunks.append({
                    'main_text': ' '.join(current_chunk),
                    'overlap_text': ' '.join(overlap_words),
                    'position': len(chunks),
                })
                # Start the next chunk empty so no word is translated twice.
                current_chunk = []
                current_length = 0

        # Whatever is left over forms the final (shorter) chunk.
        if current_chunk:
            chunks.append({
                'main_text': ' '.join(current_chunk),
                'overlap_text': '',
                'position': len(chunks),
            })

        return chunks

    def create_translation_prompt(self, chunk: Dict, mode: str, domain: str = None) -> str:
        """Build the LLM prompt for one chunk.

        Args:
            chunk: A dict with 'main_text' and, for contextual mode,
                'overlap_text' (as produced by chunk_text_with_context).
            mode: "normal" for a plain translation prompt; any other value
                selects the contextual prompt.
            domain: Optional subject domain named in the contextual prompt;
                ignored in normal mode.

        Returns:
            The prompt string to send as the user message.
        """
        if mode == "normal":
            return (
                "Translate the following English text to Tamil.\n"
                "Provide only the Tamil translation without any other text.\n"
                f"English text: {chunk['main_text']}"
            )

        # Contextual mode: include the domain (if any) and the most recent
        # translation so the model can stay consistent across chunks.
        context = f"Domain: {domain}\n" if domain else ""
        previous_context = self.context_window[-1] if self.context_window else ""
        return (
            "Perform a contextual translation from English to Tamil.\n"
            "Consider the following aspects:\n"
            f"{context}\n"
            f"Previous context: {previous_context}\n"
            "Maintain the following in your translation:\n"
            "- Preserve domain-specific terminology\n"
            "- Maintain consistent style and tone\n"
            "- Ensure contextual coherence with previous translations\n"
            "- Adapt idiomatic expressions appropriately\n"
            f"Text to translate: {chunk['main_text']}\n"
            f"Overlap context: {chunk['overlap_text']}\n"
            "Provide only the Tamil translation without any explanations."
        )

    def translate_chunk(self, chunk: Dict, mode: str, domain: str = None) -> str:
        """Translate a single chunk, retrying on failure.

        The request is streamed and the streamed pieces are concatenated.
        In "contextual" mode the result is appended to ``context_window``,
        which is then trimmed to its three most recent entries.

        Args:
            chunk: Chunk dict as produced by chunk_text_with_context.
            mode: "normal" or "contextual".
            domain: Optional domain forwarded to the prompt builder.

        Returns:
            The translated text.

        Raises:
            Exception: Whatever the final attempt raised, after three
                attempts with a 2-second pause between them.
        """
        prompt = self.create_translation_prompt(chunk, mode, domain)
        max_retries = 3

        for attempt in range(max_retries):
            try:
                completion = client.chat.completions.create(
                    model="Gemma2-9b-It",
                    messages=[
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                    temperature=0.3 if mode == "normal" else 0.4,
                    max_tokens=2048,
                    top_p=1,
                    stream=True,
                    stop=None,
                )
                # Stream pieces may carry no content; treat those as "".
                translation = "".join(
                    piece.choices[0].delta.content or ""
                    for piece in completion
                )
            except Exception:
                if attempt == max_retries - 1:
                    # Out of attempts: re-raise with the original traceback.
                    raise
                time.sleep(2)  # Wait before retry
                continue

            # Update context window for contextual translation
            if mode == "contextual":
                self.context_window.append(translation)
                # Keep only the three most recent translations.
                del self.context_window[:-3]
            return translation

        # Only reachable if max_retries were set to 0.
        return ""
def _preview(text, limit=100):
    """Return text unchanged, or its first `limit` characters plus "..." if longer."""
    return text[:limit] + "..." if len(text) > limit else text


def _translate_and_render(english_input, translation_mode, domain):
    """Translate the input chunk by chunk, show the result and record it in history."""
    # Whitespace-only input splits into zero chunks, so reject it up front
    # instead of recording an empty translation.
    if not english_input or not english_input.strip():
        st.error("Please enter some text to translate.")
        return

    manager = st.session_state.translation_manager

    # Create the progress widgets before the try block so the finally
    # clause never refers to names that were not bound.
    progress_bar = st.progress(0)
    status_text = st.empty()

    try:
        # Reset context window for new translation
        manager.context_window = []

        # Chunk the input text
        chunks = manager.chunk_text_with_context(english_input)
        translated_chunks = []

        # Translate each chunk
        for index, chunk in enumerate(chunks, start=1):
            status_text.text(f"Translating part {index} of {len(chunks)}...")
            translation = manager.translate_chunk(
                chunk,
                mode=translation_mode.lower(),
                domain=domain if translation_mode == "Contextual" else None
            )
            translated_chunks.append(translation)
            progress_bar.progress(index / len(chunks))

        # Combine translations
        final_translation = ' '.join(translated_chunks)

        # One timestamp and one domain label shared by the history entry
        # and the export, so the two records always agree.
        timestamp = time.strftime("%Y-%m-%d %H:%M:%S")
        domain_label = domain if translation_mode == "Contextual" else "N/A"

        # Display results
        col1, col2 = st.columns(2)
        with col1:
            st.subheader("Original Text")
            st.write(english_input)
            st.info(f"Word count: {len(english_input.split())}")
        with col2:
            st.subheader("Tamil Translation")
            st.write(final_translation)

        # Add to history
        st.session_state.translation_history.append({
            'english': english_input,
            'tamil': final_translation,
            'mode': translation_mode,
            'domain': domain_label,
            'timestamp': timestamp
        })

        # Download options
        col1, col2 = st.columns(2)
        with col1:
            st.download_button(
                "Download Translation",
                final_translation,
                file_name=f"tamil_translation_{translation_mode.lower()}.txt",
                mime="text/plain"
            )
        with col2:
            # Export translation with metadata
            export_data = {
                'original': english_input,
                'translation': final_translation,
                'mode': translation_mode,
                'domain': domain_label,
                'timestamp': timestamp
            }
            st.download_button(
                "Export with Metadata",
                # ensure_ascii=False keeps the Tamil text readable in the
                # file instead of turning it into \uXXXX escapes.
                json.dumps(export_data, indent=2, ensure_ascii=False),
                file_name="translation_with_metadata.json",
                mime="application/json"
            )
    except Exception as e:
        # Top-level UI boundary: surface any failure to the user.
        st.error(f"An error occurred: {str(e)}")
    finally:
        progress_bar.empty()
        status_text.empty()


def _render_history():
    """Show the five most recent translations, newest first, in an expander."""
    history = st.session_state.translation_history
    if not history:
        return

    with st.expander("Translation History"):
        for offset, entry in enumerate(reversed(history[-5:])):
            # Number each entry by its 1-based position in the full history.
            st.write(f"Translation {len(history) - offset}")
            st.write(f"Mode: {entry['mode']}")
            if entry['domain'] != "N/A":
                st.write(f"Domain: {entry['domain']}")
            st.write(f"Timestamp: {entry['timestamp']}")
            st.write("English:", _preview(entry['english']))
            st.write("Tamil:", _preview(entry['tamil']))
            st.markdown("---")


def main():
    """Render the Streamlit translator page.

    Sets up per-session state (a TranslationManager and a history list),
    draws the settings and input widgets, runs a translation when the
    "Translate" button is pressed, and finally shows recent history.
    """
    st.set_page_config(page_title="Advanced Tamil Translator", layout="wide")

    # Initialize translation manager
    if 'translation_manager' not in st.session_state:
        st.session_state.translation_manager = TranslationManager()
    if 'translation_history' not in st.session_state:
        st.session_state.translation_history = []

    st.title("Advanced English to Tamil Translator")

    # Translation settings
    domain = None  # only chosen in Contextual mode
    with st.expander("Translation Settings", expanded=True):
        col1, col2 = st.columns(2)
        with col1:
            translation_mode = st.radio(
                "Translation Mode",
                ["Normal", "Contextual"],
                help="Normal: Direct translation\nContextual: Context-aware translation with domain specificity"
            )
        with col2:
            if translation_mode == "Contextual":
                domain = st.selectbox(
                    "Select Domain",
                    ["General", "Technical", "Medical", "Legal", "Literary", "Business", "Academic"],
                    help="Select the domain to improve translation accuracy"
                )

    # Input area
    st.subheader("Enter Text")
    english_input = st.text_area("Enter English text of any length:", height=200)

    # Translation button
    if st.button("Translate"):
        _translate_and_render(english_input, translation_mode, domain)

    # Translation History (shown even when the input was rejected)
    _render_history()
# Launch the app only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()