# yt_subtitle_translator / translator.py
# Dien-Hoa
# refactor translate 1 and multi languages
# fa7018d
import os
import re
from openai import OpenAI
# Module-level OpenAI client used by translate_text.
# Set your OpenAI API key in the OPENAI_KEY environment variable before running;
# os.environ.get returns None when the variable is not set.
client = OpenAI(api_key=os.environ.get('OPENAI_KEY'))
from tqdm import tqdm
import argparse
# Target languages offered by the command-line interface: they are the allowed
# values for --languages and also the default when that option is omitted.
languages = [
    "Spanish",
    "French",
    "German",
    "Italian",
    "Portuguese",
    "Arabic",
    "Japanese",
    "Indonesian",
    "Swedish",
    "Danish",
    "Korean",
    "Polish",
    "Thai",
    "Chinese",
    "Vietnamese",
]
def read_sbv_file(file_path):
    """
    Read an SBV subtitle file and return its lines.

    The file is decoded with 'utf-8-sig': plain UTF-8 is read unchanged, and a
    leading byte-order mark (BOM) is dropped when present. With plain 'utf-8'
    the BOM would stay attached to the first line as U+FEFF, which str.strip()
    does not remove, so the first timestamp line would not match the timestamp
    pattern used by parse_block and the first cue's timestamp would go
    unrecognised.

    Args:
        file_path (str): Path to the SBV file.

    Returns:
        list: The lines of the file, each keeping its line ending.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.readlines()
def write_sbv_file(file_path, lines):
    """
    Write subtitle lines to a file as UTF-8, replacing any existing content.

    Args:
        file_path (str): Destination path.
        lines (iterable): Strings to write one after another; no separator is
            added, so each string must carry its own line ending.
    """
    with open(file_path, mode='w', encoding='utf-8') as handle:
        for chunk in lines:
            handle.write(chunk)
def translate_text(text, target_language):
    """
    Translate text into the target language through the OpenAI chat API.

    Sends a system instruction naming the target language plus the text as the
    user message to the "gpt-4o-mini" model, using the module-level client.

    Args:
        text (str): Text to translate.
        target_language (str): Language name inserted into the system prompt.

    Returns:
        The message content of the first choice in the API response.
    """
    system_prompt = f"You are a professional translator. Translate the following text to {target_language}. Preserve the original formatting and line breaks."
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
def get_base_filename(input_file):
    """
    Return the file name of a path without its directory and last extension.

    Args:
        input_file (str): Path to a file.

    Returns:
        str: The final path component with its last extension removed.
    """
    filename = os.path.basename(input_file)
    stem, _extension = os.path.splitext(filename)
    return stem
def create_output_directory(output_dir):
    """
    Make sure the output directory exists.

    Missing intermediate directories are created as well; a directory that
    already exists is not an error.

    Args:
        output_dir (str): Directory path to create.
    """
    os.makedirs(name=output_dir, exist_ok=True)
def parse_sbv_file(lines):
    """
    Split the lines of an SBV file into subtitle blocks.

    Walks through the lines with parse_block; every position that yields a
    timestamp becomes one block, anything else is skipped.

    Args:
        lines (list): Lines of the SBV file.

    Returns:
        list: Dicts with the keys 'timestamp' and 'text', in file order.
    """
    parsed = []
    position = 0
    total = len(lines)
    while position < total:
        stamp, body, position = parse_block(lines, position)
        if not stamp:
            continue
        parsed.append({'timestamp': stamp, 'text': body})
    return parsed
def parse_block(lines, i):
    """
    Try to parse one subtitle block starting at index i.

    Args:
        lines (list): Lines of the SBV file.
        i (int): Index of the line to inspect.

    Returns:
        tuple: (timestamp, text, next_index) when the stripped line at i is a
        "start,end" timestamp pair; otherwise (None, None, i + 1).
    """
    candidate = lines[i].strip()
    is_timestamp = re.match(r'^\d{1,2}:\d{2}:\d{2}\.\d{3},\d{1,2}:\d{2}:\d{2}\.\d{3}$', candidate)
    if not is_timestamp:
        # Not a timestamp line: report nothing and move on by one line.
        return None, None, i + 1
    body, next_index = parse_text_lines(lines, i + 1)
    return candidate, body, next_index
def parse_text_lines(lines, i):
    """
    Collect the text lines of a block and skip the blank lines after it.

    Args:
        lines (list): Lines of the SBV file.
        i (int): Index of the first candidate text line.

    Returns:
        tuple: (text, next_index) where text is the collected lines joined
        together and stripped of surrounding whitespace, and next_index is the
        index of the first non-blank line after the blank run (or len(lines)).
    """
    cursor = i
    total = len(lines)
    collected = []
    # Gather consecutive non-blank lines; whitespace-only lines count as blank.
    while cursor < total:
        current = lines[cursor]
        if not current.strip():
            break
        collected.append(current)
        cursor += 1
    # Step over the blank separator lines that follow the block.
    while cursor < total and not lines[cursor].strip():
        cursor += 1
    return ''.join(collected).strip(), cursor
def translate_blocks(blocks, target_language):
    """
    Translate every subtitle block, showing a progress bar.

    Args:
        blocks (list): Blocks with the keys 'timestamp' and 'text'.
        target_language (str): Target language for translation.

    Returns:
        list: One translated block per input block, in the same order.
    """
    progress = tqdm(blocks, desc=f"Translating blocks for {target_language}")
    translated = []
    for entry in progress:
        translated.append(translate_block(entry, target_language))
    return translated
def translate_block(block, target_language):
    """
    Translate the text of a single subtitle block.

    Args:
        block (dict): Block with the keys 'timestamp' and 'text'.
        target_language (str): Target language for translation.

    Returns:
        dict: A new block with the same timestamp and the translated text;
        a block whose text is empty gets '' without calling the translator.
    """
    source_text = block['text']
    if source_text:
        rendered = translate_text(source_text, target_language)
    else:
        rendered = ''
    return {'timestamp': block['timestamp'], 'text': rendered}
def reconstruct_sbv_content(blocks):
    """
    Turn subtitle blocks back into the lines of an SBV file.

    Each block produces its timestamp line, its text line(s) when the text is
    non-empty, and a blank separator line.

    Args:
        blocks (list): Blocks with the keys 'timestamp' and 'text'.

    Returns:
        list: Newline-terminated strings ready to be written to a file.
    """
    rendered = []
    for entry in blocks:
        stamp_line = entry['timestamp'] + '\n'
        body = entry['text']
        if body:
            rendered.extend((stamp_line, body + '\n', '\n'))
        else:
            rendered.extend((stamp_line, '\n'))
    return rendered
def get_output_file_path(output_dir, base_filename, lang):
    """
    Build the path of the translated file for one language.

    Args:
        output_dir (str): Directory that will hold the file.
        base_filename (str): Name of the source file without extension.
        lang (str): Target language, used as the file name suffix.

    Returns:
        str: output_dir joined with "<base_filename>_<lang>.sbv".
    """
    filename = f"{base_filename}_{lang}.sbv"
    return os.path.join(output_dir, filename)
def translate_to_language(input_file, output_dir, language):
    """
    Translate an SBV subtitle file into one language and save the result.

    The input is read and parsed into blocks, each block is translated, and
    the rebuilt SBV content is written to
    "<output_dir>/<input base name>_<language>.sbv". Progress messages are
    printed before and after the translation.

    Args:
        input_file (str): Path to the input SBV file.
        output_dir (str): Directory where the translated file is saved; it is
            created if it does not exist.
        language (str): Target language for translation.

    Returns:
        str: Path to the translated output file.
    """
    source_lines = read_sbv_file(input_file)
    stem = get_base_filename(input_file)
    create_output_directory(output_dir)
    parsed_blocks = parse_sbv_file(source_lines)

    print(f"\nStarting translation for language: {language}")
    translated = translate_blocks(parsed_blocks, language)

    destination = get_output_file_path(output_dir, stem, language)
    write_sbv_file(destination, reconstruct_sbv_content(translated))
    print(f"Translated {language} file saved: {destination}")
    return destination
def translate_to_languages(input_file, output_dir, selected_languages):
    """
    Translate an SBV subtitle file into several languages, one after another.

    Args:
        input_file (str): Path to the input SBV file.
        output_dir (str): Directory where the translated files are saved; it
            is created if it does not exist.
        selected_languages (list): Target languages for translation.

    Returns:
        list: Paths to the translated output files, in the order of
        selected_languages.
    """
    create_output_directory(output_dir)
    progress = tqdm(selected_languages, desc="Translating languages")
    return [
        translate_to_language(input_file, output_dir, target)
        for target in progress
    ]
# Command-line entry point: translate one .sbv file into the requested
# languages (all supported languages when --languages is not given).
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Translate SBV subtitle files to multiple languages."
    )
    cli.add_argument("input_file", help="Path to the input .sbv file")
    cli.add_argument("output_dir", help="Path to the output directory")
    cli.add_argument(
        "--languages",
        nargs="+",
        choices=languages,
        default=languages,
        help="Languages to translate to",
    )
    options = cli.parse_args()

    created = translate_to_languages(
        options.input_file, options.output_dir, options.languages
    )
    print(f"\nTranslation completed. {len(created)} files created.")