"""Translate SBV subtitle files into multiple languages with the OpenAI chat API."""
import os | |
import re | |
from openai import OpenAI | |
# Shared OpenAI client used for every translation request; the API key is
# taken from the OPENAI_KEY environment variable.
client = OpenAI(
    api_key=os.environ.get('OPENAI_KEY'),
)
from tqdm import tqdm | |
import argparse | |
# The OpenAI API key is read from the OPENAI_KEY environment variable.
# Target languages offered on the command line (also the default selection).
languages = [
    "Spanish",
    "French",
    "German",
    "Italian",
    "Portuguese",
    "Arabic",
    "Japanese",
    "Indonesian",
    "Swedish",
    "Danish",
    "Korean",
    "Polish",
    "Thai",
    "Chinese",
    "Vietnamese",
]
def read_sbv_file(file_path):
    """Read an SBV subtitle file and return its lines.

    The file is decoded with ``utf-8-sig`` so that a leading UTF-8 byte-order
    mark, if present, is dropped instead of being glued to the first line
    (where it would stop the first timestamp from being recognised). Files
    without a BOM are decoded exactly as plain UTF-8.

    Args:
        file_path (str): Path to the SBV file.

    Returns:
        list[str]: The lines of the file, each keeping its line ending.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.readlines()
def write_sbv_file(file_path, lines):
    """Write ``lines`` to ``file_path`` as UTF-8 text, replacing any existing file.

    Args:
        file_path (str): Destination path.
        lines (Iterable[str]): Strings to write in order; no line endings are
            added, so each string must carry its own.
    """
    with open(file_path, 'w', encoding='utf-8') as sink:
        for chunk in lines:
            sink.write(chunk)
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` using the OpenAI chat API.

    Sends a single chat-completion request to the ``gpt-4o-mini`` model through
    the module-level ``client``. The system prompt asks for a translation that
    preserves the original formatting and line breaks.

    Args:
        text (str): Text to translate.
        target_language (str): Name of the language to translate into.

    Returns:
        str: Content of the first returned choice, or an empty string when the
        response message carries no content.
    """
    system_prompt = f"You are a professional translator. Translate the following text to {target_language}. Preserve the original formatting and line breaks."
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
    )
    content = response.choices[0].message.content
    # The message content is optional in the API response; always hand callers
    # a string so they never have to deal with None.
    return content if content is not None else ""
def get_base_filename(input_file):
    """Return the file name of ``input_file`` without directory or last extension.

    Args:
        input_file (str): Path to a file.

    Returns:
        str: The final path component with its last extension removed.
    """
    file_name = os.path.basename(input_file)
    stem, _extension = os.path.splitext(file_name)
    return stem
def create_output_directory(output_dir):
    """Create ``output_dir`` (and any missing parents) if it does not exist yet.

    Args:
        output_dir (str): Directory path to create.
    """
    # exist_ok=True makes repeated calls for the same directory harmless.
    os.makedirs(name=output_dir, exist_ok=True)
def parse_sbv_file(lines):
    """Split the lines of an SBV file into caption blocks.

    Args:
        lines (list[str]): Lines of the SBV file.

    Returns:
        list[dict]: One ``{'timestamp': ..., 'text': ...}`` dict per recognised
        timestamp line, in file order. Lines that do not start a block are
        skipped.
    """
    parsed = []
    cursor = 0
    total = len(lines)
    while cursor < total:
        stamp, caption, cursor = parse_block(lines, cursor)
        if not stamp:
            # Not a timestamp line; parse_block already advanced the cursor.
            continue
        parsed.append({'timestamp': stamp, 'text': caption})
    return parsed
def parse_block(lines, i):
    """Try to parse one caption block starting at ``lines[i]``.

    Args:
        lines (list[str]): Lines of the SBV file.
        i (int): Index of the line to inspect.

    Returns:
        tuple: ``(timestamp, text, next_index)`` when ``lines[i]`` is a
        timestamp line of the form ``H:MM:SS.mmm,H:MM:SS.mmm``; otherwise
        ``(None, None, i + 1)``.
    """
    candidate = lines[i].strip()
    timestamp_pattern = r'^\d{1,2}:\d{2}:\d{2}\.\d{3},\d{1,2}:\d{2}:\d{2}\.\d{3}$'
    if not re.match(timestamp_pattern, candidate):
        return None, None, i + 1
    # The caption text starts on the line right after the timestamp.
    caption, next_index = parse_text_lines(lines, i + 1)
    return candidate, caption, next_index
def parse_text_lines(lines, i):
    """Collect the caption text that starts at ``lines[i]``.

    Consecutive non-blank lines are gathered as the caption, then any blank
    lines that follow are skipped.

    Args:
        lines (list[str]): Lines of the SBV file.
        i (int): Index of the first candidate text line.

    Returns:
        tuple[str, int]: The joined caption text with surrounding whitespace
        stripped, and the index of the first line after the trailing blanks.
    """
    total = len(lines)
    collected = []
    position = i
    while position < total and lines[position].strip():
        collected.append(lines[position])
        position += 1
    # Skip the blank separator line(s) between blocks.
    while position < total and not lines[position].strip():
        position += 1
    caption = ''.join(collected).strip()
    return caption, position
def translate_blocks(blocks, target_language):
    """Translate every caption block, showing a progress bar.

    Args:
        blocks (list[dict]): Blocks with ``'timestamp'`` and ``'text'`` keys.
        target_language (str): Name of the language to translate into.

    Returns:
        list[dict]: Translated blocks in the same order as ``blocks``.
    """
    translated = []
    progress = tqdm(blocks, desc=f"Translating blocks for {target_language}")
    for block in progress:
        translated.append(translate_block(block, target_language))
    return translated
def translate_block(block, target_language):
    """Translate the text of one caption block, keeping its timestamp.

    Args:
        block (dict): Block with ``'timestamp'`` and ``'text'`` keys.
        target_language (str): Name of the language to translate into.

    Returns:
        dict: New block with the same timestamp and the translated text; the
        text is ``''`` when the source block has no text.
    """
    source_text = block['text']
    if not source_text:
        # Nothing to translate, so no API call is made.
        return {'timestamp': block['timestamp'], 'text': ''}
    return {
        'timestamp': block['timestamp'],
        'text': translate_text(source_text, target_language),
    }
def reconstruct_sbv_content(blocks):
    """Turn caption blocks back into the lines of an SBV file.

    Each block becomes its timestamp line, its caption text (if any) and one
    blank separator line. Blank lines inside the caption text are dropped,
    because this file's parser treats a blank line as the end of a caption and
    would lose everything after it when the file is read back.

    Args:
        blocks (list[dict]): Blocks with ``'timestamp'`` and ``'text'`` keys.

    Returns:
        list[str]: Output lines, each ending with a newline, ready to be
        written to a file.
    """
    output_lines = []
    for block in blocks:
        output_lines.append(block['timestamp'] + '\n')
        # Keep only lines with visible content; `or ''` tolerates a None text.
        caption_lines = [
            line for line in (block['text'] or '').splitlines() if line.strip()
        ]
        if caption_lines:
            output_lines.append('\n'.join(caption_lines) + '\n')
        output_lines.append('\n')
    return output_lines
def get_output_file_path(output_dir, base_filename, lang):
    """Build the path of the translated file for one language.

    Args:
        output_dir (str): Directory that will hold the file.
        base_filename (str): Input file name without extension.
        lang (str): Target language name, used as the file-name suffix.

    Returns:
        str: ``<output_dir>/<base_filename>_<lang>.sbv``.
    """
    file_name = f"{base_filename}_{lang}.sbv"
    return os.path.join(output_dir, file_name)
def translate_to_language(input_file, output_dir, language):
    """Translate one SBV subtitle file into a single language.

    Reads and parses the input file, translates every caption block, and
    writes the result to ``<output_dir>/<input name>_<language>.sbv``.
    Progress messages are printed to standard output.

    Args:
        input_file (str): Path to the input SBV file.
        output_dir (str): Directory where the translated file is saved; it is
            created if missing.
        language (str): Target language for translation.

    Returns:
        str: Path to the translated output file.
    """
    source_lines = read_sbv_file(input_file)
    stem = get_base_filename(input_file)
    create_output_directory(output_dir)
    destination = get_output_file_path(output_dir, stem, language)

    print(f"\nStarting translation for language: {language}")
    translated = translate_blocks(parse_sbv_file(source_lines), language)

    write_sbv_file(destination, reconstruct_sbv_content(translated))
    print(f"Translated {language} file saved: {destination}")
    return destination
def translate_to_languages(input_file, output_dir, selected_languages):
    """Translate one SBV subtitle file into several languages.

    Languages are processed one after another with a progress bar; each one
    produces its own output file in ``output_dir``.

    Args:
        input_file (str): Path to the input SBV file.
        output_dir (str): Directory where translated files are saved; it is
            created if missing.
        selected_languages (list): Target languages for translation.

    Returns:
        list: Paths to the translated output files, in the order of
        ``selected_languages``.
    """
    create_output_directory(output_dir)
    return [
        translate_to_language(input_file, output_dir, lang)
        for lang in tqdm(selected_languages, desc="Translating languages")
    ]
# Command-line entry point: translate one .sbv file into the chosen languages.
if __name__ == "__main__":
    cli = argparse.ArgumentParser(
        description="Translate SBV subtitle files to multiple languages.",
    )
    cli.add_argument("input_file", help="Path to the input .sbv file")
    cli.add_argument("output_dir", help="Path to the output directory")
    cli.add_argument(
        "--languages",
        nargs="+",
        choices=languages,
        default=languages,  # translate into every supported language by default
        help="Languages to translate to",
    )
    options = cli.parse_args()

    outputs = translate_to_languages(
        options.input_file, options.output_dir, options.languages
    )
    print(f"\nTranslation completed. {len(outputs)} files created.")