import re

# Separator between SRT cues: a blank line, tolerating runs of several blank
# lines and "blank" lines that only contain spaces or tabs.
_SRT_BLOCK_SEPARATOR = re.compile(r"\n\s*\n")

# Latin letters, digits and whitespace are dropped before CER scoring.
_NON_CHINESE_RE = re.compile(r"[a-zA-Z0-9\s]+")

# Deletion table for str.translate covering ASCII punctuation, CJK
# punctuation and the full-width / curly-quote forms common in Chinese text.
# The full-width characters are written as escapes so the set stays
# unambiguous regardless of editor or font.
_PUNCTUATION_TABLE = str.maketrans(
    "",
    "",
    ",.!?:;\"'()[]{}"
    "\u3002\u3001\u3010\u3011\u300a\u300b"  # 。 、 【 】 《 》
    "\uff0c\uff01\uff1f\uff1a\uff1b"  # full-width , ! ? : ;
    "\u201c\u201d\u2018\u2019"  # curly double / single quotes
    "\uff08\uff09",  # full-width ( )
)


def read_srt_text(file_path: str) -> str:
    """
    Read an SRT file and extract only the text content, ignoring timestamps.

    Each cue is expected to consist of an index line, a timestamp line and
    one or more text lines; the first two lines of every cue are discarded.
    Cues may be separated by one or more blank lines. Cues without any text
    contribute nothing to the result.

    Args:
        file_path (str): Path to the SRT file (read as UTF-8)

    Returns:
        str: Text lines of all cues joined by single spaces; an empty string
            if the file holds no cue text
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read().strip()

    if not content:
        return ""

    text_parts = []
    for block in _SRT_BLOCK_SEPARATOR.split(content):
        # Line 0 is the cue index, line 1 the timestamp; the rest is text.
        text = " ".join(block.split("\n")[2:])
        if text:
            text_parts.append(text)

    return " ".join(text_parts)


def preprocess_chinese_text(text: str, include_punctuation: bool = False) -> str:
    """
    Preprocess Chinese text for CER calculation.

    Latin letters, ASCII digits and all whitespace are always removed. Unless
    ``include_punctuation`` is true, ASCII punctuation and Chinese
    punctuation (including full-width forms and curly quotes) are removed as
    well.

    Args:
        text (str): Input Chinese text
        include_punctuation (bool): Whether to include punctuation in the
            calculation

    Returns:
        str: Preprocessed text with characters separated by spaces
    """
    text = _NON_CHINESE_RE.sub("", text)

    if not include_punctuation:
        text = text.translate(_PUNCTUATION_TABLE)

    # A str is already an iterable of characters, so join it directly.
    return " ".join(text)