File size: 1,589 Bytes
a233921
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re

def read_srt_text(file_path: str) -> str:
    """
    Read an SRT file and extract only the text content, ignoring timestamps.

    The file is split into cue blocks on runs of blank (or whitespace-only)
    lines. Within each block the first two lines are treated as the cue
    index and the timestamp line and are discarded; the remaining lines are
    the cue text.

    Args:
        file_path (str): Path to the SRT file

    Returns:
        str: Text of all cues joined by single spaces. Cues without any text
            lines are skipped. An empty file yields an empty string.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # Split on one-or-more blank lines rather than on exactly "\n\n":
    # with a literal "\n\n" split, an extra blank line (or a line holding
    # only spaces) between cues shifts the next block by one line and the
    # cue index / timestamp leaks into the extracted text.
    blocks = re.split(r"\n\s*\n", content.strip())

    text_lines = []
    for block in blocks:
        lines = block.split("\n")
        # lines[0] is the cue index, lines[1] the timestamp; the rest is text.
        text = " ".join(lines[2:])
        if text:
            text_lines.append(text)

    return " ".join(text_lines)

def preprocess_chinese_text(text: str, include_punctuation: bool = False) -> str:
    """
    Preprocess Chinese text for CER calculation.

    ASCII letters, ASCII digits and all whitespace are always removed.
    Unless ``include_punctuation`` is true, a fixed set of Chinese
    (full-width / CJK) and English (ASCII) punctuation marks is removed too.
    The remaining characters are returned separated by single spaces.

    Args:
        text (str): Input Chinese text
        include_punctuation (bool): Whether to include punctuation in the calculation

    Returns:
        str: Preprocessed text with characters separated by spaces
            (an empty string if nothing remains)
    """
    # Remove any English characters, numbers, and extra spaces
    text = re.sub(r"[a-zA-Z0-9\s]+", "", text)

    if not include_punctuation:
        # The full-width characters are spelled as escapes so the set cannot
        # silently degrade to its ASCII look-alikes if the file is re-encoded
        # or normalised.
        punctuation = (
            # Full-width comma, ideographic full stop, full-width ! ? : ;
            "\uff0c\u3002\uff01\uff1f\uff1a\uff1b"
            # Curly double and single quotation marks
            "\u201c\u201d\u2018\u2019"
            # Full-width parentheses, lenticular brackets, double angle
            # brackets, ideographic (enumeration) comma
            "\uff08\uff09\u3010\u3011\u300a\u300b\u3001"
            # ASCII punctuation
            ",.!?:;\"'()[]{}"
        )
        # Map every listed character to None, i.e. delete it in one pass.
        text = text.translate(str.maketrans("", "", punctuation))

    # A str is already iterable character by character.
    return " ".join(text)