File size: 2,745 Bytes
4913387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8dfbb39
 
 
 
 
 
 
 
 
 
 
4913387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef05987
4913387
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
"""
Split text to sentences.

Modified from seg_text.seg_text.py, vtext sentence_splitter removed, use

Use sentence_splitter if supported,
else use polyglot.text.Text

!apt install libicu-dev
!pip install pyicu pycld2
!pip install polyglot sentence_splitter

Use vtext and fastlid to rid of polyglot?

from vtext.tokenize_sentence import UnicodeSentenceTokenizer, PunctuationTokenizer
tok = UnicodeSentenceTokenizer()
seg = tok.tokenize(''' Text ''') for langs not in LANG_S

"""
# pylint: disable=invalid-name

import re
from typing import List, Optional, Union

import pysbd
from fastlid import fastlid
from loguru import logger
from tqdm.auto import tqdm


def _seg_text(
    text: str,
    lang: Optional[str] = None,
) -> List[str]:
    """
    Split text to sentences.

    Switched to pysbd

    Args:
    ----
    text: string to split
    lang: language, two-letter ISO (22 languages)

    Returns:
    -------
    List of segmented sentences

    """
    if lang is None:
        try:
            lang, _ = fastlid(text)
        except Exception as exc:
            logger.warning(" fastlid: %s, setting lang='en'", exc)
            lang = "en"

    if not text.strip():
        return []

    # pysbd only understands {'ja', 'am', 'bg', 'ur', 'hi', 'de', 'da', 'fr', 'el', 'fa', 'ru', 'ar', 'my', 'kk', 'pl', 'sk', 'en', 'hy', 'zh', 'mr', 'nl', 'it', 'es'}, 23
    try:
        seg = pysbd.Segmenter(language=lang, clean=True)
    except Exception as exc:
        # fall back to 'en'
        logger.error(exc)
        logger.warning(
            f" pysbd probably does not understand {lang} "
            "fall back to 'en'"
        )
        seg = pysbd.Segmenter(language="en", clean=True)

    try:
        # _ = tok.tokenize(text)
        _ = seg.segment(text)
    except Exception as exc:
        logger.exception(f"pysbd.Segmenter, {exc=}")
        raise
    return _


def seg_text(
    lst: Union[str, List[str]],
    lang: Optional[str] = None,
    maxlines: int = 1000,
    extra: Optional[str] = None,
) -> List[str]:
    """Segment one text, or each text in a list, into sentences.

    Arguments:
        lst: a single text or a list of texts
        lang: optional language code, handed to _seg_text for every text
        maxlines: (default 1000) accepted for compatibility; this function
            does not read it
        extra: optional regex pattern; a newline is inserted after every
            match in each text before it is segmented
    Returns:
        flat list with the sentences of all texts, in input order.
    """
    texts = [lst] if isinstance(lst, str) else lst

    if extra:
        # keep the matched text (group 1) and append a newline to it
        pattern = rf"({extra})"
        texts = [re.sub(pattern, r"\1\n", text) for text in texts]

    # flatten the per-text sentence lists into one list
    return [
        sentence
        for text in texts
        for sentence in _seg_text(text, lang=lang)
    ]