File size: 2,426 Bytes
9ba7d3b
 
b3ef6db
 
 
 
 
 
9ba7d3b
b3ef6db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ba7d3b
 
 
 
 
 
 
 
 
 
b3ef6db
9ba7d3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import re

# Words for 0-19. Every non-empty entry carries a trailing space so pieces
# can be concatenated directly; index 0 is empty because a zero contributes
# no word.
one = [""] + [
    word + " "
    for word in (
        "one two three four five six seven eight nine ten "
        "eleven twelve thirteen fourteen fifteen sixteen "
        "seventeen eighteen nineteen"
    ).split()
]

# Words for the tens digit, indexed by that digit (index 2 -> "twenty ").
# Indices 0 and 1 are unused placeholders that keep the indexing direct.
ten = ["", ""] + [
    word + " "
    for word in "twenty thirty forty fifty sixty seventy eighty ninety".split()
]


def numToWords(n, s):
    """Spell out a one- or two-digit number and append a unit suffix.

    Args:
        n: The number to spell out. The lookup tables cover 0-99; the
            value is not validated here, and ``n >= 100`` raises
            ``IndexError`` because ``ten`` has only ten entries.
        s: Text appended after the words when ``n`` is non-zero
            (for example ``"thousand "``).

    Returns:
        The words for ``n`` followed by ``s``, or an empty string when
        ``n`` is zero.
    """
    # Values up to nineteen have their own word; larger ones are built
    # from a tens word plus a units word.
    words = one[n] if n <= 19 else ten[n // 10] + one[n % 10]

    # The suffix is only attached to a non-zero number.
    return words + s if n else words


def intToWord(n):
    """Spell out an integer in English words using Indian number grouping.

    The value is split into crore (10**7), lakh (10**5), thousand and
    hundred groups; for example ``12345`` becomes
    ``"twelve thousand three hundred and forty five"``.

    Args:
        n: An integer, or any value ``int()`` accepts (such as a string
            of digits).

    Returns:
        Lower-case words separated by single spaces, with no leading or
        trailing whitespace. Zero yields ``"zero"`` and negative values
        are prefixed with ``"minus "``.

    Raises:
        ValueError, TypeError: Propagated from ``int(n)`` when ``n``
            cannot be converted.
    """
    n = int(n)

    if n == 0:
        # Every group below is empty for zero, so name it explicitly
        # instead of returning an empty string.
        return "zero"
    if n < 0:
        # The group arithmetic below only makes sense for positive values.
        return "minus " + intToWord(-n)

    out = ""

    crores = n // 10000000
    if crores >= 100:
        # numToWords only covers 0-99, so a larger crore count is spelled
        # out by this function itself ("one hundred crore", ...).
        out += intToWord(crores) + " crore "
    else:
        out += numToWords(crores, "crore ")

    out += numToWords((n // 100000) % 100, "lakh ")
    out += numToWords((n // 1000) % 100, "thousand ")
    out += numToWords((n // 100) % 10, "hundred ")

    # "and" joins the final one- or two-digit part to the groups before it.
    if n > 100 and n % 100:
        out += "and "

    # Tens and ones places (if any).
    out += numToWords(n % 100, "")

    return out.strip()


def preprocess_text(text):
    """Normalise text into a list of upper-case tokens.

    The text is upper-cased, periods and commas are split off as separate
    tokens, every character other than ``A-Z``, digits, space, ``.``,
    ``,`` and ``^`` is dropped, and purely numeric tokens are replaced by
    their spelled-out words.

    Example:
        Input::

            Hello, World! This is a sample text with numbers 12345 and symbols #$%.

        Output::

            ['HELLO', ',', 'WORLD', 'THIS', 'IS', 'A', 'SAMPLE', 'TEXT',
             'WITH', 'NUMBERS', 'TWELVE', 'THOUSAND', 'THREE', 'HUNDRED',
             'AND', 'FORTY', 'FIVE', 'AND', 'SYMBOLS', '.']

    Args:
        text: The raw input string. ``None`` is treated like empty input.

    Returns:
        A list of tokens. Empty or whitespace-only input yields a
        single-element list holding a placeholder message.
    """
    # Check truthiness first so that None never reaches .isspace().
    if not text or text.isspace():
        # Placeholder shown to the user (Russian for "enter some text").
        return ['текст введи :(']

    text = text.upper()
    # Surround '.' and ',' with spaces so they become tokens of their own.
    text = re.sub(r'([.,])', r' \1 ', text)
    # Collapse whitespace *before* filtering: the filter below keeps only
    # the plain space, so a newline or tab between two words would
    # otherwise be deleted and glue the words together.
    text = re.sub(r'\s+', ' ', text)
    # NOTE(review): the second '^' inside the class is a literal caret, so
    # '^' characters survive the filter - confirm this is intended.
    text = re.sub(r'[^A-Z .,^0-9]', '', text)

    result = []
    for word in text.split():
        if word.isdigit():
            # Replace a number by its upper-cased words, one token each.
            result.extend(intToWord(word).upper().split())
        else:
            result.append(word)

    return result


if __name__ == "__main__":
    # Demo when run as a script: normalise one sample sentence and print
    # the resulting token list.
    sample_text = "Hello, World! This is a sample text with numbers 12345 and symbols #$%."
    print("Processed text:", preprocess_text(sample_text))