cdactvm committed on
Commit
2b75669
·
verified ·
1 Parent(s): 02bd48a

Update text2int.py

Browse files
Files changed (1) hide show
  1. text2int.py +88 -102
text2int.py CHANGED
@@ -1,102 +1,88 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- # In[3]:
5
-
6
-
7
- import nbimporter
8
- from isNumber import is_number # Remove or replace this if unnecessary
9
-
10
- def text_to_int(textnum, numwords={}):
11
- # Define units, tens, and scales including "lac"
12
- units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
13
- 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
14
- 'sixteen', 'seventeen', 'eighteen', 'nineteen']
15
- tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
16
- scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion'] # "lac" added
17
- ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
18
- ordinal_endings = [('ieth', 'y'), ('th', '')]
19
-
20
- if not numwords:
21
- numwords['and'] = (1, 0) # Handle "one hundred and twenty"
22
-
23
- # Add units, tens, and scales to numwords
24
- for idx, word in enumerate(units):
25
- numwords[word] = (1, idx)
26
- for idx, word in enumerate(tens):
27
- numwords[word] = (1, idx * 10)
28
-
29
- for idx, word in enumerate(scales):
30
- numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0) # Handle "lac" as 10^5
31
-
32
- # Remove hyphens and normalize input
33
- textnum = textnum.replace('-', ' ')
34
-
35
- current = result = 0
36
- curstring = ''
37
- onnumber = False
38
- lastunit = False
39
- lastscale = False
40
-
41
- def is_numword(x):
42
- return is_number(x) or x in numwords
43
-
44
- def from_numword(x):
45
- if is_number(x):
46
- return 0, int(x.replace(',', ''))
47
- return numwords[x]
48
-
49
- for word in textnum.split():
50
- if word in ordinal_words:
51
- scale, increment = (1, ordinal_words[word])
52
- current = current * scale + increment
53
- if scale > 100:
54
- result += current
55
- current = 0
56
- onnumber = True
57
- lastunit = False
58
- lastscale = False
59
- else:
60
- for ending, replacement in ordinal_endings:
61
- if word.endswith(ending):
62
- word = f"{word[:-len(ending)]}{replacement}"
63
-
64
- if not is_numword(word) or (word == 'and' and not lastscale):
65
- if onnumber:
66
- curstring += repr(result + current) + " "
67
- curstring += word + " "
68
- result = current = 0
69
- onnumber = False
70
- lastunit = False
71
- lastscale = False
72
- else:
73
- scale, increment = from_numword(word)
74
- onnumber = True
75
-
76
- if lastunit and word not in scales:
77
- curstring += repr(result + current) + " "
78
- result = current = 0
79
-
80
- if scale > 1:
81
- current = max(1, current)
82
-
83
- current = current * scale + increment
84
-
85
- if scale >= 100:
86
- result += current
87
- current = 0
88
-
89
- lastscale = word in scales
90
- lastunit = word in units
91
-
92
- if onnumber:
93
- curstring += repr(result + current)
94
-
95
- return curstring.strip()
96
-
97
-
98
- # In[ ]:
99
-
100
-
101
-
102
-
 
1
+ from isNumber import is_number # Remove or replace this if unnecessary
2
+
3
def text_to_int(textnum, numwords=None):
    """Replace spelled-out English numbers in *textnum* with digits.

    The text is split on whitespace (hyphens are first turned into spaces,
    so "twenty-one" becomes two tokens).  Consecutive number tokens are
    folded into one value and emitted as digits; every other token is
    copied through unchanged.  For example, "one hundred and twenty" is
    rendered as "120" (assuming ``is_number`` rejects plain words).

    Args:
        textnum: Input text.
        numwords: Optional mapping ``word -> (scale, increment)``.  When it
            is ``None`` or empty, the built-in English vocabulary (units,
            tens, "and", and the scales hundred / thousand / lac / million /
            billion / trillion) is used; an empty dict passed by the caller
            is filled in place.  A non-empty mapping is used as given.

    Returns:
        The converted text, tokens separated by single spaces.
    """
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
             'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen',
             'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen',
             'nineteen']
    # The two leading placeholders keep idx * 10 aligned with each word.
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
            'eighty', 'ninety']
    # Explicit values: deriving the exponent from list position breaks as
    # soon as a non-power-of-1000 scale such as "lac" (10^5) is inserted.
    scale_values = {
        'hundred': 10 ** 2,
        'thousand': 10 ** 3,
        'lac': 10 ** 5,
        'million': 10 ** 6,
        'billion': 10 ** 9,
        'trillion': 10 ** 12,
    }
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    if numwords is None:
        numwords = {}
    if not numwords:
        numwords['and'] = (1, 0)  # allows "one hundred and twenty"
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip placeholders so '' never counts as a number word
                numwords[word] = (1, idx * 10)
        for word, value in scale_values.items():
            numwords[word] = (value, 0)

    def digits_value(token):
        """Return the int value of a numeric token, or None."""
        if not is_number(token):
            return None
        try:
            return int(token.replace(',', ''))
        except ValueError:
            # NOTE(review): is_number is defined elsewhere; if it accepts
            # tokens int() cannot parse (e.g. decimals), pass them through
            # as plain text instead of raising.
            return None

    def is_numword(token):
        return digits_value(token) is not None or token in numwords

    # Normalize hyphenated forms such as "twenty-one".
    textnum = textnum.replace('-', ' ')

    current = result = 0
    pieces = []
    onnumber = False
    lastunit = False
    lastscale = False

    for word in textnum.split():
        if word in ordinal_words:
            # Irregular ordinals simply add their value to the running group.
            current += ordinal_words[word]
            onnumber = True
            lastunit = lastscale = False
            continue

        # Regular ordinals ("fourth", "twentieth", "5th"): use the stem only
        # when it really is a number token, so ordinary words ending in
        # "th" ("with", "month") are left intact.
        for ending, replacement in ordinal_endings:
            if word.endswith(ending):
                stem = word[:-len(ending)] + replacement
                if stem and is_numword(stem):
                    word = stem
                    break

        number = digits_value(word)
        known = number is not None or word in numwords

        # "and" only belongs to a number directly after a scale word.
        if not known or (word == 'and' and not lastscale):
            if onnumber:
                pieces.append(repr(result + current))
            pieces.append(word)
            result = current = 0
            onnumber = lastunit = lastscale = False
            continue

        # Digit tokens use scale 0, i.e. they replace the running group.
        scale, increment = (0, number) if number is not None else numwords[word]
        onnumber = True

        # A unit followed by anything but a scale starts a new number
        # ("one two" -> "1 2").
        if lastunit and word not in scale_values:
            pieces.append(repr(result + current))
            result = current = 0

        if scale > 1:
            current = max(1, current)  # bare "hundred" means one hundred

        current = current * scale + increment

        # NOTE(review): "hundred" is committed to the total immediately, so
        # a larger scale after it multiplies nothing ("three hundred
        # thousand" -> 1300).  Kept as-is; confirm intended behavior before
        # changing the threshold.
        if scale >= 100:
            result += current
            current = 0

        lastscale = word in scale_values
        lastunit = word in units

    if onnumber:
        pieces.append(repr(result + current))

    return " ".join(pieces)