cdactvm committed on
Commit
0655b26
·
verified ·
1 Parent(s): c298401

Update text2int.py

Browse files
Files changed (1) hide show
  1. text2int.py +89 -200
text2int.py CHANGED
@@ -1,200 +1,89 @@
1
- #!/usr/bin/env python
2
- # coding: utf-8
3
-
4
- # In[ ]:
5
-
6
-
7
- # # Function to convert Hindi text to numerical representation
8
- # from isNumber import is_number
9
-
10
- # def text_to_int (textnum, numwords={}):
11
- # units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
12
- # 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
13
- # 'sixteen', 'seventeen', 'eighteen', 'nineteen']
14
- # tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
15
- # scales = ['hundred', 'thousand', 'lac','million', 'billion', 'trillion']
16
- # ordinal_words = {'first':1, 'second':2, 'third':3, 'fifth':5, 'eighth':8, 'ninth':9, 'twelfth':12}
17
- # ordinal_endings = [('ieth', 'y'), ('th', '')]
18
-
19
- # if not numwords:
20
- # numwords['and'] = (1, 0)
21
- # for idx, word in enumerate(units): numwords[word] = (1, idx)
22
- # for idx, word in enumerate(tens): numwords[word] = (1, idx * 10)
23
- # for idx, word in enumerate(scales): numwords[word] = (10 ** (idx * 3 or 2), 0)
24
-
25
- # textnum = textnum.replace('-', ' ')
26
-
27
- # current = result = 0
28
- # curstring = ''
29
- # onnumber = False
30
- # lastunit = False
31
- # lastscale = False
32
-
33
- # def is_numword(x):
34
- # if is_number(x):
35
- # return True
36
- # if word in numwords:
37
- # return True
38
- # return False
39
-
40
- # def from_numword(x):
41
- # if is_number(x):
42
- # scale = 0
43
- # increment = int(x.replace(',', ''))
44
- # return scale, increment
45
- # return numwords[x]
46
-
47
- # for word in textnum.split():
48
- # if word in ordinal_words:
49
- # scale, increment = (1, ordinal_words[word])
50
- # current = current * scale + increment
51
- # if scale > 100:
52
- # result += current
53
- # current = 0
54
- # onnumber = True
55
- # lastunit = False
56
- # lastscale = False
57
- # else:
58
- # for ending, replacement in ordinal_endings:
59
- # if word.endswith(ending):
60
- # word = "%s%s" % (word[:-len(ending)], replacement)
61
-
62
- # if (not is_numword(word)) or (word == 'and' and not lastscale):
63
- # if onnumber:
64
- # # Flush the current number we are building
65
- # curstring += repr(result + current) + " "
66
- # curstring += word + " "
67
- # result = current = 0
68
- # onnumber = False
69
- # lastunit = False
70
- # lastscale = False
71
- # else:
72
- # scale, increment = from_numword(word)
73
- # onnumber = True
74
-
75
- # if lastunit and (word not in scales):
76
- # # Assume this is part of a string of individual numbers to
77
- # # be flushed, such as a zipcode "one two three four five"
78
- # curstring += repr(result + current)
79
- # result = current = 0
80
-
81
- # if scale > 1:
82
- # current = max(1, current)
83
-
84
- # current = current * scale + increment
85
- # if scale > 100:
86
- # result += current
87
- # current = 0
88
-
89
- # lastscale = False
90
- # lastunit = False
91
- # if word in scales:
92
- # lastscale = True
93
- # elif word in units:
94
- # lastunit = True
95
-
96
- # if onnumber:
97
- # curstring += repr(result + current)
98
-
99
- # return curstring
100
-
101
-
102
- # In[3]:
103
-
104
-
105
- import nbimporter
106
- from isNumber import is_number # Remove or replace this if unnecessary
107
-
108
- def text_to_int(textnum, numwords={}):
109
- # Define units, tens, and scales including "lac"
110
- units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
111
- 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
112
- 'sixteen', 'seventeen', 'eighteen', 'nineteen']
113
- tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
114
- scales = ['hundred', 'thousand', 'lac', 'million', 'billion', 'trillion'] # "lac" added
115
- ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5, 'eighth': 8, 'ninth': 9, 'twelfth': 12}
116
- ordinal_endings = [('ieth', 'y'), ('th', '')]
117
-
118
- if not numwords:
119
- numwords['and'] = (1, 0) # Handle "one hundred and twenty"
120
-
121
- # Add units, tens, and scales to numwords
122
- for idx, word in enumerate(units):
123
- numwords[word] = (1, idx)
124
- for idx, word in enumerate(tens):
125
- numwords[word] = (1, idx * 10)
126
-
127
- for idx, word in enumerate(scales):
128
- numwords[word] = (10 ** (5 if word == 'lac' else idx * 3 or 2), 0) # Handle "lac" as 10^5
129
-
130
- # Remove hyphens and normalize input
131
- textnum = textnum.replace('-', ' ')
132
-
133
- current = result = 0
134
- curstring = ''
135
- onnumber = False
136
- lastunit = False
137
- lastscale = False
138
-
139
- def is_numword(x):
140
- return is_number(x) or x in numwords
141
-
142
- def from_numword(x):
143
- if is_number(x):
144
- return 0, int(x.replace(',', ''))
145
- return numwords[x]
146
-
147
- for word in textnum.split():
148
- if word in ordinal_words:
149
- scale, increment = (1, ordinal_words[word])
150
- current = current * scale + increment
151
- if scale > 100:
152
- result += current
153
- current = 0
154
- onnumber = True
155
- lastunit = False
156
- lastscale = False
157
- else:
158
- for ending, replacement in ordinal_endings:
159
- if word.endswith(ending):
160
- word = f"{word[:-len(ending)]}{replacement}"
161
-
162
- if not is_numword(word) or (word == 'and' and not lastscale):
163
- if onnumber:
164
- curstring += repr(result + current) + " "
165
- curstring += word + " "
166
- result = current = 0
167
- onnumber = False
168
- lastunit = False
169
- lastscale = False
170
- else:
171
- scale, increment = from_numword(word)
172
- onnumber = True
173
-
174
- if lastunit and word not in scales:
175
- curstring += repr(result + current) + " "
176
- result = current = 0
177
-
178
- if scale > 1:
179
- current = max(1, current)
180
-
181
- current = current * scale + increment
182
-
183
- if scale >= 100:
184
- result += current
185
- current = 0
186
-
187
- lastscale = word in scales
188
- lastunit = word in units
189
-
190
- if onnumber:
191
- curstring += repr(result + current)
192
-
193
- return curstring.strip()
194
-
195
-
196
- # In[ ]:
197
-
198
-
199
-
200
-
 
1
+ import nbimporter
2
+ from isNumber import is_number # Remove or replace this if unnecessary
3
+
4
def text_to_int(textnum, numwords=None):
    """Replace spelled-out English numbers in *textnum* with digits.

    The input is split on whitespace (hyphens are treated as spaces). Runs of
    number words are folded into a single integer, every other word is passed
    through unchanged, and the pieces are re-joined with single spaces.
    A unit word that directly follows another unit word starts a new number,
    so "one two three" becomes "1 2 3" rather than 6.

    Args:
        textnum: Text that may contain number words such as
            "one lac twenty thousand five hundred".
        numwords: Optional ``word -> (scale, increment)`` lookup table. When
            omitted or empty, the default English table (including the Indian
            scale word "lac" = 10**5) is built; an empty dict passed by the
            caller is filled in place so it can be reused across calls.

    Returns:
        The converted text with surrounding whitespace removed.
    """
    units = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
             'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen',
             'sixteen', 'seventeen', 'eighteen', 'nineteen']
    tens = ['', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety']
    # Explicit exponents: deriving them from the list position (idx * 3) broke
    # once "lac" was inserted, turning "million" into 10**9 and so on.
    scales = {'hundred': 2, 'thousand': 3, 'lac': 5,
              'million': 6, 'billion': 9, 'trillion': 12}
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = [('ieth', 'y'), ('th', '')]

    # Avoid a shared mutable default; build a fresh table when none is given.
    if numwords is None:
        numwords = {}
    if not numwords:
        numwords['and'] = (1, 0)  # Handle "one hundred and twenty"
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip the blank placeholders for 0 and 10
                numwords[word] = (1, idx * 10)
        for word, exponent in scales.items():
            numwords[word] = (10 ** exponent, 0)

    def is_numword(x):
        return is_number(x) or x in numwords

    def from_numword(x):
        # NOTE(review): assumes is_number() only accepts integer-like strings
        # (optionally with thousands separators); int() raises otherwise.
        if is_number(x):
            return 0, int(x.replace(',', ''))
        return numwords[x]

    current = result = 0
    pieces = []          # output tokens, joined with spaces at the end
    onnumber = False     # currently accumulating a number
    lastunit = False     # previous token was a unit word (zero..nineteen)
    lastscale = False    # previous token was a scale word (hundred, lac, ...)

    for word in textnum.replace('-', ' ').split():
        if word in ordinal_words:
            # Irregular ordinals behave like a plain unit increment.
            current += ordinal_words[word]
            onnumber = True
            lastunit = False
            lastscale = False
            continue

        # Strip a regular ordinal suffix only when what remains is really a
        # number word; otherwise ordinary words such as "with" or "month"
        # would be truncated in the output.
        for ending, replacement in ordinal_endings:
            if word.endswith(ending):
                candidate = word[:-len(ending)] + replacement
                if candidate and is_numword(candidate):
                    word = candidate
                    break

        if not is_numword(word) or (word == 'and' and not lastscale):
            # Plain word: flush any number in progress, then emit the word.
            if onnumber:
                pieces.append(repr(result + current))
            pieces.append(word)
            result = current = 0
            onnumber = False
            lastunit = False
            lastscale = False
            continue

        scale, increment = from_numword(word)
        onnumber = True

        if lastunit and word not in scales:
            # Consecutive units ("one two three") are separate numbers.
            pieces.append(repr(result + current))
            result = current = 0

        if scale > 1:
            # A bare scale word ("hundred") counts as one of that scale.
            current = max(1, current)

        current = current * scale + increment

        # Only scales above "hundred" close a group; "hundred" must stay in
        # `current` so that "five hundred thousand" multiplies to 500000.
        if scale > 100:
            result += current
            current = 0

        lastscale = word in scales
        lastunit = word in units

    if onnumber:
        pieces.append(repr(result + current))

    return " ".join(pieces)