Spaces:
Sleeping
Sleeping
Upload 6 files
Browse files- Text2List.py +91 -0
- convert2list.py +55 -0
- isNumber.py +22 -0
- processDoubles.py +31 -0
- replaceWords.py +137 -0
- text2int.py +102 -0
Text2List.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[1]:
|
5 |
+
|
6 |
+
|
7 |
+
def text_to_list():
    """Return the vocabulary of Gurmukhi-script number words.

    The list mixes two spellings for each value: English number names
    transliterated into Gurmukhi (e.g. the word for "twenty") and native
    Punjabi number names.  Two-word transliterated compounds ("twenty one")
    are single entries containing a space.

    A new list is built on every call, so callers may mutate the result
    freely.

    Returns:
        list[str]: 203 entries, grouped in the order shown below.
    """
    text_list = [
        # Punjabi script for English numbers (11-19)
        'ਏਲੈਵਨ', 'ਟਵੈਲਵ', 'ਥਰਟੀਨ', 'ਫੋਰਟੀਨ', 'ਫਿਫਟੀਨ', 'ਸਿਕਸਟੀਨ', 'ਸੈਵਨਟੀਨ', 'ਏਟੀਨ', 'ਨਾਈਨਟੀਨ',

        # Punjabi numbers (11-19)
        'ਗਿਆਰਹ', 'ਬਾਰਾਂ', 'ਤੇਹਰਾਂ', 'ਚੌਦਾਂਹ', 'ਪੰਦਰਾਂ', 'ਸੋਲਾਹ', 'ਸਤਾਰਾਂ', 'ਅਠਾਰਾਂ', 'ਉੱਨ੍ਹੀ',

        # Punjabi script for English multiples of ten (20, 30, ..., 90)
        'ਟਵੈਂਟੀ', 'ਥਰਟੀ', 'ਫੋਰਟੀ', 'ਫਿਫਟੀ', 'ਸਿਕਸਟੀ', 'ਸੇਵੰਟੀ', 'ਏਟੀ', 'ਨਾਈਂਟੀ',

        # Punjabi multiples of ten (20, 30, ..., 90)
        'ਵੀਹ', 'ਤੀਹ', 'ਚਾਲੀ', 'ਪੰਜਾਹ', 'ਸੱਠ', 'ਸਤੱਰ', 'ਅੱਸੀ', 'ਨੱਬੇ',

        # Punjabi script for English combinations of 21-29
        'ਟਵੈਂਟੀ ਵਨ', 'ਟਵੈਂਟੀ ਟੂ', 'ਟਵੈਂਟੀ ਥ੍ਰੀ', 'ਟਵੈਂਟੀ ਫੋਰ', 'ਟਵੈਂਟੀ ਫਾਈਵ', 'ਟਵੈਂਟੀ ਸਿਕਸ', 'ਟਵੈਂਟੀ ਸੇਵਨ', 'ਟਵੈਂਟੀ ਏਟ', 'ਟਵੈਂਟੀ ਨਾਈਨ',

        # Punjabi combinations of 21-29
        'ਇੱਕੀ', 'ਬਾਈ', 'ਤੇਈ', 'ਚੌਵੀ', 'ਪੱਚੀ', 'ਛੱਬੀ', 'ਸਤਾਈ', 'ਅਠਾਈ', 'ਉਂਣਤੀ',

        # Punjabi script for English combinations of 31-39
        'ਥਰਟੀ ਵਨ', 'ਥਰਟੀ ਟੂ', 'ਥਰਟੀ ਥ੍ਰੀ', 'ਥਰਟੀ ਫੋਰ', 'ਥਰਟੀ ਫਾਈਵ', 'ਥਰਟੀ ਸਿਕਸ', 'ਥਰਟੀ ਸੇਵਨ', 'ਥਰਟੀ ਏਟ', 'ਥਰਟੀ ਨਾਈਨ',

        # Punjabi combinations of 31-39
        'ਇਕੱਤੀ', 'ਬੱਤੀ', 'ਤੇਂਤੀ', 'ਚੋਨਤੀ', 'ਪੈਂਤੀ', 'ਛੱਤੀ', 'ਸੈਂਤੀ', 'ਅਠੱਤੀ', 'ਉਨਤਾਲੀ',

        # Punjabi script for English combinations of 41-49
        'ਫੋਰਟੀ ਵਨ', 'ਫੋਰਟੀ ਟੂ', 'ਫੋਰਟੀ ਥ੍ਰੀ', 'ਫੋਰਟੀ ਫੋਰ', 'ਫੋਰਟੀ ਫਾਈਵ', 'ਫੋਰਟੀ ਸਿਕਸ', 'ਫੋਰਟੀ ਸੇਵਨ', 'ਫੋਰਟੀ ਏਟ', 'ਫੋਰਟੀ ਨਾਈਨ',

        # Punjabi combinations of 41-49
        'ਇਕਤਾਲੀ', 'ਬਿਆਲੀ', 'ਤਰਤਾਲੀ', 'ਚੋਤਾਲੀ', 'ਪੰਤਾਲੀ', 'ਛਿਆਲੀ', 'ਸੰਤਾਲੀ', 'ਅੜਤਾਲੀ', 'ਉਣੰਜਾ',

        # Punjabi script for English combinations of 51-59
        'ਫਿਫਟੀ ਵਨ', 'ਫਿਫਟੀ ਟੂ', 'ਫਿਫਟੀ ਥ੍ਰੀ', 'ਫਿਫਟੀ ਫੋਰ', 'ਫਿਫਟੀ ਫਾਈਵ', 'ਫਿਫਟੀ ਸਿਕਸ', 'ਫਿਫਟੀ ਸੇਵਨ', 'ਫਿਫਟੀ ਏਟ', 'ਫਿਫਟੀ ਨਾਈਨ',

        # Punjabi combinations of 51-59
        'ਅਕਵੰਜਾ', 'ਬਵੰਜਾ', 'ਤਰਵੰਜਾ', 'ਚੁਰੰਜਾ', 'ਪਚਵੰਜਾ', 'ਛਪੰਜਾ', 'ਸਤਵੰਜਾ', 'ਅਠਵੰਜਾ', 'ਉਣਹਾਟ',

        # Punjabi script for English combinations of 61-69
        'ਸਿਕਸਟੀ ਵਨ', 'ਸਿਕਸਟੀ ਟੂ', 'ਸਿਕਸਟੀ ਥ੍ਰੀ', 'ਸਿਕਸਟੀ ਫੋਰ', 'ਸਿਕਸਟੀ ਫਾਈਵ', 'ਸਿਕਸਟੀ ਸਿਕਸ', 'ਸਿਕਸਟੀ ਸੇਵਨ', 'ਸਿਕਸਟੀ ਏਟ', 'ਸਿਕਸਟੀ ਨਾਈਨ',

        # Punjabi combinations of 61-69
        'ਇਕਹਾਟ', 'ਬਾਹਟ', 'ਤ੍ਰੇਹਟ', 'ਚੋਹਟ', 'ਪਹਿਨਟ', 'ਛੇਹਾਟ', 'ਸਤਾਹਟ', 'ਅਠਾਹਠ', 'ਉਂਣਹਤਰ',

        # Punjabi script for English combinations of 71-79
        'ਸੇਵੰਟੀ ਵਨ', 'ਸੇਵੰਟੀ ਟੂ', 'ਸੇਵੰਟੀ ਥ੍ਰੀ', 'ਸੇਵੰਟੀ ਫੋਰ', 'ਸੇਵੰਟੀ ਫਾਈਵ', 'ਸੇਵੰਟੀ ਸਿਕਸ', 'ਸੇਵੰਟੀ ਸੇਵਨ', 'ਸੇਵੰਟੀ ਏਟ', 'ਸੇਵੰਟੀ ਨਾਈਨ',

        # Punjabi combinations of 71-79
        'ਇਕਹੱਤਰ', 'ਬਹੱਤਰ', 'ਤਿਹੱਤਰ', 'ਚੌਹੱਤਰ', 'ਪਚਹੱਤਰ', 'ਛਿਹੱਤਰ', 'ਸੱਤਹੱਤਰ', 'ਅਠਹੱਤਰ', 'ਉਣਾਸੀ',

        # Punjabi script for English combinations of 81-89
        'ਏਟੀ ਵਨ', 'ਏਟੀ ਟੂ', 'ਏਟੀ ਥ੍ਰੀ', 'ਏਟੀ ਫੋਰ', 'ਏਟੀ ਫਾਈਵ', 'ਏਟੀ ਸਿਕਸ', 'ਏਟੀ ਸੇਵਨ', 'ਏਟੀ ਏਟ', 'ਏਟੀ ਨਾਈਨ',

        # Punjabi combinations of 81-89
        'ਇੱਕਿਆਸੀ', 'ਬਿਆਸੀ', 'ਤਰਾਸੀ', 'ਚੌਰਾਸੀ', 'ਪਚਾਸੀ', 'ਛਿਆਸੀ', 'ਸਤਾਸੀ', 'ਅਠਾਸੀ', 'ਉਣੰਨਵੇਂ',

        # Punjabi script for English combinations of 91-99
        'ਨਾਈਂਟੀ ਵਨ', 'ਨਾਈਂਟੀ ਟੂ', 'ਨਾਈਂਟੀ ਥ੍ਰੀ', 'ਨਾਈਂਟੀ ਫੋਰ', 'ਨਾਈਂਟੀ ਫਾਈਵ', 'ਨਾਈਂਟੀ ਸਿਕਸ', 'ਨਾਈਂਟੀ ਸੇਵਨ', 'ਨਾਈਂਟੀ ਏਟ', 'ਨਾਈਂਟੀ ਨਾਈਨ',

        # Punjabi combinations of 91-99
        'ਇੱਕਿਆਨਵੇ', 'ਬਨਵੇਂ', 'ਤੇਰਾਨਵੇਂ', 'ਚੌਰਨਵੇ', 'ਪਚੰਨਵੇਂ', 'ਛਿਆਨਵੇ', 'ਸਤੰਨਵੇ', 'ਅਠੰਨਵੇ', 'ਨੜ੍ਹੀਨਵੇਂ',

        # Punjabi script for English numbers (0-10)
        'ਜ਼ੀਰੋ', 'ਵਨ', 'ਟੂ', 'ਥ੍ਰੀ', 'ਫੋਰ', 'ਫਾਈਵ', 'ਸਿਕਸ', 'ਸੇਵਨ', 'ਏਟ', 'ਨਾਈਨ', 'ਟੈਨ',

        # Punjabi numbers (0-10)
        'ਸਿਫ਼ਰ', 'ਇੱਕ', 'ਦੋ', 'ਤਿੰਨ', 'ਚਾਰ', 'ਪੰਜ', 'ਛੇ', 'ਸੱਤ', 'ਅੱਠ', 'ਨੌ', 'ਦਸ',

        # Punjabi script for 100
        'ਹੰਡਰਡ',

        # Punjabi for 100
        'ਸੌ',

        # Punjabi for 1000
        'ਹਜ਼ਾਰ',
    ]
    return text_list
|
85 |
+
|
86 |
+
|
87 |
+
# In[ ]:
|
88 |
+
|
89 |
+
|
90 |
+
|
91 |
+
|
convert2list.py
ADDED
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[2]:
|
5 |
+
|
6 |
+
|
7 |
+
# import nbimporter
|
8 |
+
import nbimporter
|
9 |
+
from Text2List import text_to_list
|
10 |
+
def convert_to_list(text, text_list):
    """Insert spaces around every vocabulary word found in *text*.

    The text is scanned left to right.  At each position the longest entry
    of *text_list* that starts there is taken as one piece; characters that
    start no entry are collected into a run and emitted as a piece of their
    own.  All pieces are finally joined with single spaces, so whitespace
    already present in *text* is kept inside the unmatched runs (the result
    can therefore contain consecutive spaces).

    Args:
        text: String to segment.  A falsy value yields ``''``.
        text_list: Iterable of vocabulary strings.  Empty strings are
            ignored: they would match at every position without consuming
            any input and the scan would never terminate.

    Returns:
        str: The pieces joined by ``' '``.
    """
    if not text:
        return ''

    # Longest first so a compound wins over an entry that is its prefix.
    vocabulary = sorted((entry for entry in text_list if entry),
                        key=len, reverse=True)

    pieces = []
    pending = []  # characters seen since the last vocabulary hit
    pos = 0
    end = len(text)

    while pos < end:
        hit = next((entry for entry in vocabulary
                    if text.startswith(entry, pos)), None)
        if hit is None:
            pending.append(text[pos])
            pos += 1
            continue

        # Flush the unmatched run before the word that terminates it.
        if pending:
            pieces.append(''.join(pending))
            pending.clear()
        pieces.append(hit)
        pos += len(hit)

    if pending:
        pieces.append(''.join(pending))

    return ' '.join(pieces)
|
43 |
+
|
44 |
+
# text = "जीरोएकदोतीनचारपांचछहसातआठनौदसजीरोएकदोतीनचारपांच"
|
45 |
+
|
46 |
+
# if __name__=="__main__":
|
47 |
+
# converted=convert_to_list(text, text_to_list())
|
48 |
+
# print(converted)
|
49 |
+
|
50 |
+
|
51 |
+
# In[ ]:
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
|
isNumber.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[1]:
|
5 |
+
|
6 |
+
|
7 |
+
# Function to check if the string is a number
|
8 |
+
def is_number(x):
    """Return True when *x* can be converted by ``float()``.

    For strings, thousands separators (``,``) are removed before the
    conversion is attempted, so ``"1,234.5"`` counts as a number.  Anything
    ``float()`` accepts is a number here, including ``"1e3"``, ``"nan"`` and
    ``"inf"``.

    Args:
        x: Any object.

    Returns:
        bool: False when ``float()`` rejects the value, True otherwise.
    """
    if isinstance(x, str):
        x = x.replace(',', '')
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: unsupported type (None, list, ...);
        # ValueError: unparsable string;
        # OverflowError: int too large for a float.
        return False
    return True
|
16 |
+
|
17 |
+
|
18 |
+
# In[ ]:
|
19 |
+
|
20 |
+
|
21 |
+
|
22 |
+
|
processDoubles.py
ADDED
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[ ]:
|
5 |
+
|
6 |
+
|
7 |
+
import re
|
8 |
+
|
9 |
+
def process_doubles(sentence):
    """Expand the marker word 'ਡਬਲ' ("double") by repeating the next token.

    A marker glued to the following text is first split off
    ("ਡਬਲਵਨ" -> "ਡਬਲ ਵਨ").  Each marker token is then dropped and the token
    after it is emitted twice.  A marker that is the last token has nothing
    to duplicate and is kept unchanged.

    Args:
        sentence: Whitespace-separated text.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    # Detach the marker from whatever non-space text directly follows it.
    spaced = re.sub(r'(ਡਬਲ)(\S+)', r'\1 \2', sentence)

    tokens = iter(spaced.split())
    expanded = []
    for token in tokens:
        if token != "ਡਬਲ":
            expanded.append(token)
            continue

        # Consume the token after the marker so it is not visited again.
        following = next(tokens, None)
        if following is None:
            expanded.append(token)
        else:
            expanded.extend((following, following))

    return ' '.join(expanded)
|
31 |
+
|
replaceWords.py
ADDED
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[9]:
|
5 |
+
|
6 |
+
|
7 |
+
import re
|
8 |
+
# English number name -> Gurmukhi spellings that stand for it.
# Entries containing a space can never equal a single whitespace-split
# token; the same result is reached word by word ("twenty" then "one").
_REPLACEMENT_MAP = {
    'one': ['ਵਨ', 'ਇੱਕ'],
    'two': ['ਟੂ', 'ਦੋ'],
    'three': ['ਥ੍ਰੀ', 'ਤਿੰਨ'],
    'four': ['ਫੋਰ', 'ਚਾਰ'],
    'five': ['ਫਾਈਵ', 'ਪੰਜ'],
    'six': ['ਸਿਕਸ', 'ਛੇ'],
    'seven': ['ਸੇਵਨ', 'ਸੱਤ'],
    'eight': ['ਏਟ', 'ਅੱਠ'],
    'nine': ['ਨਾਈਨ', 'ਨੌ'],
    'ten': ['ਟੈਨ', 'ਦਸ'],

    'eleven': ['ਇਲੇਵਨ', 'ਗਿਆਰਹ'],
    'twelve': ['ਟਵੈਲਵ', 'ਬਾਰਾਂ'],
    'thirteen': ['ਥਰਟੀਨ', 'ਤੇਹਰਾਂ'],
    'fourteen': ['ਫੋਰਟੀਨ', 'ਚੌਦਾਂਹ'],
    'fifteen': ['ਫਿਫਟੀਨ', 'ਪੰਦਰਾਂ'],
    'sixteen': ['ਸਿਕਸਟਿਨ', 'ਸੋਲਾਹ'],
    'seventeen': ['ਸੈਵਨਟੀਨ', 'ਸਤਾਰਾਂ'],
    'eighteen': ['ਏਟੀਨ', 'ਅਠਾਰਾਂ'],
    'nineteen': ['ਨਾਈਨਟੀਨ', 'ਉੱਨ੍ਹੀ'],

    'twenty': ['ਟਵੈਂਟੀ', 'ਵੀਹ'],
    'twenty one': ['ਟਵੈਂਟੀ ਵਨ', 'ਇੱਕੀ'],
    'twenty two': ['ਟਵੈਂਟੀ ਟੂ', 'ਬਾਈ'],
    'twenty three': ['ਟਵੈਂਟੀ ਥ੍ਰੀ', 'ਤੇਈ'],
    'twenty four': ['ਟਵੈਂਟੀ ਫੋਰ', 'ਚੋਵੀ'],
    'twenty five': ['ਟਵੈਂਟੀ ਫਾਈਵ', 'ਪੱਚੀ'],
    'twenty six': ['ਟਵੈਂਟੀ ਸਿਕਸ', 'ਛੱਬੀ'],
    'twenty seven': ['ਟਵੈਂਟੀ ਸੇਵਨ', 'ਸਤਾਈ'],
    'twenty eight': ['ਟਵੈਂਟੀ ਏਟ', 'ਅਠਾਈ'],
    'twenty nine': ['ਟਵੈਂਟੀ ਨਾਈਨ', 'ਉਂਣਤੀ'],

    'thirty': ['ਥਰਟੀ', 'ਤੀਹ'],
    'thirty one': ['ਥਰਟੀ ਵਨ', 'ਇਕੱਤੀ'],
    'thirty two': ['ਥਰਟੀ ਟੂ', 'ਬੱਤੀ'],
    'thirty three': ['ਥਰਟੀ ਥ੍ਰੀ', 'ਤੇਤੀ'],
    'thirty four': ['ਥਰਟੀ ਫੋਰ', 'ਚੋਨਤੀ'],
    'thirty five': ['ਥਰਟੀ ਫਾਈਵ', 'ਪੈਂਤੀ'],
    'thirty six': ['ਥਰਟੀ ਸਿਕਸ', 'ਛੱਤੀ'],
    'thirty seven': ['ਥਰਟੀ ਸੇਵਨ', 'ਸੈਂਤੀ'],
    'thirty eight': ['ਥਰਟੀ ਏਟ', 'ਅਠੱਤੀ'],
    'thirty nine': ['ਥਰਟੀ ਨਾਈਨ', 'ਉਨਤਾਲੀ'],

    'forty': ['ਫੋਰਟੀ', 'ਚਾਲੀ'],
    'forty one': ['ਫੋਰਟੀ ਵਨ', 'ਇਕਤਾਲੀ'],
    'forty two': ['ਫੋਰਟੀ ਟੂ', 'ਬਿਆਲੀ'],
    'forty three': ['ਫੋਰਟੀ ਥ੍ਰੀ', 'ਤਰਤਾਲੀ'],
    'forty four': ['ਫੋਰਟੀ ਫੋਰ', 'ਚੋਤਾਲੀ'],
    'forty five': ['ਫੋਰਟੀ ਫਾਈਵ', 'ਪੰਤਾਲੀ'],
    'forty six': ['ਫੋਰਟੀ ਸਿਕਸ', 'ਛਿਆਲੀ'],
    'forty seven': ['ਫੋਰਟੀ ਸੇਵਨ', 'ਸੈਂਤਾਲੀ'],
    'forty eight': ['ਫੋਰਟੀ ਏਟ', 'ਅੜਤਾਲੀ'],
    'forty nine': ['ਫੋਰਟੀ ਨਾਈਨ', 'ਉਣੰਜਾ'],

    'fifty': ['ਫਿਫਟੀ', 'ਪੰਜਾਹ'],
    'fifty one': ['ਫਿਫਟੀ ਵਨ', 'ਅਕਵੰਜਾ'],
    'fifty two': ['ਫਿਫਟੀ ਟੂ', 'ਬਵੰਜਾ'],
    'fifty three': ['ਫਿਫਟੀ ਥ੍ਰੀ', 'ਤਰਵੰਜਾ'],
    'fifty four': ['ਫਿਫਟੀ ਫੋਰ', 'ਚੁਰੰਜਾ'],
    'fifty five': ['ਫਿਫਟੀ ਫਾਈਵ', 'ਪਚਵੰਜਾ'],
    'fifty six': ['ਫਿਫਟੀ ਸਿਕਸ', 'ਛਪੰਜਾ'],
    'fifty seven': ['ਫਿਫਟੀ ਸੇਵਨ', 'ਸਤਵੰਜਾ'],
    'fifty eight': ['ਫਿਫਟੀ ਏਟ', 'ਅਠਵੰਜਾ'],
    'fifty nine': ['ਫਿਫਟੀ ਨਾਈਨ', 'ਉਣਹਾਟ'],

    'sixty': ['ਸਿਕਸਟੀ', 'ਸੱਠ'],
    'sixty one': ['ਸਿਕਸਟੀ ਵਨ', 'ਇਕਹਾਟ'],
    'sixty two': ['ਸਿਕਸਟੀ ਟੂ', 'ਬਾਹਟ'],
    'sixty three': ['ਸਿਕਸਟੀ ਥ੍ਰੀ', 'ਤ੍ਰੇਹਟ'],
    'sixty four': ['ਸਿਕਸਟੀ ਫੋਰ', 'ਚੋਹਟ'],
    'sixty five': ['ਸਿਕਸਟੀ ਫਾਈਵ', 'ਪਹਿਨਟ'],
    'sixty six': ['ਸਿਕਸਟੀ ਸਿਕਸ', 'ਛੇਹਾਟ'],
    'sixty seven': ['ਸਿਕਸਟੀ ਸੇਵਨ', 'ਸਤਾਹਟ'],
    'sixty eight': ['ਸਿਕਸਟੀ ਏਟ', 'ਅਠਾਹਠ'],
    'sixty nine': ['ਸਿਕਸਟੀ ਨਾਈਨ', 'ਉਂਣਹਤਰ'],

    'seventy': ['ਸੇਵੰਟੀ', 'ਸੱਤਰ'],
    'seventy one': ['ਸੇਵੰਟੀ ਵਨ', 'ਇਕਹੱਤਰ'],
    'seventy two': ['ਸੇਵੰਟੀ ਟੂ', 'ਬਹੱਤਰ'],
    'seventy three': ['ਸੇਵੰਟੀ ਥ੍ਰੀ', 'ਤਿਹੱਤਰ'],
    'seventy four': ['ਸੇਵੰਟੀ ਫੋਰ', 'ਚੌਹੱਤਰ'],
    'seventy five': ['ਸੇਵੰਟੀ ਫਾਈਵ', 'ਪਚਹੱਤਰ'],
    'seventy six': ['ਸੇਵੰਟੀ ਸਿਕਸ', 'ਛਿਹੱਤਰ'],
    'seventy seven': ['ਸੇਵੰਟੀ ਸੇਵਨ', 'ਸਤਹੱਤਰ'],
    'seventy eight': ['ਸੇਵੰਟੀ ਏਟ', 'ਅਠਹੱਤਰ'],
    'seventy nine': ['ਸੇਵੰਟੀ ਨਾਈਨ', 'ਉਣਾਸੀ'],

    'eighty': ['ਏਟੀ', 'ਅਸੀ', 'ਅੱਸੀ'],
    'eighty one': ['ਏਟੀ ਵਨ', 'ਇੱਕਿਆਸੀ'],
    'eighty two': ['ਏਟੀ ਟੂ', 'ਬਿਆਸੀ'],
    'eighty three': ['ਏਟੀ ਥ੍ਰੀ', 'ਤਿਰਾਸੀ'],
    'eighty four': ['ਏਟੀ ਫੋਰ', 'ਚੌਰਾਸੀ'],
    'eighty five': ['ਏਟੀ ਫਾਈਵ', 'ਪਚਾਸੀ'],
    'eighty six': ['ਏਟੀ ਸਿਕਸ', 'ਛਿਆਸੀ'],
    'eighty seven': ['ਏਟੀ ਸੇਵਨ', 'ਸਤਾਸੀ'],
    'eighty eight': ['ਏਟੀ ਏਟ', 'ਅਠਾਸੀ'],
    'eighty nine': ['ਏਟੀ ਨਾਈਨ', 'ਨਵਾਸੀ'],

    'ninety': ['ਨਾਇੰਟੀ', 'ਨੱਬੇ'],
    'ninety one': ['ਨਾਇੰਟੀ ਵਨ', 'ਇੱਕਿਆਨਵੇ'],
    'ninety two': ['ਨਾਇੰਟੀ ਟੂ', 'ਬਨਵੇਂ'],
    'ninety three': ['ਨਾਇੰਟੀ ਥ੍ਰੀ', 'ਤੇਰਾਨਵੇਂ'],
    'ninety four': ['ਨਾਇੰਟੀ ਫੋਰ', 'ਚੌਰਾਨਵੇ'],
    'ninety five': ['ਨਾਇੰਟੀ ਫਾਈਵ', 'ਪਚੰਨਵੇਂ'],
    'ninety six': ['ਨਾਇੰਟੀ ਸਿਕਸ', 'ਛਿਆਨਵੇ'],
    'ninety seven': ['ਨਾਇੰਟੀ ਸੇਵਨ', 'ਸਤੰਨਵੇ'],
    'ninety eight': ['ਨਾਇੰਟੀ ਏਟ', 'ਅਠੰਨਵੇ'],
    'ninety nine': ['ਨਾਇੰਟੀ ਨਾਈਨ', 'ਨੜ੍ਹੀਨਵੇਂ'],

    'hundred': ['ਹੰਡਰਡ', 'ਸੌ'],
}

# Spellings that the segmentation vocabulary in Text2List.text_to_list uses
# but the table above lacks; without them those tokens pass through
# untranslated.  Each is mapped according to its position in that list.
_EXTRA_SPELLINGS = {
    'zero': ['ਜ਼ੀਰੋ', 'ਸਿਫ਼ਰ'],
    'eleven': ['ਏਲੈਵਨ'],
    'sixteen': ['ਸਿਕਸਟੀਨ'],
    'twenty four': ['ਚੌਵੀ'],
    'thirty three': ['ਤੇਂਤੀ'],
    'forty seven': ['ਸੰਤਾਲੀ'],
    'seventy': ['ਸਤੱਰ'],
    'seventy seven': ['ਸੱਤਹੱਤਰ'],
    'eighty three': ['ਤਰਾਸੀ'],
    'eighty nine': ['ਉਣੰਨਵੇਂ'],
    'ninety': ['ਨਾਈਂਟੀ'],
    'ninety four': ['ਚੌਰਨਵੇ'],
    'thousand': ['ਹਜ਼ਾਰ'],
}


def _build_word_lookup():
    """Invert the tables above into a token -> English dictionary."""
    lookup = {}
    # A later key overwrites an earlier one for a shared spelling, matching
    # the outcome of scanning the whole table for every token.
    for english, spellings in _REPLACEMENT_MAP.items():
        for spelling in spellings:
            lookup[spelling] = english
    # Supplementary spellings never override a primary mapping.
    for english, spellings in _EXTRA_SPELLINGS.items():
        for spelling in spellings:
            lookup.setdefault(spelling, english)
    return lookup


_WORD_TO_ENGLISH = _build_word_lookup()


def replace_words(sentence):
    """Translate Gurmukhi number words in *sentence* to English names.

    The sentence is split on whitespace; each token that exactly equals a
    known spelling is replaced by its English number name (which may be two
    words, e.g. ``'twenty one'``).  All other tokens are kept as they are.

    Args:
        sentence: Whitespace-separated text.

    Returns:
        str: The tokens, translated where possible, joined by single spaces.
    """
    return ' '.join(_WORD_TO_ENGLISH.get(token, token)
                    for token in sentence.split())
|
131 |
+
|
132 |
+
|
133 |
+
# In[ ]:
|
134 |
+
|
135 |
+
|
136 |
+
|
137 |
+
|
text2int.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[3]:
|
5 |
+
|
6 |
+
|
7 |
+
import nbimporter
|
8 |
+
from isNumber import is_number # Remove or replace this if unnecessary
|
9 |
+
|
10 |
+
def text_to_int(textnum, numwords=None):
    """Replace English number words in *textnum* with digits.

    Runs of number words are folded into integers ("one hundred and twenty
    three" -> "123"); a unit word directly after another unit word starts a
    new number, so digit sequences survive ("one two three" -> "1 2 3").
    Words that are not numbers are passed through unchanged.  Hyphens are
    treated as spaces.  Ordinals ("first", "fourth", "twentieth") are
    converted to their cardinal value.

    Args:
        textnum: Text to convert.
        numwords: Optional mapping ``word -> (scale, increment)``.  When
            omitted a fresh default table is used; when an empty dict is
            passed it is filled in place with the default table; a
            non-empty dict is used as given.

    Returns:
        str: The converted text, tokens separated by single spaces.

    NOTE(review): a literal digit token carries scale 0, so it overwrites a
    pending value instead of adding to it ("5 6" -> "6").  That is how the
    arithmetic below has always worked; confirm whether it is intended.
    """
    units = ('zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven',
             'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen',
             'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen',
             'nineteen')
    tens = ('', '', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy',
            'eighty', 'ninety')
    # Explicit values: deriving them from list position breaks as soon as a
    # non-power-of-1000 scale such as "lac" is inserted.
    scale_values = {
        'hundred': 10 ** 2,
        'thousand': 10 ** 3,
        'lac': 10 ** 5,
        'million': 10 ** 6,
        'billion': 10 ** 9,
        'trillion': 10 ** 12,
    }
    scales = tuple(scale_values)
    ordinal_words = {'first': 1, 'second': 2, 'third': 3, 'fifth': 5,
                     'eighth': 8, 'ninth': 9, 'twelfth': 12}
    ordinal_endings = (('ieth', 'y'), ('th', ''))

    if numwords is None:
        numwords = {}
    if not numwords:
        numwords['and'] = (1, 0)  # Handle "one hundred and twenty"
        for idx, word in enumerate(units):
            numwords[word] = (1, idx)
        for idx, word in enumerate(tens):
            if word:  # skip the '' placeholders for 0 and 10
                numwords[word] = (1, idx * 10)
        for word, value in scale_values.items():
            numwords[word] = (value, 0)

    def lookup(token):
        """Return (scale, increment) for a number token, else None."""
        try:
            # Integer literal, thousands separators allowed.
            return 0, int(token.replace(',', ''))
        except ValueError:
            return numwords.get(token)

    current = result = 0
    pieces = []
    onnumber = lastunit = lastscale = False

    for word in textnum.replace('-', ' ').split():
        if word in ordinal_words:
            current += ordinal_words[word]
            onnumber = True
            lastunit = lastscale = False
            continue

        entry = lookup(word)
        if entry is None:
            # Try the cardinal form of a regular ordinal ("fourth" ->
            # "four"), but keep the word intact if that is not a number so
            # that e.g. "with" is not emitted as "wi".
            cardinal = word
            for ending, replacement in ordinal_endings:
                if cardinal.endswith(ending):
                    cardinal = f"{cardinal[:-len(ending)]}{replacement}"
            if cardinal != word:
                entry = lookup(cardinal)
                if entry is not None:
                    word = cardinal

        if entry is None or (word == 'and' and not lastscale):
            # Ordinary word: flush any number in progress, then the word.
            if onnumber:
                pieces.append(repr(result + current))
            pieces.append(word)
            result = current = 0
            onnumber = lastunit = lastscale = False
            continue

        scale, increment = entry
        onnumber = True

        if lastunit and word not in scales:
            # Two units in a row: a string of separate numbers.
            pieces.append(repr(result + current))
            result = current = 0

        if scale > 1:
            current = max(1, current)  # bare "hundred" means 100

        current = current * scale + increment

        # "hundred" stays pending so a following scale can multiply it
        # ("three hundred thousand"); larger scales are banked.
        if scale > 100:
            result += current
            current = 0

        lastscale = word in scales
        lastunit = word in units

    if onnumber:
        pieces.append(repr(result + current))

    return ' '.join(pieces)
|
96 |
+
|
97 |
+
|
98 |
+
# In[ ]:
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
|