|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import itertools
|
|
import re
|
|
|
|
|
|
def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
|
|
"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九)
|
|
|
|
Args:
|
|
num (str): arabic number to convert
|
|
big (bool, optional): use financial characters. Defaults to False.
|
|
simp (bool, optional): use simplified characters instead of tradictional characters. Defaults to True.
|
|
o (bool, optional): use 〇 for 'zero'. Defaults to False.
|
|
twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.
|
|
|
|
Raises:
|
|
ValueError: if number is more than 1e48
|
|
ValueError: if 'e' exposent in number
|
|
|
|
Returns:
|
|
str: converted number as hanzi characters
|
|
"""
|
|
|
|
|
|
nd = str(num)
|
|
if abs(float(nd)) >= 1e48:
|
|
raise ValueError("number out of range")
|
|
if "e" in nd:
|
|
raise ValueError("scientific notation is not supported")
|
|
c_symbol = "正负点" if simp else "正負點"
|
|
if o:
|
|
twoalt = False
|
|
if big:
|
|
c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
|
|
c_unit1 = "拾佰仟"
|
|
c_twoalt = "贰" if simp else "貳"
|
|
else:
|
|
c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
|
|
c_unit1 = "十百千"
|
|
if twoalt:
|
|
c_twoalt = "两" if simp else "兩"
|
|
else:
|
|
c_twoalt = "二"
|
|
c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
|
|
revuniq = lambda l: "".join(k for k, g in itertools.groupby(reversed(l)))
|
|
nd = str(num)
|
|
result = []
|
|
if nd[0] == "+":
|
|
result.append(c_symbol[0])
|
|
elif nd[0] == "-":
|
|
result.append(c_symbol[1])
|
|
if "." in nd:
|
|
integer, remainder = nd.lstrip("+-").split(".")
|
|
else:
|
|
integer, remainder = nd.lstrip("+-"), None
|
|
if int(integer):
|
|
splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
|
|
intresult = []
|
|
for nu, unit in enumerate(splitted):
|
|
|
|
if int(unit) == 0:
|
|
intresult.append(c_basic[0])
|
|
continue
|
|
if nu > 0 and int(unit) == 2:
|
|
intresult.append(c_twoalt + c_unit2[nu - 1])
|
|
continue
|
|
ulist = []
|
|
unit = unit.zfill(4)
|
|
for nc, ch in enumerate(reversed(unit)):
|
|
if ch == "0":
|
|
if ulist:
|
|
ulist.append(c_basic[0])
|
|
elif nc == 0:
|
|
ulist.append(c_basic[int(ch)])
|
|
elif nc == 1 and ch == "1" and unit[1] == "0":
|
|
|
|
|
|
|
|
ulist.append(c_unit1[0])
|
|
elif nc > 1 and ch == "2":
|
|
ulist.append(c_twoalt + c_unit1[nc - 1])
|
|
else:
|
|
ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
|
|
ustr = revuniq(ulist)
|
|
if nu == 0:
|
|
intresult.append(ustr)
|
|
else:
|
|
intresult.append(ustr + c_unit2[nu - 1])
|
|
result.append(revuniq(intresult).strip(c_basic[0]))
|
|
else:
|
|
result.append(c_basic[0])
|
|
if remainder:
|
|
result.append(c_symbol[2])
|
|
result.append("".join(c_basic[int(ch)] for ch in remainder))
|
|
return "".join(result)
|
|
|
|
|
|
def _number_replace(match) -> str:
    """Substitution callback: render the text of a regex match as chinese numerals.

    Args:
        match (re.Match): match object whose whole matched text is a number

    Returns:
        str: the matched number converted by ``_num2chinese`` with its default options
    """
    return _num2chinese(match.group())
|
|
|
|
|
|
def replace_numbers_to_characters_in_text(text: str) -> str:
    """Rewrite every run of arabic digits in a text as chinese characters (simplified).

    Only the digits 0-9 are matched; signs, decimal points and any other
    characters are left untouched, so each digit run is converted on its own.

    Args:
        text (str): input text to transform

    Returns:
        str: the text with each digit run replaced through ``_number_replace``
    """
    digit_run = re.compile(r"[0-9]+")
    return digit_run.sub(_number_replace, text)
|
|
|