coqui2

Sleeping

App Files Files Community

coqui2 / TTS /tts /utils /text /chinese_mandarin /numbers.py

Adoetz

Upload 833 files

17ed7d8 verified 2 months ago

raw

history blame contribute delete

4.58 kB

	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-

	# Licensed under WTFPL or the Unlicense or CC0.
	# This uses Python 3, but it's easy to port to Python 2 by changing
	# strings to u'xx'.

	import itertools
	import re


	def _num2chinese(num: str, big=False, simp=True, o=False, twoalt=False) -> str:
	"""Convert numerical arabic numbers (0->9) to chinese hanzi numbers (〇 -> 九)

	Args:
	num (str): arabic number to convert
	big (bool, optional): use financial characters. Defaults to False.
	simp (bool, optional): use simplified characters instead of tradictional characters. Defaults to True.
	o (bool, optional): use 〇 for 'zero'. Defaults to False.
	twoalt (bool, optional): use 两/兩 for 'two' when appropriate. Defaults to False.

	Raises:
	ValueError: if number is more than 1e48
	ValueError: if 'e' exposent in number

	Returns:
	str: converted number as hanzi characters
	"""

	# check num first
	nd = str(num)
	if abs(float(nd)) >= 1e48:
	raise ValueError("number out of range")
	if "e" in nd:
	raise ValueError("scientific notation is not supported")
	c_symbol = "正负点" if simp else "正負點"
	if o: # formal
	twoalt = False
	if big:
	c_basic = "零壹贰叁肆伍陆柒捌玖" if simp else "零壹貳參肆伍陸柒捌玖"
	c_unit1 = "拾佰仟"
	c_twoalt = "贰" if simp else "貳"
	else:
	c_basic = "〇一二三四五六七八九" if o else "零一二三四五六七八九"
	c_unit1 = "十百千"
	if twoalt:
	c_twoalt = "两" if simp else "兩"
	else:
	c_twoalt = "二"
	c_unit2 = "万亿兆京垓秭穰沟涧正载" if simp else "萬億兆京垓秭穰溝澗正載"
	revuniq = lambda l: "".join(k for k, g in itertools.groupby(reversed(l)))
	nd = str(num)
	result = []
	if nd[0] == "+":
	result.append(c_symbol[0])
	elif nd[0] == "-":
	result.append(c_symbol[1])
	if "." in nd:
	integer, remainder = nd.lstrip("+-").split(".")
	else:
	integer, remainder = nd.lstrip("+-"), None
	if int(integer):
	splitted = [integer[max(i - 4, 0) : i] for i in range(len(integer), 0, -4)]
	intresult = []
	for nu, unit in enumerate(splitted):
	# special cases
	if int(unit) == 0: # 0000
	intresult.append(c_basic[0])
	continue
	if nu > 0 and int(unit) == 2: # 0002
	intresult.append(c_twoalt + c_unit2[nu - 1])
	continue
	ulist = []
	unit = unit.zfill(4)
	for nc, ch in enumerate(reversed(unit)):
	if ch == "0":
	if ulist: # ???0
	ulist.append(c_basic[0])
	elif nc == 0:
	ulist.append(c_basic[int(ch)])
	elif nc == 1 and ch == "1" and unit[1] == "0":
	# special case for tens
	# edit the 'elif' if you don't like
	# 十四, 三千零十四, 三千三百一十四
	ulist.append(c_unit1[0])
	elif nc > 1 and ch == "2":
	ulist.append(c_twoalt + c_unit1[nc - 1])
	else:
	ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
	ustr = revuniq(ulist)
	if nu == 0:
	intresult.append(ustr)
	else:
	intresult.append(ustr + c_unit2[nu - 1])
	result.append(revuniq(intresult).strip(c_basic[0]))
	else:
	result.append(c_basic[0])
	if remainder:
	result.append(c_symbol[2])
	result.append("".join(c_basic[int(ch)] for ch in remainder))
	return "".join(result)


	def _number_replace(match) -> str:
	    """Substitution callback: render the text of a regex match as chinese numerals.

	    Args:
	        match (re.Match): match object whose full matched text is the number to convert

	    Returns:
	        str: the matched number written with chinese characters (default options of ``_num2chinese``)
	    """
	    # group() with no argument yields the entire matched substring.
	    return _num2chinese(match.group())


	def replace_numbers_to_characters_in_text(text: str) -> str:
	    """Rewrite every run of ASCII digits in a text as chinese characters (simplified).

	    Only consecutive digits 0-9 are matched, so signs and decimal points are
	    left in place and the digit runs around them are converted separately.

	    Args:
	        text (str): input text to transform

	    Returns:
	        str: the text with each digit run replaced by its chinese reading
	    """
	    digit_run = re.compile(r"[0-9]+")
	    converted = digit_run.sub(_number_replace, text)
	    return converted