Spaces:

robinhad
/

ukrainian-tts

Running

App Files Files Community

codevision committed on Aug 16, 2023

Commit

7b54849

unverified ·

1 Parent(s): 7a439aa

Account for different forms of currency name (#33)

Browse files

Files changed (4) hide show

requirements.txt +1 -1
setup.py +1 -1
tests/test_formatter.py +24 -18
ukrainian_tts/formatter.py +40 -5

requirements.txt CHANGED Viewed

@@ -2,7 +2,7 @@
 torch
 espnet>=202301
 typeguard<3 # typeguard 3.0.0 is incompatible with espnet
-num2words==0.5.12
 ukrainian-word-stress==1.0.2
 git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c
 gradio # 3.34

 torch
 espnet>=202301
 typeguard<3 # typeguard 3.0.0 is incompatible with espnet
+git+https://github.com/kant2002/num2words.git@kant/add-cases
 ukrainian-word-stress==1.0.2
 git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c
 gradio # 3.34

setup.py CHANGED Viewed

@@ -14,7 +14,7 @@ setup(
     install_requires=[
         "espnet>=202301",
         "typeguard<3",
-        "num2words==0.5.12",
         "ukrainian-word-stress==1.0.1",
         "ukrainian_accentor @ git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c",
     ],

     install_requires=[
         "espnet>=202301",
         "typeguard<3",
+        "num2words @ git+https://github.com/kant2002/num2words.git@kant/add-cases",
         "ukrainian-word-stress==1.0.1",
         "ukrainian_accentor @ git+https://github.com/egorsmkv/ukrainian-accentor.git@5b7971c4e135e3ff3283336962e63fc0b1c80f4c",
     ],

tests/test_formatter.py CHANGED Viewed

@@ -1,20 +1,26 @@
 from ukrainian_tts.formatter import preprocess_text
-def test_formatter():
-    examples = [
-        ("Quality of life update", "кваліті оф ліфе юпдате"),
-        ("Він украв 20000000 $", "він украв двадцять мільйонів долар"),
-        (
-            "111 000 000 000 доларів державного боргу.",
-            "сто одинадцять мільярдів доларів державного боргу.",
-        ),
-        (
-            "11100000001 доларів державного боргу.",
-            "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
-        ),
-        ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
-        ("10-30-40-50-5-9-5", "десять-тридцять-сорок-п'ятдесят-п'ять-дев'ять-п'ять"),
-    ]
-    for item in examples:
-        assert preprocess_text(item[0]) == item[1]

 from ukrainian_tts.formatter import preprocess_text
+import pytest
+@pytest.mark.parametrize('text,expected', [
+    ("Quality of life update", "кваліті оф ліфе юпдате"),
+    ("Він украв 20000000 $", "він украв двадцять мільйонів доларів"),
+    ("Він украв 20000000", "він украв двадцять мільйонів"),
+    ("Він украв 1 $", "він украв один долар"),
+    ("Він украв 2 $", "він украв два долари"),
+    ("Він украв 2 ₴", "він украв дві гривні"),
+    (
+        "111 000 000 000 доларів державного боргу.",
+        "сто одинадцять мільярдів доларів державного боргу.",
+    ),
+    (
+        "11100000001 доларів державного боргу.",
+        "одинадцять мільярдів сто мільйонів один доларів державного боргу.",
+    ),
+    # Known limitation: the correct output is "це дев'ятнадцяти-річне вино."
+    # Implementing this requires proper parsing of words into a token stream,
+    # which requires reworking the current approach.
+    ("це 19-річне вино.", "це дев'ятнадцять-річне вино."),
+    ("10-30-40-50-5-9-5", "десять-тридцять-сорок-п'ятдесят-п'ять-дев'ять-п'ять"),
+])
+def test_formatter(text, expected):
+    assert preprocess_text(text) == expected

ukrainian_tts/formatter.py CHANGED Viewed

@@ -1,13 +1,37 @@
-import num2words
 import re
 def preprocess_text(text):
     text = text.lower()
     # currencies
-    text = text.replace("$", "долар")
-    text = text.replace("₴", "гривня")
-    text = text.replace("€", "євро")
     # replace apostrophe
     text = text.replace("`", "'")
     text = text.replace("ʼ", "'")
@@ -32,12 +56,15 @@ def preprocess_text(text):
     def detect_num_and_convert(word):
         numbers = "0123456789,."
         result = []
         parts = word.split("-")  # for handling complex words
         for part in parts:
             is_number = all(map(lambda x: x in numbers, part))
             if is_number:
                 try:
-                    result.append(num2words.num2words(part, lang="uk"))
                 except:
                     result.append(part)
             else:
@@ -46,6 +73,14 @@ def preprocess_text(text):
     # print([detect_num_and_convert(word) for word in text.split(" ")])
     text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
     # fallback numbers
     text = text.replace("1", "один ")

+from num2words import num2words
 import re
def number_form(number):
    """Return the index of the Ukrainian plural form that agrees with *number*.

    The index selects one entry of a three-item tuple of noun forms such as
    ``('долар', 'долари', 'доларів')``:

    * ``0`` - nominative singular: the number ends in 1, but not in 11.
    * ``1`` - nominative plural: the number ends in 2, 3 or 4, but not in
      12, 13 or 14.
    * ``2`` - genitive plural: everything else (0, 5-9 and the teens 11-14).

    Args:
        number: The number as a string of digits. Trailing ``,`` / ``.``
            characters (punctuation glued to the token) are ignored.

    Returns:
        ``0``, ``1`` or ``2`` as described above.

    Raises:
        ValueError: If nothing is left after stripping trailing separators.
    """
    # A token such as "2," comes from sentence punctuation; the separator must
    # not take part in choosing the form.
    digits = str(number).rstrip(",.")
    if not digits:
        raise ValueError(f"no digits to inspect in {number!r}")

    # 11-14 (and 111, 1012, ...) take the genitive plural even though their
    # last digit is 1-4, so they have to be checked before the last digit.
    if digits[-2:] in ("11", "12", "13", "14"):
        return 2

    last_digit = digits[-1]
    if last_digit == "1":
        return 0
    if last_digit in ("2", "3", "4"):
        return 1
    return 2
+CURRENCY = {
+    'USD': ('долар', 'долари', 'доларів'),
+    'UAH': ('гривня', 'гривні', 'гривень'),
+    'EUR': ('євро', 'євро', 'євро'),
+}
 def preprocess_text(text):
     text = text.lower()
     # currencies
+    if "$" in text:
+        currency = "USD"
+        gender = 'masculine'
+    elif "₴" in text:
+        currency = "UAH"
+        gender = 'feminine'
+    elif "€" in text:
+        currency = "EUR"
+        gender = 'masculine'
+    else:
+        currency = ""
+        gender = 'masculine'
+    num_form = 0
     # replace apostrophe
     text = text.replace("`", "'")
     text = text.replace("ʼ", "'")
     def detect_num_and_convert(word):
         numbers = "0123456789,."
         result = []
+        nonlocal num_form
         parts = word.split("-")  # for handling complex words
         for part in parts:
             is_number = all(map(lambda x: x in numbers, part))
             if is_number:
                 try:
+                    num_form = number_form(part)
+                    print("-" + part + "-" + str(num_form))
+                    result.append(num2words(part, lang="uk", gender=gender))
                 except:
                     result.append(part)
             else:
     # print([detect_num_and_convert(word) for word in text.split(" ")])
     text = " ".join([detect_num_and_convert(word) for word in text.split(" ")])
+    if (currency == 'USD'):
+        text = text.replace("$", CURRENCY[currency][num_form])
+    if (currency == 'UAH'):
+        text = text.replace("₴", CURRENCY[currency][num_form])
+    if (currency == 'EUR'):
+        text = text.replace("€", CURRENCY[currency][num_form])
     # fallback numbers
     text = text.replace("1", "один ")