wav of voices
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- README.md +5 -5
- app.py +221 -62
- espeak_util.py +206 -0
- requirements.txt +11 -3
- wav/af_ZA_google-nwu_0184.wav +0 -0
- wav/af_ZA_google-nwu_1919.wav +0 -0
- wav/af_ZA_google-nwu_2418.wav +0 -0
- wav/af_ZA_google-nwu_6590.wav +0 -0
- wav/af_ZA_google-nwu_7130.wav +0 -0
- wav/af_ZA_google-nwu_7214.wav +0 -0
- wav/af_ZA_google-nwu_8148.wav +0 -0
- wav/af_ZA_google-nwu_8924.wav +0 -0
- wav/af_ZA_google-nwu_8963.wav +0 -0
- wav/bn_multi_00737.wav +0 -0
- wav/bn_multi_00779.wav +0 -0
- wav/bn_multi_01232.wav +0 -0
- wav/bn_multi_01701.wav +0 -0
- wav/bn_multi_03042.wav +0 -0
- wav/bn_multi_0834.wav +0 -0
- wav/bn_multi_1010.wav +0 -0
- wav/bn_multi_3108.wav +0 -0
- wav/bn_multi_3713.wav +0 -0
- wav/bn_multi_3958.wav +0 -0
- wav/bn_multi_4046.wav +0 -0
- wav/bn_multi_4811.wav +0 -0
- wav/bn_multi_5958.wav +0 -0
- wav/bn_multi_9169.wav +0 -0
- wav/bn_multi_rm.wav +0 -0
- wav/de_DE_m-ailabs_angela_merkel.wav +0 -0
- wav/de_DE_m-ailabs_eva_k.wav +0 -0
- wav/de_DE_m-ailabs_karlsson.wav +0 -0
- wav/de_DE_m-ailabs_ramona_deininger.wav +0 -0
- wav/de_DE_m-ailabs_rebecca_braunert_plunkett.wav +0 -0
- wav/de_DE_thorsten-emotion_amused.wav +0 -0
- wav/el_GR_rapunzelina.wav +0 -0
- wav/en_UK_apope.wav +0 -0
- wav/en_US_cmu_arctic_aew.wav +0 -0
- wav/en_US_cmu_arctic_aup.wav +0 -0
- wav/en_US_cmu_arctic_awb.wav +0 -0
- wav/en_US_cmu_arctic_awbrms.wav +0 -0
- wav/en_US_cmu_arctic_axb.wav +0 -0
- wav/en_US_cmu_arctic_bdl.wav +0 -0
- wav/en_US_cmu_arctic_clb.wav +0 -0
- wav/en_US_cmu_arctic_eey.wav +0 -0
- wav/en_US_cmu_arctic_fem.wav +0 -0
- wav/en_US_cmu_arctic_gka.wav +0 -0
- wav/en_US_cmu_arctic_jmk.wav +0 -0
- wav/en_US_cmu_arctic_ksp.wav +0 -0
- wav/en_US_cmu_arctic_ljm.wav +0 -0
- wav/en_US_cmu_arctic_lnh.wav +0 -0
README.md
CHANGED
@@ -6,13 +6,13 @@ colorTo: gray
 sdk: gradio
 sdk_version: 5.41.1
 app_file: app.py
-
+short_description: TTS for CPU
 license: cc-by-nc-4.0
 tags:
-
-
-
-
+- non-AR
+- affective
+- shift
+- tts
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
CHANGED
@@ -1,22 +1,25 @@
+# -*- coding: utf-8 -*-
 import typing
 import types  # fusion of forward() of Wav2Vec2
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
-import
+import os
 import torch
 import torch.nn as nn
 from transformers import Wav2Vec2Processor
 from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model
 from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel
-
 import audiofile
+import unicodedata
+import textwrap
+from tts import StyleTTS2
 import audresample
 
 
 device = 0 if torch.cuda.is_available() else "cpu"
 duration = 2  # limit processing of audio
-age_gender_model_name = "audeering/wav2vec2-large-robust-
+age_gender_model_name = "audeering/wav2vec2-large-robust-6-ft-age-gender"
 expression_model_name = "audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim"
@@ -167,7 +170,7 @@ class ExpressionModel(Wav2Vec2PreTrainedModel):
 
 
 # Load models from hub
-
+
 age_gender_model = AgeGenderModel.from_pretrained(age_gender_model_name)
 expression_processor = Wav2Vec2Processor.from_pretrained(expression_model_name)
 expression_model = ExpressionModel.from_pretrained(expression_model_name)
@@ -206,12 +209,9 @@ def process_func(x: np.ndarray, sampling_rate: int) -> typing.Tuple[str, dict, s
         },
         expression_file,
     )
 
-
-@spaces.GPU
-def recognize(input_file: str) -> typing.Tuple[str, dict, str]:
-    # sampling_rate, signal = input_microphone
-    # signal = signal.astype(np.float32, order="C") / 32768.0
+
+def recognize(input_file):
     if input_file is None:
         raise gr.Error(
             "No audio file submitted! "
@@ -227,50 +227,6 @@ def recognize(input_file: str) -> typing.Tuple[str, dict, str]:
     return process_func(signal, target_rate)
 
 
-def plot_expression_RIGID(arousal, dominance, valence):
-    r"""3D pixel plot of arousal, dominance, valence."""
-    # Voxels per dimension
-    voxels = 7
-    # Create voxel grid
-    x, y, z = np.indices((voxels + 1, voxels + 1, voxels + 1))
-    voxel = (
-        (x == round(arousal * voxels))
-        & (y == round(dominance * voxels))
-        & (z == round(valence * voxels))
-    )
-    projection = (
-        (x == round(arousal * voxels))
-        & (y == round(dominance * voxels))
-        & (z < round(valence * voxels))
-    )
-    colors = np.empty((voxel | projection).shape, dtype=object)
-    colors[voxel] = "#fcb06c"
-    colors[projection] = "#fed7a9"
-    ax = plt.figure().add_subplot(projection='3d')
-    ax.voxels(voxel | projection, facecolors=colors, edgecolor='k')
-    ax.set_xlim([0, voxels])
-    ax.set_ylim([0, voxels])
-    ax.set_zlim([0, voxels])
-    ax.set_aspect("equal")
-    ax.set_xlabel("arousal", fontsize="large", labelpad=0)
-    ax.set_ylabel("dominance", fontsize="large", labelpad=0)
-    ax.set_zlabel("valence", fontsize="large", labelpad=0)
-    ax.set_xticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="bottom",
-    )
-    ax.set_yticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="bottom",
-    )
-    ax.set_zticks(
-        list(range(voxels + 1)),
-        labels=[0, None, None, None, None, None, None, 1],
-        verticalalignment="top",
-    )
-
 def explode(data):
     """
     Expands a 3D array by creating gaps between voxels.
@@ -282,6 +238,18 @@ def explode(data):
     retval[::2, ::2, ::2] = data
     return retval
 
+
+def explode(data):
+    """
+    Expands a 3D array by adding new voxels between existing ones.
+    This is used to create the gaps in the 3D plot.
+    """
+    shape = data.shape
+    new_shape = (2 * shape[0] - 1, 2 * shape[1] - 1, 2 * shape[2] - 1)
+    new_data = np.zeros(new_shape, dtype=data.dtype)
+    new_data[::2, ::2, ::2] = data
+    return new_data
+
 def plot_expression(arousal, dominance, valence):
     '''_h = cuda tensor (N_PIX, N_PIX, N_PIX)'''
 
@@ -302,7 +270,8 @@ def plot_expression(arousal, dominance, valence):
     y[:, 1::2, :] += 1
     z[:, :, 1::2] += 1
 
-
+    fig = plt.figure()
+    ax = fig.add_subplot(projection='3d')
 
     f_2 = np.ones([2 * N_PIX - 1,
                    2 * N_PIX - 1,
@@ -313,7 +282,6 @@ def plot_expression(arousal, dominance, valence):
 
     f_2[:, :, :, 3] = f_2[:, :, :, 3].clip(.01, .74)
 
-    print(f_2.shape, 'f_2 AAAA')
     ecolors_2 = f_2
 
     ax.voxels(x, y, z, filled_2, facecolors=f_2, edgecolors=.006 * ecolors_2)
@@ -348,10 +316,156 @@ def plot_expression(arousal, dominance, valence):
     ax.set_xlim(0, N_PIX)
     ax.set_ylim(0, N_PIX)
     ax.set_zlim(0, N_PIX)
+    # plt.show()
 
-
 
-
+
+# TTS
+VOICES = [f'wav/{vox}' for vox in os.listdir('wav')]
+_tts = StyleTTS2().to('cpu')
+
+def only_greek_or_only_latin(text, lang='grc'):
+    '''
+    str: The converted string in the specified target script.
+        Characters not found in any mapping are preserved as is.
+        Latin accented characters in the input (e.g., 'É', 'ü') will
+        be preserved in their lowercase form (e.g., 'é', 'ü') if
+        converting to Latin.
+    '''
+
+    # --- Mapping Dictionaries ---
+    # Keys are in lowercase as input text is case-folded.
+    # If the output needs to maintain original casing, additional logic is required.
+
+    latin_to_greek_map = {
+        'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
+        'ch': 'τσο',  # Example of a multi-character Latin sequence
+        'z': 'ζ', 'h': 'χ', 'i': 'ι', 'k': 'κ', 'l': 'λ',
+        'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π',
+        'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
+        'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
+    }
+
+    greek_to_latin_map = {
+        'ου': 'ou',  # Prioritize common diphthongs/digraphs
+        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
+        'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
+        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
+        'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y',  # 'y' is a common transliteration for upsilon
+        'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
+        'ς': 's',  # Final sigma
+    }
+
+    cyrillic_to_latin_map = {
+        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo', 'ж': 'zh',
+        'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
+        'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
+        'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
+        'я': 'ya',
+    }
+
+    # Direct Cyrillic to Greek mapping based on phonetic similarity.
+    # These are approximations and may not be universally accepted transliterations.
+    cyrillic_to_greek_map = {
+        'а': 'α', 'б': 'β', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ё': 'ιο', 'ж': 'ζ',
+        'з': 'ζ', 'и': 'ι', 'й': 'ι', 'κ': 'κ', 'λ': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο',
+        'π': 'π', 'ρ': 'ρ', 'σ': 'σ', 'τ': 'τ', 'у': 'ου', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
+        'ч': 'τσ',  # or τζ depending on desired sound
+        'ш': 'σ', 'щ': 'σ',  # approximations
+        'ъ': '', 'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιου',
+        'я': 'ια',
+    }
+
+    # Convert the input text to lowercase, preserving accents for Latin characters.
+    # casefold() is used for more robust caseless matching across Unicode characters.
+    lowercased_text = text.lower()  # casefold()
+    output_chars = []
+    current_index = 0
+
+    if lang == 'grc':
+        # Combine all relevant maps for direct lookup to Greek
+        conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
+
+        # Sort keys by length in reverse order to handle multi-character sequences first
+        sorted_source_keys = sorted(
+            list(latin_to_greek_map.keys()) + list(cyrillic_to_greek_map.keys()),
+            key=len,
+            reverse=True
+        )
+
+        while current_index < len(lowercased_text):
+            found_conversion = False
+            for key in sorted_source_keys:
+                if lowercased_text.startswith(key, current_index):
+                    output_chars.append(conversion_map[key])
+                    current_index += len(key)
+                    found_conversion = True
+                    break
+            if not found_conversion:
+                # If no specific mapping found, append the character as is.
+                # This handles unmapped characters and already Greek characters.
+                output_chars.append(lowercased_text[current_index])
+                current_index += 1
+        return ''.join(output_chars)
+
+    else:  # Default to 'lat' conversion
+        # Combine Greek to Latin and Cyrillic to Latin maps.
+        # Cyrillic map keys will take precedence in case of overlap if defined after Greek.
+        combined_to_latin_map = {**greek_to_latin_map, **cyrillic_to_latin_map}
+
+        # Sort all relevant source keys by length in reverse for replacement
+        sorted_source_keys = sorted(
+            list(greek_to_latin_map.keys()) + list(cyrillic_to_latin_map.keys()),
+            key=len,
+            reverse=True
+        )
+
+        while current_index < len(lowercased_text):
+            found_conversion = False
+            for key in sorted_source_keys:
+                if lowercased_text.startswith(key, current_index):
+                    latin_equivalent = combined_to_latin_map[key]
+
+                    # Strip accents ONLY if the source character was from the Greek map.
+                    # This preserves accents on original Latin characters (like 'é')
+                    # and allows for intentional accent stripping from Greek transliterations.
+                    if key in greek_to_latin_map:
+                        normalized_latin = unicodedata.normalize('NFD', latin_equivalent)
+                        stripped_latin = ''.join(c for c in normalized_latin if not unicodedata.combining(c))
+                        output_chars.append(stripped_latin)
+                    else:
+                        output_chars.append(latin_equivalent)
+
+                    current_index += len(key)
+                    found_conversion = True
+                    break
+
+            if not found_conversion:
+                # If no conversion happened from Greek or Cyrillic, append the character as is.
+                # This preserves existing Latin characters (including accented ones from input),
+                # numbers, punctuation, and other symbols.
+                output_chars.append(lowercased_text[current_index])
+                current_index += 1
+
+        return ''.join(output_chars)
+
+
+def other_tts(text='Hallov worlds Far over the',
+              ref_s='wav/af_ZA_google-nwu_0184.wav'):
+
+    text = only_greek_or_only_latin(text, lang='eng')
+
+    x = _tts.inference(text, ref_s=ref_s)[0, 0, :].cpu().numpy()
+
+    # x /= np.abs(x).max() + 1e-7 ~ Volume normalisation @api.py:tts_multi_sentence() OR demo.py
+
+    tmp_file = f'_speech.wav'  # N x clients (cleanup vs tmp file / client)
+
+    audiofile.write(tmp_file, x, 24000)
+
+    return tmp_file
+
+
+def update_selected_voice(voice_filename):
+    return 'wav/' + voice_filename + '.wav'
 
 
 description = (
@@ -366,7 +480,52 @@ description = (
 )
 
 with gr.Blocks() as demo:
-    with gr.Tab(label="
+    with gr.Tab(label="other TTS"):
+
+        selected_voice = gr.State(value='wav/en_US_m-ailabs_mary_ann.wav')
+
+        with gr.Row():
+            voice_info = gr.Markdown(f'TTS vox : `{selected_voice.value}`')
+
+        # Main input and output components
+        with gr.Row():
+            text_input = gr.Textbox(
+                label="Enter text for TTS:",
+                placeholder="Type your message here...",
+                lines=4,
+                value="Farover the misty mountains cold too dungeons deep and caverns old.",
+            )
+            generate_button = gr.Button("Generate Audio", variant="primary")
+
+        output_audio = gr.Audio(label="TTS Output")
+
+        with gr.Column():
+            voice_buttons = []
+            for i in range(0, len(VOICES), 7):
+                with gr.Row():
+                    for voice_filename in VOICES[i:i+7]:
+                        voice_filename = voice_filename[4:-4]  # drop wav/ for visibility
+                        button = gr.Button(voice_filename)
+
+                        button.click(
+                            fn=update_selected_voice,
+                            inputs=[gr.Textbox(value=voice_filename, visible=False)],
+                            outputs=[selected_voice]
+                        )
+                        button.click(
+                            fn=lambda v=voice_filename: f"TTS Vox = `{v}`",
+                            inputs=None,
+                            outputs=voice_info
+                        )
+                        voice_buttons.append(button)
+
+        generate_button.click(
+            fn=other_tts,
+            inputs=[text_input, selected_voice],
+            outputs=output_audio
+        )
+
+    with gr.Tab(label="Speech Analysis"):
         with gr.Row():
             with gr.Column():
                 gr.Markdown(description)
@@ -378,10 +537,10 @@ with gr.Blocks() as demo:
                 )
                 gr.Examples(
                     [
-                        "female-46-neutral.wav",
-                        "female-20-happy.wav",
-                        "male-60-angry.wav",
-                        "male-27-sad.wav",
+                        "wav/female-46-neutral.wav",
+                        "wav/female-20-happy.wav",
+                        "wav/male-60-angry.wav",
+                        "wav/male-27-sad.wav",
                     ],
                     [input],
                     label="Examples from CREMA-D, ODbL v1.0 license",
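For reference, a minimal sketch of how the new TTS path added to app.py can be driven outside the Gradio UI. It is not part of the commit; it only reuses names that appear in the diff above (other_tts, the wav/ reference voices, the 24 kHz output written by audiofile) and assumes the Space's tts.StyleTTS2 weights are available so that importing app succeeds.

# Usage sketch (not part of the commit): call the new TTS entry point directly.
# Importing app loads the Wav2Vec2 and StyleTTS2 models, so this is illustrative only.
from app import other_tts

out_path = other_tts(
    text="Far over the misty mountains cold",
    ref_s="wav/en_US_cmu_arctic_awb.wav",  # any reference voice from the wav/ folder added in this commit
)
print(out_path)  # "_speech.wav", written by audiofile.write(...) at 24000 Hz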
espeak_util.py
ADDED
@@ -0,0 +1,206 @@
import platform
import subprocess
import shutil
from pathlib import Path
import os
from typing import Optional, Tuple
from phonemizer.backend.espeak.wrapper import EspeakWrapper


class EspeakConfig:
    """Utility class for configuring espeak-ng library and binary."""

    @staticmethod
    def find_espeak_binary() -> tuple[bool, Optional[str]]:
        """
        Find espeak-ng binary using multiple methods.

        Returns:
            tuple: (bool indicating if espeak is available, path to espeak binary if found)
        """
        # Common binary names
        binary_names = ["espeak-ng", "espeak"]
        if platform.system() == "Windows":
            binary_names = ["espeak-ng.exe", "espeak.exe"]

        # Common installation directories for Linux
        linux_paths = [
            "/usr/bin",
            "/usr/local/bin",
            "/usr/lib/espeak-ng",
            "/usr/local/lib/espeak-ng",
            "/opt/espeak-ng/bin",
        ]

        # First check if it's in PATH
        for name in binary_names:
            espeak_path = shutil.which(name)
            if espeak_path:
                return True, espeak_path

        # For Linux, check common installation directories
        if platform.system() == "Linux":
            for directory in linux_paths:
                for name in binary_names:
                    path = Path(directory) / name
                    if path.exists():
                        return True, str(path)

        # Try running the command directly as a last resort
        try:
            subprocess.run(
                ["espeak-ng", "--version"],
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                check=True,
            )
            return True, "espeak-ng"
        except (subprocess.SubprocessError, FileNotFoundError):
            pass

        return False, None

    @staticmethod
    def find_library_path() -> Optional[str]:
        """
        Find the espeak-ng library using multiple search methods.

        Returns:
            Optional[str]: Path to the library if found, None otherwise
        """
        system = platform.system()

        if system == "Linux":
            lib_names = ["libespeak-ng.so", "libespeak-ng.so.1"]
            common_paths = [
                # Debian/Ubuntu paths
                "/usr/lib/x86_64-linux-gnu",
                "/usr/lib/aarch64-linux-gnu",  # For ARM64
                "/usr/lib/arm-linux-gnueabihf",  # For ARM32
                "/usr/lib",
                "/usr/local/lib",
                # Fedora/RHEL paths
                "/usr/lib64",
                "/usr/lib32",
                # Common additional paths
                "/usr/lib/espeak-ng",
                "/usr/local/lib/espeak-ng",
                "/opt/espeak-ng/lib",
            ]

            # Check common locations first
            for path in common_paths:
                for lib_name in lib_names:
                    lib_path = Path(path) / lib_name
                    if lib_path.exists():
                        return str(lib_path)

            # Search system library paths
            try:
                # Use ldconfig to find the library
                result = subprocess.run(
                    ["ldconfig", "-p"], capture_output=True, text=True, check=True
                )
                for line in result.stdout.splitlines():
                    if "libespeak-ng.so" in line:
                        # Extract path from ldconfig output
                        return line.split("=>")[-1].strip()
            except (subprocess.SubprocessError, FileNotFoundError):
                pass

        elif system == "Darwin":  # macOS
            common_paths = [
                Path("/opt/homebrew/lib/libespeak-ng.dylib"),
                Path("/usr/local/lib/libespeak-ng.dylib"),
                *list(
                    Path("/opt/homebrew/Cellar/espeak-ng").glob(
                        "*/lib/libespeak-ng.dylib"
                    )
                ),
                *list(
                    Path("/usr/local/Cellar/espeak-ng").glob("*/lib/libespeak-ng.dylib")
                ),
            ]

            for path in common_paths:
                if path.exists():
                    return str(path)

        elif system == "Windows":
            common_paths = [
                Path(os.environ.get("PROGRAMFILES", "C:\\Program Files"))
                / "eSpeak NG"
                / "libespeak-ng.dll",
                Path(os.environ.get("PROGRAMFILES(X86)", "C:\\Program Files (x86)"))
                / "eSpeak NG"
                / "libespeak-ng.dll",
                *[
                    Path(p) / "libespeak-ng.dll"
                    for p in os.environ.get("PATH", "").split(os.pathsep)
                ],
            ]

            for path in common_paths:
                if path.exists():
                    return str(path)

        return None

    @classmethod
    def configure_espeak(cls) -> Tuple[bool, str]:
        """
        Configure espeak-ng for use with the phonemizer.

        Returns:
            Tuple[bool, str]: (Success status, Status message)
        """
        # First check if espeak binary is available
        espeak_available, espeak_path = cls.find_espeak_binary()
        if not espeak_available:
            raise FileNotFoundError(
                "Could not find espeak-ng binary. Please install espeak-ng:\n"
                "Ubuntu/Debian: sudo apt-get install espeak-ng espeak-ng-data\n"
                "Fedora: sudo dnf install espeak-ng\n"
                "Arch: sudo pacman -S espeak-ng\n"
                "MacOS: brew install espeak-ng\n"
                "Windows: Download from https://github.com/espeak-ng/espeak-ng/releases"
            )

        # Find the library
        library_path = cls.find_library_path()
        if not library_path:
            # On Linux, we might not need to explicitly set the library path
            if platform.system() == "Linux":
                return True, f"Using system espeak-ng installation at: {espeak_path}"
            else:
                raise FileNotFoundError(
                    "Could not find espeak-ng library. Please ensure espeak-ng is properly installed."
                )

        # Try to set the library path
        try:
            EspeakWrapper.set_library(library_path)
            return True, f"Successfully configured espeak-ng library at: {library_path}"
        except Exception as e:
            if platform.system() == "Linux":
                # On Linux, try to continue without explicit library path
                return True, f"Using system espeak-ng installation at: {espeak_path}"
            else:
                raise RuntimeError(f"Failed to configure espeak-ng library: {str(e)}")


def setup_espeak():
    """
    Set up espeak-ng for use with the phonemizer.
    Raises appropriate exceptions if setup fails.
    """
    try:
        success, message = EspeakConfig.configure_espeak()
        print(message)
    except Exception as e:
        print(f"Error configuring espeak-ng: {str(e)}")
        raise


# Replace the original set_espeak_library function with this
set_espeak_library = setup_espeak
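A minimal sketch of the intended call pattern for the new espeak_util.py, assuming the phonemizer package pinned in requirements.txt; the actual call site (presumably inside the Space's tts module) is not shown in this truncated diff.

# Sketch only (not part of the commit): configure espeak-ng before any phonemizer call.
from espeak_util import set_espeak_library  # alias for setup_espeak()
from phonemizer import phonemize

set_espeak_library()  # raises FileNotFoundError if espeak-ng is not installed
print(phonemize("far over the misty mountains", language="en-us", backend="espeak"))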
requirements.txt
CHANGED
@@ -1,5 +1,13 @@
-audiofile
-audresample
-matplotlib
 torch
+nltk
+pydantic==2.10.6
+librosa
 transformers
+phonemizer
+audiofile
+matplotlib
+numpy<2.0.0
+gradio==5.27.0
+audresample
+
+
wav/af_ZA_google-nwu_0184.wav ADDED (binary, 92 kB)
wav/af_ZA_google-nwu_1919.wav ADDED (binary, 92 kB)
wav/af_ZA_google-nwu_2418.wav ADDED (binary, 92 kB)
wav/af_ZA_google-nwu_6590.wav ADDED (binary, 92 kB)
wav/af_ZA_google-nwu_7130.wav ADDED (binary, 92 kB)
wav/af_ZA_google-nwu_7214.wav ADDED (binary, 92 kB)
wav/af_ZA_google-nwu_8148.wav ADDED (binary, 92 kB)
wav/af_ZA_google-nwu_8924.wav ADDED (binary, 92 kB)
wav/af_ZA_google-nwu_8963.wav ADDED (binary, 92 kB)
wav/bn_multi_00737.wav ADDED (binary, 92 kB)
wav/bn_multi_00779.wav ADDED (binary, 92 kB)
wav/bn_multi_01232.wav ADDED (binary, 92 kB)
wav/bn_multi_01701.wav ADDED (binary, 92 kB)
wav/bn_multi_03042.wav ADDED (binary, 92 kB)
wav/bn_multi_0834.wav ADDED (binary, 92 kB)
wav/bn_multi_1010.wav ADDED (binary, 92 kB)
wav/bn_multi_3108.wav ADDED (binary, 92 kB)
wav/bn_multi_3713.wav ADDED (binary, 92 kB)
wav/bn_multi_3958.wav ADDED (binary, 92 kB)
wav/bn_multi_4046.wav ADDED (binary, 92 kB)
wav/bn_multi_4811.wav ADDED (binary, 92 kB)
wav/bn_multi_5958.wav ADDED (binary, 92 kB)
wav/bn_multi_9169.wav ADDED (binary, 92 kB)
wav/bn_multi_rm.wav ADDED (binary, 92 kB)
wav/de_DE_m-ailabs_angela_merkel.wav ADDED (binary, 90.7 kB)
wav/de_DE_m-ailabs_eva_k.wav ADDED (binary, 92.7 kB)
wav/de_DE_m-ailabs_karlsson.wav ADDED (binary, 92.7 kB)
wav/de_DE_m-ailabs_ramona_deininger.wav ADDED (binary, 91.2 kB)
wav/de_DE_m-ailabs_rebecca_braunert_plunkett.wav ADDED (binary, 91.2 kB)
wav/de_DE_thorsten-emotion_amused.wav ADDED (binary, 92 kB)
wav/el_GR_rapunzelina.wav ADDED (binary, 92 kB)
wav/en_UK_apope.wav ADDED (binary, 92 kB)
wav/en_US_cmu_arctic_aew.wav ADDED (binary, 92 kB)
wav/en_US_cmu_arctic_aup.wav ADDED (binary, 94.3 kB)
wav/en_US_cmu_arctic_awb.wav ADDED (binary, 92 kB)
wav/en_US_cmu_arctic_awbrms.wav ADDED (binary, 92.7 kB)
wav/en_US_cmu_arctic_axb.wav ADDED (binary, 92 kB)
wav/en_US_cmu_arctic_bdl.wav ADDED (binary, 94.8 kB)
wav/en_US_cmu_arctic_clb.wav ADDED (binary, 92 kB)
wav/en_US_cmu_arctic_eey.wav ADDED (binary, 95.3 kB)
wav/en_US_cmu_arctic_fem.wav ADDED (binary, 94.8 kB)
wav/en_US_cmu_arctic_gka.wav ADDED (binary, 95.3 kB)
wav/en_US_cmu_arctic_jmk.wav ADDED (binary, 93.2 kB)
wav/en_US_cmu_arctic_ksp.wav ADDED (binary, 92 kB)
wav/en_US_cmu_arctic_ljm.wav ADDED (binary, 94.3 kB)
wav/en_US_cmu_arctic_lnh.wav ADDED (binary, 94.8 kB)