PDF_reader

Paused

App Files Files Community

Echo9k committed 25 days ago

Commit

6b438f3

1 Parent(s): d0abfe5

TTS solved

Browse files

Files changed (2) hide show

app.py +49 -36
tts.py +38 -35

app.py CHANGED Viewed

@@ -1,55 +1,64 @@
 # app.py
 import os
 import gradio as gr
-import logging
-import tempfile
 from gradio_pdf import PDF
-from config import config
 from model import model_initialized
 from pdf_processor import to_pdf, to_markdown
-from tts import text_to_speech_openai, text_to_speech_gtts
-# Set up logging
-logging.basicConfig(level=logging.INFO)
 # Load header HTML content
-with open("header.html", "r", encoding="utf-8") as file:
-    header = file.read()
-# Define language options (could also be moved to config.yaml)
-latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
-              'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
-              'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
-              'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
 arabic_lang = ['ar', 'fa', 'ug', 'ur']
-cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
-                 'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
-devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
-                   'sa', 'bgc']
 other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
 all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
-# Define a function to convert a file to a PDF (if not already)
 def file_to_pdf(file_obj):
     if file_obj is not None:
-        return to_pdf(file_obj.name)
     return None
-# Define a function to handle TTS using OpenAI (with fallback)
-def read_text(text, language="en"):
     """
-    Attempts to synthesize speech from text using OpenAI TTS,
-    falling back to gTTS if an error occurs.
     """
-    try:
-        text_to_speech_openai(text, language)
-    except Exception as e:
-        logging.error("OpenAI TTS failed: %s. Falling back to gTTS.", e)
-        text_to_speech_gtts(text, language)
-    return "Audio played successfully"
-# Set up the Gradio Blocks interface
 with gr.Blocks() as demo:
     gr.HTML(header)
     with gr.Row():
@@ -86,11 +95,10 @@ with gr.Blocks() as demo:
                     md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
                 with gr.Tab("Markdown text"):
                     md_text = gr.TextArea(lines=45, show_copy_button=True)
-            # TTS components
-            read_button = gr.Button("Read Out Loud")
-            read_status = gr.Textbox(label="TTS Status")
-    # Define interactions
     file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
     convert_button.click(
@@ -99,7 +107,12 @@ with gr.Blocks() as demo:
         outputs=[md_render, md_text, output_file, pdf_display]
     )
-    read_button.click(fn=read_text, inputs=[md_text, language], outputs=read_status)
     clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])

 # app.py
 import os
 import gradio as gr
 from gradio_pdf import PDF
+import logging
 from model import model_initialized
 from pdf_processor import to_pdf, to_markdown
+from config import config
+from tts import text_to_speech  # Import TTS module
+# Set up logging with ANSI escape codes for colored output
+logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
def log_info(message: str):
    """Log ``message`` at INFO level wrapped in ANSI green escape codes."""
    colored = f"\033[92m{message}\033[0m"  # Green for info
    logging.info(colored)
def log_error(message: str):
    """Log ``message`` at ERROR level wrapped in ANSI red escape codes."""
    colored = f"\033[91m{message}\033[0m"  # Red for errors
    logging.error(colored)
 # Load header HTML content
# Read the page header shown at the top of the UI. Decode it explicitly as
# UTF-8 instead of relying on the platform's default encoding, which varies
# between hosts and can garble or reject non-ASCII markup.
try:
    with open("header.html", "r", encoding="utf-8") as file:
        header = file.read()
except (OSError, UnicodeDecodeError) as e:
    # A missing or unreadable header must not stop the app from starting;
    # fall back to a placeholder banner instead.
    log_error(f"Failed to load header.html. Error: {e}")
    header = "<h1>Header not found</h1>"
else:
    # Only reached when the file was opened and decoded without error.
    log_info("Header loaded successfully.")
+# Define language options
# Language codes offered in the UI, grouped by script family.
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']

# Full choice list: the empty and 'auto' entries come first, then each group.
all_lang = [
    '',
    'auto',
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]
 def file_to_pdf(file_obj):
     """Convert the given file object to a PDF and return the resulting path.

     ``file_obj`` is expected to expose a ``name`` attribute, which is handed
     to ``to_pdf``. Returns ``None`` when ``file_obj`` is ``None`` or when the
     conversion raises; failures are logged rather than propagated.
     """
     if file_obj is None:
         return None
     try:
         converted = to_pdf(file_obj.name)
         log_info("File converted to PDF successfully.")
     except Exception as e:
         log_error(f"Error converting file to PDF: {e}")
         return None
     return converted
def generate_audio(text: str) -> str:
    """
    Turn ``text`` into speech via ``text_to_speech`` and return the audio path.

    Returns an empty string when ``text`` is empty or when synthesis raises;
    both cases are logged as errors instead of being propagated.
    """
    if not text:
        log_error("No text provided for TTS.")
        return ""
    try:
        speech_path = text_to_speech(text)
        log_info("Audio generated successfully.")
    except Exception as e:
        log_error(f"Audio generation failed: {e}")
        return ""
    return speech_path
 with gr.Blocks() as demo:
     gr.HTML(header)
     with gr.Row():
                     md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
                 with gr.Tab("Markdown text"):
                     md_text = gr.TextArea(lines=45, show_copy_button=True)
+            # Audio component for TTS playback
+            audio_output = gr.Audio(label="Read Aloud", type="filepath")
+            read_button = gr.Button("Read Aloud")
     file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
     convert_button.click(
         outputs=[md_render, md_text, output_file, pdf_display]
     )
+    # When "Read Aloud" is clicked, generate audio from the markdown text
+    read_button.click(
+        fn=generate_audio,
+        inputs=md_text,
+        outputs=audio_output
+    )
     clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])

tts.py CHANGED Viewed

@@ -1,46 +1,49 @@
 # tts.py
 import os
-import tempfile
-import requests
-from playsound import playsound
-def text_to_speech_openai(text, language="en"):
     """
-    Convert text to speech using a hypothetical OpenAI TTS API.
-    Note: OpenAI Whisper is for speech recognition.
-    Replace the endpoint and parameters with actual API details when available.
     """
-    import openai
-    api_key = os.getenv("api_key_oai")
-    if not api_key:
-        raise ValueError("API key for OpenAI TTS not found in environment variable 'api_key_oai'")
-    openai.api_key = api_key
     try:
-        # Hypothetical API call -- adjust the engine name and parameters as per actual API documentation.
-        response = openai.Audio.synthesize(
-            engine="tts",      # Hypothetical engine name for TTS
-            text=text,
-            language=language
         )
-        audio_url = response["audio_url"]
     except Exception as e:
-        raise RuntimeError(f"OpenAI TTS synthesis failed: {e}")
-    # Download and play the audio
-    audio_data = requests.get(audio_url).content
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        tmp_file.write(audio_data)
-        tmp_file_path = tmp_file.name
-    playsound(tmp_file_path)
-def text_to_speech_gtts(text, language="en"):
     """
-    Fallback text-to-speech using the gTTS library.
     """
-    from gtts import gTTS
-    tts = gTTS(text=text, lang=language)
-    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
-        tts.save(tmp_file.name)
-        tmp_file_path = tmp_file.name
-    playsound(tmp_file_path)

 # tts.py
 import os
+from pathlib import Path
+import openai
+import logging
+from gtts import gTTS  # Ensure gTTS is installed (pip install gTTS)
+# Set OpenAI API key from the environment variable
+openai.api_key = os.getenv("api_key_oai")
def text_to_speech(text: str, voice: str = "coral", model: str = "tts-1") -> str:
    """Convert ``text`` to speech with OpenAI's TTS API, falling back to gTTS.

    Args:
        text: The text to synthesize.
        voice: Voice name passed to the OpenAI speech endpoint.
        model: TTS model name passed to the OpenAI speech endpoint.

    Returns:
        The file path of the generated audio file, as a string.

    Raises:
        Exception: Whatever ``text_to_speech_gtts`` raises when the fallback
            fails as well.
    """
    import hashlib

    # The builtin hash() of a str is salted per process, so it cannot yield a
    # stable file name; a content digest maps the same text to the same file
    # on every run.
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]
    # No response_format is requested, so the endpoint returns its default
    # MP3 audio; the extension has to match for players to recognise it.
    output_file = Path(__file__).parent / f"speech_{digest}.mp3"
    try:
        # The speech endpoint lives at lowercase ``openai.audio.speech``;
        # ``openai.Audio.speech`` does not exist and always raised, which
        # silently forced every call onto the gTTS fallback.
        response = openai.audio.speech.create(
            model=model,
            voice=voice,
            input=text,
        )
        response.stream_to_file(str(output_file))
    except Exception as e:
        logging.error("OpenAI TTS failed, falling back to gTTS. Error: %s", e)
        return text_to_speech_gtts(text)
    logging.info("OpenAI TTS succeeded.")
    return str(output_file)
def text_to_speech_gtts(text: str, lang: str = "en") -> str:
    """Convert ``text`` to speech using gTTS and save it as an MP3 file.

    Args:
        text: The text to synthesize.
        lang: Language code handed to gTTS. Defaults to ``"en"``, which is
            what this function previously hard-coded.

    Returns:
        The file path of the generated audio file, as a string.

    Raises:
        Exception: Any error raised by gTTS while synthesizing or saving is
            logged and then re-raised unchanged.
    """
    import hashlib

    # The builtin hash() of a str is salted per process, so it cannot yield a
    # stable file name. Hash the language together with the text so the same
    # text in two languages does not share one file.
    digest = hashlib.sha256(f"{lang}\0{text}".encode("utf-8")).hexdigest()[:16]
    output_file = Path(__file__).parent / f"speech_{digest}.mp3"
    try:
        tts = gTTS(text=text, lang=lang)
        tts.save(str(output_file))
    except Exception as e:
        logging.error("gTTS failed. Error: %s", e)
        raise
    logging.info("gTTS succeeded.")
    return str(output_file)