Echo9k committed on
Commit
6b438f3
·
1 Parent(s): d0abfe5

TTS solved

Browse files
Files changed (2) hide show
  1. app.py +49 -36
  2. tts.py +38 -35
app.py CHANGED
@@ -1,55 +1,64 @@
1
  # app.py
2
  import os
3
  import gradio as gr
4
- import logging
5
- import tempfile
6
  from gradio_pdf import PDF
7
- from config import config
8
  from model import model_initialized
9
  from pdf_processor import to_pdf, to_markdown
10
- from tts import text_to_speech_openai, text_to_speech_gtts
 
11
 
12
- # Set up logging
13
- logging.basicConfig(level=logging.INFO)
 
 
 
 
 
 
14
 
15
  # Load header HTML content
16
- with open("header.html", "r", encoding="utf-8") as file:
17
- header = file.read()
 
 
 
 
 
18
 
19
- # Define language options (could also be moved to config.yaml)
20
- latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
21
- 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
22
- 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
23
- 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
24
  arabic_lang = ['ar', 'fa', 'ug', 'ur']
25
- cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
26
- 'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
27
- devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
28
- 'sa', 'bgc']
29
  other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
30
-
31
  all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
32
 
33
- # Define a function to convert a file to a PDF (if not already)
34
  def file_to_pdf(file_obj):
35
  if file_obj is not None:
36
- return to_pdf(file_obj.name)
 
 
 
 
 
37
  return None
38
 
39
- # Define a function to handle TTS using OpenAI (with fallback)
40
- def read_text(text, language="en"):
41
  """
42
- Attempts to synthesize speech from text using OpenAI TTS,
43
- falling back to gTTS if an error occurs.
44
  """
45
- try:
46
- text_to_speech_openai(text, language)
47
- except Exception as e:
48
- logging.error("OpenAI TTS failed: %s. Falling back to gTTS.", e)
49
- text_to_speech_gtts(text, language)
50
- return "Audio played successfully"
 
 
 
 
51
 
52
- # Set up the Gradio Blocks interface
53
  with gr.Blocks() as demo:
54
  gr.HTML(header)
55
  with gr.Row():
@@ -86,11 +95,10 @@ with gr.Blocks() as demo:
86
  md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
87
  with gr.Tab("Markdown text"):
88
  md_text = gr.TextArea(lines=45, show_copy_button=True)
89
- # TTS components
90
- read_button = gr.Button("Read Out Loud")
91
- read_status = gr.Textbox(label="TTS Status")
92
 
93
- # Define interactions
94
  file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
95
 
96
  convert_button.click(
@@ -99,7 +107,12 @@ with gr.Blocks() as demo:
99
  outputs=[md_render, md_text, output_file, pdf_display]
100
  )
101
 
102
- read_button.click(fn=read_text, inputs=[md_text, language], outputs=read_status)
 
 
 
 
 
103
 
104
  clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
105
 
 
1
  # app.py
2
  import os
3
  import gradio as gr
 
 
4
  from gradio_pdf import PDF
5
+ import logging
6
  from model import model_initialized
7
  from pdf_processor import to_pdf, to_markdown
8
+ from config import config
9
+ from tts import text_to_speech # Import TTS module
10
 
11
+ # Set up logging with ANSI escape codes for colored output
12
+ logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
13
+
14
+ def log_info(message: str):
15
+ logging.info(f"\033[92m{message}\033[0m") # Green for info
16
+
17
+ def log_error(message: str):
18
+ logging.error(f"\033[91m{message}\033[0m") # Red for errors
19
 
20
  # Load header HTML content
21
+ try:
22
+ with open("header.html", "r") as file:
23
+ header = file.read()
24
+ log_info("Header loaded successfully.")
25
+ except Exception as e:
26
+ log_error(f"Failed to load header.html. Error: {e}")
27
+ header = "<h1>Header not found</h1>"
28
 
29
+ # Define language options
30
+ latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr', 'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl', 'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv', 'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
 
 
 
31
  arabic_lang = ['ar', 'fa', 'ug', 'ur']
32
+ cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava', 'dar', 'inh', 'che', 'lbe', 'lez', 'tab']
33
+ devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom', 'sa', 'bgc']
 
 
34
  other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
 
35
  all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
36
 
 
37
  def file_to_pdf(file_obj):
38
  if file_obj is not None:
39
+ try:
40
+ pdf_path = to_pdf(file_obj.name)
41
+ log_info("File converted to PDF successfully.")
42
+ return pdf_path
43
+ except Exception as e:
44
+ log_error(f"Error converting file to PDF: {e}")
45
  return None
46
 
47
+ def generate_audio(text: str) -> str:
 
48
  """
49
+ Converts the provided text to speech and returns the path of the audio file.
 
50
  """
51
+ if text:
52
+ try:
53
+ audio_file = text_to_speech(text)
54
+ log_info("Audio generated successfully.")
55
+ return audio_file
56
+ except Exception as e:
57
+ log_error(f"Audio generation failed: {e}")
58
+ return ""
59
+ log_error("No text provided for TTS.")
60
+ return ""
61
 
 
62
  with gr.Blocks() as demo:
63
  gr.HTML(header)
64
  with gr.Row():
 
95
  md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
96
  with gr.Tab("Markdown text"):
97
  md_text = gr.TextArea(lines=45, show_copy_button=True)
98
+ # Audio component for TTS playback
99
+ audio_output = gr.Audio(label="Read Aloud", type="filepath")
100
+ read_button = gr.Button("Read Aloud")
101
 
 
102
  file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
103
 
104
  convert_button.click(
 
107
  outputs=[md_render, md_text, output_file, pdf_display]
108
  )
109
 
110
+ # When "Read Aloud" is clicked, generate audio from the markdown text
111
+ read_button.click(
112
+ fn=generate_audio,
113
+ inputs=md_text,
114
+ outputs=audio_output
115
+ )
116
 
117
  clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
118
 
tts.py CHANGED
@@ -1,46 +1,49 @@
1
  # tts.py
2
  import os
3
- import tempfile
4
- import requests
5
- from playsound import playsound
 
6
 
7
- def text_to_speech_openai(text, language="en"):
 
 
 
8
  """
9
- Convert text to speech using a hypothetical OpenAI TTS API.
10
- Note: OpenAI Whisper is for speech recognition.
11
- Replace the endpoint and parameters with actual API details when available.
 
 
12
  """
13
- import openai
14
- api_key = os.getenv("api_key_oai")
15
- if not api_key:
16
- raise ValueError("API key for OpenAI TTS not found in environment variable 'api_key_oai'")
17
- openai.api_key = api_key
18
-
19
  try:
20
- # Hypothetical API call -- adjust the engine name and parameters as per actual API documentation.
21
- response = openai.Audio.synthesize(
22
- engine="tts", # Hypothetical engine name for TTS
23
- text=text,
24
- language=language
25
  )
26
- audio_url = response["audio_url"]
 
 
27
  except Exception as e:
28
- raise RuntimeError(f"OpenAI TTS synthesis failed: {e}")
29
-
30
- # Download and play the audio
31
- audio_data = requests.get(audio_url).content
32
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
33
- tmp_file.write(audio_data)
34
- tmp_file_path = tmp_file.name
35
- playsound(tmp_file_path)
36
 
37
- def text_to_speech_gtts(text, language="en"):
38
  """
39
- Fallback text-to-speech using the gTTS library.
 
 
 
40
  """
41
- from gtts import gTTS
42
- tts = gTTS(text=text, lang=language)
43
- with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
44
- tts.save(tmp_file.name)
45
- tmp_file_path = tmp_file.name
46
- playsound(tmp_file_path)
 
 
 
 
1
  # tts.py
2
  import os
3
+ from pathlib import Path
4
+ import openai
5
+ import logging
6
+ from gtts import gTTS # Ensure gTTS is installed (pip install gTTS)
7
 
8
+ # Set OpenAI API key from the environment variable
9
+ openai.api_key = os.getenv("api_key_oai")
10
+
11
+ def text_to_speech(text: str, voice: str = "coral", model: str = "tts-1") -> str:
12
  """
13
+ Convert input text to speech using OpenAI's TTS API.
14
+ Falls back to gTTS if the OpenAI API fails.
15
+
16
+ Returns:
17
+ The file path to the generated audio file.
18
  """
19
+ # Generate a unique filename using a hash of the text
20
+ output_file = Path(__file__).parent / f"speech_{abs(hash(text))}.pus"
 
 
 
 
21
  try:
22
+ response = openai.Audio.speech.create(
23
+ model=model,
24
+ voice=voice,
25
+ input=text,
 
26
  )
27
+ response.stream_to_file(str(output_file))
28
+ logging.info("OpenAI TTS succeeded.")
29
+ return str(output_file)
30
  except Exception as e:
31
+ logging.error("OpenAI TTS failed, falling back to gTTS. Error: %s", e)
32
+ return text_to_speech_gtts(text)
 
 
 
 
 
 
33
 
34
+ def text_to_speech_gtts(text: str) -> str:
35
  """
36
+ Convert input text to speech using gTTS.
37
+
38
+ Returns:
39
+ The file path to the generated audio file.
40
  """
41
+ output_file = Path(__file__).parent / f"speech_{abs(hash(text))}.mp3"
42
+ try:
43
+ tts = gTTS(text=text, lang='en')
44
+ tts.save(str(output_file))
45
+ logging.info("gTTS succeeded.")
46
+ return str(output_file)
47
+ except Exception as e:
48
+ logging.error("gTTS failed. Error: %s", e)
49
+ raise