Spaces:
Paused
Paused
TTS solved
Browse files
app.py
CHANGED
@@ -1,55 +1,64 @@
|
|
1 |
# app.py
|
2 |
import os
|
3 |
import gradio as gr
|
4 |
-
import logging
|
5 |
-
import tempfile
|
6 |
from gradio_pdf import PDF
|
7 |
-
|
8 |
from model import model_initialized
|
9 |
from pdf_processor import to_pdf, to_markdown
|
10 |
-
from
|
|
|
11 |
|
12 |
-
# Set up logging
|
13 |
-
logging.basicConfig(level=logging.INFO)
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
# Load header HTML content
|
16 |
-
|
17 |
-
header
|
|
|
|
|
|
|
|
|
|
|
18 |
|
19 |
-
# Define language options
|
20 |
-
latin_lang = ['af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
|
21 |
-
'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
|
22 |
-
'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
|
23 |
-
'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german']
|
24 |
arabic_lang = ['ar', 'fa', 'ug', 'ur']
|
25 |
-
cyrillic_lang = ['ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
|
26 |
-
|
27 |
-
devanagari_lang = ['hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
|
28 |
-
'sa', 'bgc']
|
29 |
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
|
30 |
-
|
31 |
all_lang = ['', 'auto'] + other_lang + latin_lang + arabic_lang + cyrillic_lang + devanagari_lang
|
32 |
|
33 |
-
# Define a function to convert a file to a PDF (if not already)
|
34 |
def file_to_pdf(file_obj):
|
35 |
if file_obj is not None:
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
37 |
return None
|
38 |
|
39 |
-
|
40 |
-
def read_text(text, language="en"):
|
41 |
"""
|
42 |
-
|
43 |
-
falling back to gTTS if an error occurs.
|
44 |
"""
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
|
|
|
|
|
|
|
|
51 |
|
52 |
-
# Set up the Gradio Blocks interface
|
53 |
with gr.Blocks() as demo:
|
54 |
gr.HTML(header)
|
55 |
with gr.Row():
|
@@ -86,11 +95,10 @@ with gr.Blocks() as demo:
|
|
86 |
md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
|
87 |
with gr.Tab("Markdown text"):
|
88 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
89 |
-
# TTS
|
90 |
-
|
91 |
-
|
92 |
|
93 |
-
# Define interactions
|
94 |
file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
|
95 |
|
96 |
convert_button.click(
|
@@ -99,7 +107,12 @@ with gr.Blocks() as demo:
|
|
99 |
outputs=[md_render, md_text, output_file, pdf_display]
|
100 |
)
|
101 |
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
103 |
|
104 |
clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
|
105 |
|
|
|
1 |
# app.py
|
2 |
import os
|
3 |
import gradio as gr
|
|
|
|
|
4 |
from gradio_pdf import PDF
|
5 |
+
import logging
|
6 |
from model import model_initialized
|
7 |
from pdf_processor import to_pdf, to_markdown
|
8 |
+
from config import config
|
9 |
+
from tts import text_to_speech # Import TTS module
|
10 |
|
11 |
+
# Set up logging with ANSI escape codes for colored output
|
12 |
+
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")
|
13 |
+
|
14 |
+
def log_info(message: str):
    """Log *message* at INFO level on the root logger, wrapped in green ANSI codes."""
    green, reset = "\033[92m", "\033[0m"  # bright green / reset attributes
    logging.info(f"{green}{message}{reset}")
|
16 |
+
|
17 |
+
def log_error(message: str):
    """Log *message* at ERROR level on the root logger, wrapped in red ANSI codes."""
    red, reset = "\033[91m", "\033[0m"  # bright red / reset attributes
    logging.error(f"{red}{message}{reset}")
|
19 |
|
20 |
# Load header HTML content
|
21 |
+
try:
    # Decode explicitly as UTF-8 so the markup is read the same way on every
    # platform instead of depending on the locale's default encoding.
    with open("header.html", "r", encoding="utf-8") as file:
        header = file.read()
    log_info("Header loaded successfully.")
except Exception as e:
    # Best-effort: the app still starts with a placeholder header if the file
    # is missing or unreadable.
    log_error(f"Failed to load header.html. Error: {e}")
    header = "<h1>Header not found</h1>"
|
28 |
|
29 |
+
# Define language options
|
30 |
+
# Language codes grouped by script; `all_lang` is the flattened list with two
# leading special entries ('' and 'auto').
latin_lang = [
    'af', 'az', 'bs', 'cs', 'cy', 'da', 'de', 'es', 'et', 'fr', 'ga', 'hr',
    'hu', 'id', 'is', 'it', 'ku', 'la', 'lt', 'lv', 'mi', 'ms', 'mt', 'nl',
    'no', 'oc', 'pi', 'pl', 'pt', 'ro', 'rs_latin', 'sk', 'sl', 'sq', 'sv',
    'sw', 'tl', 'tr', 'uz', 'vi', 'french', 'german',
]
arabic_lang = ['ar', 'fa', 'ug', 'ur']
cyrillic_lang = [
    'ru', 'rs_cyrillic', 'be', 'bg', 'uk', 'mn', 'abq', 'ady', 'kbd', 'ava',
    'dar', 'inh', 'che', 'lbe', 'lez', 'tab',
]
devanagari_lang = [
    'hi', 'mr', 'ne', 'bh', 'mai', 'ang', 'bho', 'mah', 'sck', 'new', 'gom',
    'sa', 'bgc',
]
other_lang = ['ch', 'en', 'korean', 'japan', 'chinese_cht', 'ta', 'te', 'ka']
all_lang = [
    '',
    'auto',
    *other_lang,
    *latin_lang,
    *arabic_lang,
    *cyrillic_lang,
    *devanagari_lang,
]
|
36 |
|
|
|
37 |
def file_to_pdf(file_obj):
    """
    Convert an uploaded file to PDF via `to_pdf` and return the resulting path.

    Returns None when no file was supplied or when the conversion raises; the
    failure is logged rather than propagated.
    """
    if file_obj is None:
        return None
    try:
        converted = to_pdf(file_obj.name)
        log_info("File converted to PDF successfully.")
    except Exception as exc:
        log_error(f"Error converting file to PDF: {exc}")
        return None
    return converted
|
46 |
|
47 |
+
def generate_audio(text: str) -> str:
    """
    Turn `text` into speech via `text_to_speech` and return the audio file path.

    An empty string is returned (and an error logged) when `text` is empty or
    when speech synthesis raises.
    """
    # NOTE(review): "" is handed to the gr.Audio output on failure; confirm the
    # component treats it as "no audio" rather than as a file path.
    if not text:
        log_error("No text provided for TTS.")
        return ""
    try:
        audio_path = text_to_speech(text)
        log_info("Audio generated successfully.")
    except Exception as exc:
        log_error(f"Audio generation failed: {exc}")
        return ""
    return audio_path
|
61 |
|
|
|
62 |
with gr.Blocks() as demo:
|
63 |
gr.HTML(header)
|
64 |
with gr.Row():
|
|
|
95 |
md_render = gr.Markdown(label="Markdown rendering", height=1100, show_copy_button=True, line_breaks=True)
|
96 |
with gr.Tab("Markdown text"):
|
97 |
md_text = gr.TextArea(lines=45, show_copy_button=True)
|
98 |
+
# Audio component for TTS playback
|
99 |
+
audio_output = gr.Audio(label="Read Aloud", type="filepath")
|
100 |
+
read_button = gr.Button("Read Aloud")
|
101 |
|
|
|
102 |
file_input.change(fn=file_to_pdf, inputs=file_input, outputs=pdf_display)
|
103 |
|
104 |
convert_button.click(
|
|
|
107 |
outputs=[md_render, md_text, output_file, pdf_display]
|
108 |
)
|
109 |
|
110 |
+
# When "Read Aloud" is clicked, generate audio from the markdown text
|
111 |
+
read_button.click(
|
112 |
+
fn=generate_audio,
|
113 |
+
inputs=md_text,
|
114 |
+
outputs=audio_output
|
115 |
+
)
|
116 |
|
117 |
clear_button.add([file_input, md_render, pdf_display, md_text, output_file, is_ocr])
|
118 |
|
tts.py
CHANGED
@@ -1,46 +1,49 @@
|
|
1 |
# tts.py
|
2 |
import os
|
3 |
-
import
|
4 |
-
import
|
5 |
-
|
|
|
6 |
|
7 |
-
|
|
|
|
|
|
|
8 |
"""
|
9 |
-
Convert text to speech using
|
10 |
-
|
11 |
-
|
|
|
|
|
12 |
"""
|
13 |
-
|
14 |
-
|
15 |
-
if not api_key:
|
16 |
-
raise ValueError("API key for OpenAI TTS not found in environment variable 'api_key_oai'")
|
17 |
-
openai.api_key = api_key
|
18 |
-
|
19 |
try:
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
language=language
|
25 |
)
|
26 |
-
|
|
|
|
|
27 |
except Exception as e:
|
28 |
-
|
29 |
-
|
30 |
-
# Download and play the audio
|
31 |
-
audio_data = requests.get(audio_url).content
|
32 |
-
with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
|
33 |
-
tmp_file.write(audio_data)
|
34 |
-
tmp_file_path = tmp_file.name
|
35 |
-
playsound(tmp_file_path)
|
36 |
|
37 |
-
def text_to_speech_gtts(text
|
38 |
"""
|
39 |
-
|
|
|
|
|
|
|
40 |
"""
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
tts.save(
|
45 |
-
|
46 |
-
|
|
|
|
|
|
|
|
1 |
# tts.py
|
2 |
import os
|
3 |
+
from pathlib import Path
|
4 |
+
import openai
|
5 |
+
import logging
|
6 |
+
from gtts import gTTS # Ensure gTTS is installed (pip install gTTS)
|
7 |
|
8 |
+
# Set OpenAI API key from the environment variable
|
9 |
+
openai.api_key = os.getenv("api_key_oai")
|
10 |
+
|
11 |
+
def text_to_speech(text: str, voice: str = "coral", model: str = "tts-1") -> str:
    """
    Convert input text to speech using OpenAI's TTS API.
    Falls back to gTTS if the OpenAI API fails.

    Args:
        text: The text to synthesise.
        voice: OpenAI voice name passed to the speech endpoint.
        model: OpenAI TTS model name passed to the speech endpoint.

    Returns:
        The file path to the generated audio file.

    Raises:
        Exception: whatever `text_to_speech_gtts` raises if the fallback
        also fails.
    """
    # File name derived from a hash of the text. Note that str hashes are
    # salted per process, so the name is not stable across restarts.
    # The extension matches the MP3 payload requested below.
    output_file = Path(__file__).parent / f"speech_{abs(hash(text))}.mp3"
    try:
        # The speech endpoint lives under the lowercase `audio` namespace;
        # `openai.Audio.speech` does not exist and would always raise,
        # silently forcing the gTTS fallback.
        response = openai.audio.speech.create(
            model=model,
            voice=voice,
            input=text,
            response_format="mp3",
        )
        response.stream_to_file(str(output_file))
        logging.info("OpenAI TTS succeeded.")
        return str(output_file)
    except Exception as e:
        logging.error("OpenAI TTS failed, falling back to gTTS. Error: %s", e)
        return text_to_speech_gtts(text)
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
+
def text_to_speech_gtts(text: str, lang: str = "en") -> str:
    """
    Convert input text to speech using gTTS.

    Args:
        text: The text to synthesise.
        lang: Language code handed to gTTS; defaults to English, which was
            previously hard-coded.

    Returns:
        The file path to the generated audio file.

    Raises:
        Exception: any error raised by gTTS is logged and re-raised.
    """
    # File name derived from a hash of the text (salted per process, so not
    # stable across restarts).
    output_file = Path(__file__).parent / f"speech_{abs(hash(text))}.mp3"
    try:
        tts = gTTS(text=text, lang=lang)
        tts.save(str(output_file))
        logging.info("gTTS succeeded.")
        return str(output_file)
    except Exception as e:
        logging.error("gTTS failed. Error: %s", e)
        raise
|