Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -95,10 +95,16 @@ app.layout = dbc.Container([
|
|
| 95 |
dcc.Store(id='generated-audio'),
|
| 96 |
])
|
| 97 |
|
| 98 |
-
#
|
| 99 |
@callback(
|
| 100 |
Output("script-output", "value"),
|
|
|
|
|
|
|
|
|
|
| 101 |
Input("generate-script-btn", "n_clicks"),
|
|
|
|
|
|
|
|
|
|
| 102 |
State("host1-name", "value"),
|
| 103 |
State("host2-name", "value"),
|
| 104 |
State("podcast-name", "value"),
|
|
@@ -107,92 +113,6 @@ app.layout = dbc.Container([
|
|
| 107 |
State("upload-file", "contents"),
|
| 108 |
State("duration", "value"),
|
| 109 |
State("num-hosts", "value"),
|
| 110 |
-
prevent_initial_call=True
|
| 111 |
-
)
|
| 112 |
-
def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts):
    """Generate a podcast script with Gemini from the prompt and/or an uploaded file.

    Args:
        n_clicks: Click count of the generate button; ``None`` means "never clicked".
        host1_name, host2_name: Host names interpolated into the prompt.
        podcast_name, podcast_topic: Interpolated into the intro instructions.
        prompt: Free-text source material; may be empty or ``None``.
        uploaded_file: ``"<header>,<base64 payload>"`` string, or a falsy value
            when nothing was uploaded.
        duration: Target length in minutes, interpolated verbatim.
        num_hosts: Number of hosts; anything accepted by ``int()``. Falsy -> 1.

    Returns:
        The generated script restricted to letters, digits, whitespace and
        ``. , ? ! < >``; ``""`` when ``n_clicks`` is ``None``; or an
        ``"Error: ..."`` string when any step fails.
    """
    if n_clicks is None:
        return ""

    try:
        # Get the Gemini API key from Hugging Face secrets
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API key not found in environment variables")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

        combined_content = prompt or ""

        if uploaded_file:
            # maxsplit=1: only the first comma separates header from payload,
            # so a comma inside the header cannot break the unpacking.
            _, content_string = uploaded_file.split(',', 1)
            decoded = base64.b64decode(content_string)

            # Detect the file type from the content itself.
            if decoded[:4] == b'%PDF':
                pdf_reader = PyPDF2.PdfReader(io.BytesIO(decoded))
                # extract_text() may yield None for pages without text.
                file_content = "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
            else:
                # Try as a text file first, then fall back to docx.
                try:
                    file_content = decoded.decode('utf-8')
                except UnicodeDecodeError:
                    try:
                        doc = Document(io.BytesIO(decoded))
                        file_content = "\n".join(para.text for para in doc.paragraphs)
                    except Exception as exc:
                        # Not a bare except: let KeyboardInterrupt/SystemExit through.
                        raise ValueError("Unsupported file type or corrupted file") from exc

            combined_content += "\n" + file_content if combined_content else file_content

        num_hosts = int(num_hosts) if num_hosts else 1

        prompt_template = f"""
        Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
        {combined_content}

        Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
        Use speech fillers like um, ah. Vary emotional tone.

        Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
        Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
        If the number of hosts is 1 then each paragraph will be no more than 3 sentences each
        Only provide the dialog for text to speech.
        Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
        -Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
        Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
        Do not include speaker labels like "jane:" or "john:" before dialogue.
        The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
        The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
        Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...". Its just dialog.
        Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
        Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
        Maintain natural conversation flow and speech patterns within each monologue.
        Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
        Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
        Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
        Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
        {'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
        Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
        Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
        """

        response = model.generate_content(prompt_template)
        # Keep only characters the downstream text-to-speech step is fed:
        # alphanumerics, whitespace, basic punctuation and <tag> brackets.
        return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
    except Exception as e:
        logger.error(f"Error generating podcast script: {str(e)}")
        return f"Error: {str(e)}"
|
| 192 |
-
|
| 193 |
-
@callback(
|
| 194 |
-
Output("audio-output", "children"),
|
| 195 |
-
Input("generate-audio-btn", "n_clicks"),
|
| 196 |
State("script-output", "value"),
|
| 197 |
State("voice1", "value"),
|
| 198 |
State("voice2", "value"),
|
|
@@ -200,80 +120,153 @@ def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podc
|
|
| 200 |
State("top-p", "value"),
|
| 201 |
State("repetition-penalty", "value"),
|
| 202 |
State("max-new-tokens", "value"),
|
| 203 |
-
State("
|
| 204 |
prevent_initial_call=True
|
| 205 |
)
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 218 |
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
-
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
attention_mask=attention_mask,
|
| 227 |
-
do_sample=True,
|
| 228 |
-
temperature=temperature,
|
| 229 |
-
top_p=top_p,
|
| 230 |
-
repetition_penalty=repetition_penalty,
|
| 231 |
-
max_new_tokens=max_new_tokens,
|
| 232 |
-
num_return_sequences=1,
|
| 233 |
-
eos_token_id=128258,
|
| 234 |
-
)
|
| 235 |
|
| 236 |
-
|
| 237 |
-
|
| 238 |
|
| 239 |
-
|
| 240 |
-
if
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
|
|
|
| 251 |
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
Input("advanced-settings-toggle", "n_clicks"),
|
| 260 |
-
State("advanced-settings", "is_open"),
|
| 261 |
-
)
|
| 262 |
-
def toggle_advanced_settings(n_clicks, is_open):
    """Return the flipped ``is_open`` state once the toggle has been clicked.

    A falsy ``n_clicks`` (``None`` or ``0``) leaves ``is_open`` unchanged.
    """
    return (not is_open) if n_clicks else is_open
|
| 266 |
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
Input("clear-btn", "n_clicks"),
|
| 272 |
-
)
|
| 273 |
-
def clear_outputs(n_clicks):
    """Reset the three bound outputs when the clear button has been clicked.

    Returns two empty strings and a placeholder ``html.Div`` after a click;
    otherwise returns ``dash.no_update`` for every output.
    """
    if not n_clicks:
        # No click registered: leave every output as it is.
        return dash.no_update, dash.no_update, dash.no_update
    return "", "", html.Div("No audio generated yet.")
|
| 277 |
|
| 278 |
# Run the app
|
| 279 |
if __name__ == '__main__':
|
|
|
|
| 95 |
dcc.Store(id='generated-audio'),
|
| 96 |
])
|
| 97 |
|
| 98 |
+
# Combined callback
|
| 99 |
@callback(
|
| 100 |
Output("script-output", "value"),
|
| 101 |
+
Output("audio-output", "children"),
|
| 102 |
+
Output("advanced-settings", "is_open"),
|
| 103 |
+
Output("prompt", "value"),
|
| 104 |
Input("generate-script-btn", "n_clicks"),
|
| 105 |
+
Input("generate-audio-btn", "n_clicks"),
|
| 106 |
+
Input("advanced-settings-toggle", "n_clicks"),
|
| 107 |
+
Input("clear-btn", "n_clicks"),
|
| 108 |
State("host1-name", "value"),
|
| 109 |
State("host2-name", "value"),
|
| 110 |
State("podcast-name", "value"),
|
|
|
|
| 113 |
State("upload-file", "contents"),
|
| 114 |
State("duration", "value"),
|
| 115 |
State("num-hosts", "value"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
State("script-output", "value"),
|
| 117 |
State("voice1", "value"),
|
| 118 |
State("voice2", "value"),
|
|
|
|
| 120 |
State("top-p", "value"),
|
| 121 |
State("repetition-penalty", "value"),
|
| 122 |
State("max-new-tokens", "value"),
|
| 123 |
+
State("advanced-settings", "is_open"),
|
| 124 |
prevent_initial_call=True
|
| 125 |
)
|
| 126 |
+
def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
                      host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
                      script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
    """Single Dash callback serving the script, audio, settings-toggle and clear buttons.

    The component that fired is read from ``dash.callback_context`` and only
    the outputs belonging to that action are changed; the rest receive
    ``dash.no_update``.

    Returns:
        A 4-tuple, in decorator order: script text, audio-output children,
        advanced-settings ``is_open`` flag, prompt value.
    """
    unchanged = dash.no_update

    ctx = dash.callback_context
    if not ctx.triggered:
        return unchanged, unchanged, unchanged, unchanged

    trigger_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if trigger_id == "generate-script-btn":
        script_text = _generate_script_text(
            host1_name, host2_name, podcast_name, podcast_topic,
            prompt, uploaded_file, duration, num_hosts,
        )
        return script_text, unchanged, unchanged, unchanged

    if trigger_id == "generate-audio-btn":
        audio_children = _generate_audio_children(
            script_output, voice1, voice2, num_hosts,
            temperature, top_p, repetition_penalty, max_new_tokens,
        )
        return unchanged, audio_children, unchanged, unchanged

    if trigger_id == "advanced-settings-toggle":
        return unchanged, unchanged, not is_advanced_open, unchanged

    if trigger_id == "clear-btn":
        return "", html.Div("No audio generated yet."), unchanged, ""

    return unchanged, unchanged, unchanged, unchanged


def _read_uploaded_text(uploaded_file):
    """Decode a ``"<header>,<base64 payload>"`` upload string and return its text.

    PDF is detected by the ``%PDF`` magic bytes; otherwise the payload is
    tried as UTF-8 text and then as a docx document.

    Raises:
        ValueError: if the payload is none of the supported formats.
    """
    # maxsplit=1: only the first comma separates header from payload.
    _, content_string = uploaded_file.split(',', 1)
    decoded = base64.b64decode(content_string)

    if decoded[:4] == b'%PDF':
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(decoded))
        # extract_text() may yield None for pages without text.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    try:
        return decoded.decode('utf-8')
    except UnicodeDecodeError:
        pass  # not plain text; fall through to the docx attempt

    try:
        doc = Document(io.BytesIO(decoded))
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception as exc:
        # Not a bare except: let KeyboardInterrupt/SystemExit through.
        raise ValueError("Unsupported file type or corrupted file") from exc


def _build_script_prompt(num_hosts, combined_content, duration, host1_name, host2_name, podcast_name, podcast_topic):
    """Return the instruction prompt sent to Gemini; ``num_hosts`` must be an int."""
    return f"""
    Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
    {combined_content}

    Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
    Use speech fillers like um, ah. Vary emotional tone.

    Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
    Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
    If the number of hosts is 1 then each paragraph will be no more than 3 sentences each
    Only provide the dialog for text to speech.
    Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
    -Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
    Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
    Do not include speaker labels like "jane:" or "john:" before dialogue.
    The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
    The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
    Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...". Its just dialog.
    Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
    Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
    Maintain natural conversation flow and speech patterns within each monologue.
    Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
    Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
    Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
    Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
    {'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
    Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
    Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
    """


def _generate_script_text(host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts):
    """Ask Gemini for a script; return the cleaned text or an ``"Error: ..."`` string."""
    try:
        # Get the Gemini API key from Hugging Face secrets
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API key not found in environment variables")

        genai.configure(api_key=api_key)
        # Deliberately NOT named `model`: the audio path calls a module-level
        # `model`, and rebinding that name locally made it unreachable.
        gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

        combined_content = prompt or ""
        if uploaded_file:
            file_content = _read_uploaded_text(uploaded_file)
            combined_content = combined_content + "\n" + file_content if combined_content else file_content

        host_count = int(num_hosts) if num_hosts else 1
        prompt_template = _build_script_prompt(
            host_count, combined_content, duration,
            host1_name, host2_name, podcast_name, podcast_topic,
        )

        response = gemini_model.generate_content(prompt_template)
        # Keep only alphanumerics, whitespace, basic punctuation and <tag> brackets.
        return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
    except Exception as e:
        logger.error(f"Error generating podcast script: {str(e)}")
        return f"Error: {str(e)}"


def _generate_audio_children(script_output, voice1, voice2, num_hosts, temperature, top_p, repetition_penalty, max_new_tokens):
    """Synthesize the script paragraph by paragraph and return a Dash component.

    Returns an ``html.Audio`` on success, or an ``html.Div`` carrying a
    placeholder or error message otherwise.
    """
    # The script box is None until something has been written into it.
    if not script_output or not script_output.strip():
        return html.Div("No audio generated yet.")

    try:
        # The dropdown value may arrive as int or str; accept both.
        single_host = str(num_hosts) == "1"
        audio_samples = []

        # Paragraphs are separated by a blank line. Blank paragraphs are
        # skipped but still advance the index used for voice alternation.
        for i, paragraph in enumerate(script_output.split('\n\n')):
            if not paragraph.strip():
                continue

            voice = voice1 if single_host or i % 2 == 0 else voice2

            input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)

            with torch.no_grad():
                # `model` resolves to the module-level speech model here.
                generated_ids = model.generate(
                    input_ids,
                    attention_mask=attention_mask,
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    repetition_penalty=repetition_penalty,
                    max_new_tokens=max_new_tokens,
                    num_return_sequences=1,
                    eos_token_id=128258,
                )

            code_list = parse_output(generated_ids)
            paragraph_audio = redistribute_codes(code_list, snac_model)

            # Cut the clip at the end index of the last detected silence.
            silences = detect_silence(paragraph_audio)
            if silences:
                paragraph_audio = paragraph_audio[:silences[-1][1]]

            audio_samples.append(paragraph_audio)

        final_audio = np.concatenate(audio_samples)
        peak = np.max(np.abs(final_audio))
        if peak > 0:
            # Skip normalisation for all-zero audio to avoid dividing by zero.
            final_audio = final_audio / peak
        final_audio = np.int16(final_audio * 32767)

        # Convert to base64 for audio playback
        # NOTE(review): these are raw int16 samples without a RIFF/WAV header
        # although the URI claims audio/wav; browsers may refuse to play it.
        # Wrapping with the `wave` module needs the sample rate - confirm.
        audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
        src = f"data:audio/wav;base64,{audio_base64}"

        return html.Audio(src=src, controls=True)
    except Exception as e:
        logger.error(f"Error generating speech: {str(e)}")
        return html.Div(f"Error generating audio: {str(e)}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
|
| 271 |
# Run the app
|
| 272 |
if __name__ == '__main__':
|