bluenevus committed on
Commit
ed4babd
·
verified ·
1 Parent(s): 873c9f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +143 -150
app.py CHANGED
@@ -95,10 +95,16 @@ app.layout = dbc.Container([
95
  dcc.Store(id='generated-audio'),
96
  ])
97
 
98
- # Callbacks
99
  @callback(
100
  Output("script-output", "value"),
 
 
 
101
  Input("generate-script-btn", "n_clicks"),
 
 
 
102
  State("host1-name", "value"),
103
  State("host2-name", "value"),
104
  State("podcast-name", "value"),
@@ -107,92 +113,6 @@ app.layout = dbc.Container([
107
  State("upload-file", "contents"),
108
  State("duration", "value"),
109
  State("num-hosts", "value"),
110
- prevent_initial_call=True
111
- )
112
- def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts):
113
- if n_clicks is None:
114
- return ""
115
-
116
- try:
117
- # Get the Gemini API key from Hugging Face secrets
118
- api_key = os.environ.get("GEMINI_API_KEY")
119
- if not api_key:
120
- raise ValueError("Gemini API key not found in environment variables")
121
-
122
- genai.configure(api_key=api_key)
123
- model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
124
-
125
- combined_content = prompt or ""
126
-
127
- if uploaded_file:
128
- content_type, content_string = uploaded_file.split(',')
129
- decoded = base64.b64decode(content_string)
130
- file_bytes = io.BytesIO(decoded)
131
-
132
- # Try to detect the file type based on content
133
- file_bytes.seek(0)
134
- if file_bytes.read(4) == b'%PDF':
135
- # It's a PDF file
136
- file_bytes.seek(0)
137
- pdf_reader = PyPDF2.PdfReader(file_bytes)
138
- file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
139
- else:
140
- # Try as text file first
141
- file_bytes.seek(0)
142
- try:
143
- file_content = file_bytes.read().decode('utf-8')
144
- except UnicodeDecodeError:
145
- # If it's not a text file, try as a docx
146
- file_bytes.seek(0)
147
- try:
148
- doc = Document(file_bytes)
149
- file_content = "\n".join([para.text for para in doc.paragraphs])
150
- except:
151
- raise ValueError("Unsupported file type or corrupted file")
152
-
153
- combined_content += "\n" + file_content if combined_content else file_content
154
-
155
- num_hosts = int(num_hosts) if num_hosts else 1
156
-
157
- prompt_template = f"""
158
- Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
159
- {combined_content}
160
-
161
- Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
162
- Use speech fillers like um, ah. Vary emotional tone.
163
-
164
- Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
165
- Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
166
- If the number of {num_hosts } is 1 then each paragraph will be no more than 3 sentences each
167
- Only provide the dialog for text to speech.
168
- Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
169
- -Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
170
- Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
171
- Do not include speaker labels like "jane:" or "john:" before dialogue.
172
- The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
173
- The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
174
- Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...". Its just dialog.
175
- Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
176
- Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
177
- Maintain natural conversation flow and speech patterns within each monologue.
178
- Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
179
- Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
180
- Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
181
- Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
182
- {'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
183
- Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
184
- Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
185
- """
186
-
187
- response = model.generate_content(prompt_template)
188
- return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
189
- except Exception as e:
190
- logger.error(f"Error generating podcast script: {str(e)}")
191
- return f"Error: {str(e)}"
192
-
193
- @callback(
194
- Output("audio-output", "children"),
195
- Input("generate-audio-btn", "n_clicks"),
196
  State("script-output", "value"),
197
  State("voice1", "value"),
198
  State("voice2", "value"),
@@ -200,80 +120,153 @@ def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podc
200
  State("top-p", "value"),
201
  State("repetition-penalty", "value"),
202
  State("max-new-tokens", "value"),
203
- State("num-hosts", "value"),
204
  prevent_initial_call=True
205
  )
206
- @spaces.GPU()
207
- def generate_speech(n_clicks, text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts):
208
- if n_clicks is None or not text.strip():
209
- return html.Div("No audio generated yet.")
210
-
211
- try:
212
- paragraphs = text.split('\n\n') # Split by double newline
213
- audio_samples = []
214
-
215
- for i, paragraph in enumerate(paragraphs):
216
- if not paragraph.strip():
217
- continue
 
 
 
 
 
 
 
 
218
 
219
- voice = voice1 if num_hosts == "1" or i % 2 == 0 else voice2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
- input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
222
 
223
- with torch.no_grad():
224
- generated_ids = model.generate(
225
- input_ids,
226
- attention_mask=attention_mask,
227
- do_sample=True,
228
- temperature=temperature,
229
- top_p=top_p,
230
- repetition_penalty=repetition_penalty,
231
- max_new_tokens=max_new_tokens,
232
- num_return_sequences=1,
233
- eos_token_id=128258,
234
- )
235
 
236
- code_list = parse_output(generated_ids)
237
- paragraph_audio = redistribute_codes(code_list, snac_model)
238
 
239
- silences = detect_silence(paragraph_audio)
240
- if silences:
241
- paragraph_audio = paragraph_audio[:silences[-1][1]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
 
243
- audio_samples.append(paragraph_audio)
244
-
245
- final_audio = np.concatenate(audio_samples)
246
- final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
247
-
248
- # Convert to base64 for audio playback
249
- audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
250
- src = f"data:audio/wav;base64,{audio_base64}"
 
251
 
252
- return html.Audio(src=src, controls=True)
253
- except Exception as e:
254
- logger.error(f"Error generating speech: {str(e)}")
255
- return html.Div(f"Error generating audio: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
 
257
- @callback(
258
- Output("advanced-settings", "is_open"),
259
- Input("advanced-settings-toggle", "n_clicks"),
260
- State("advanced-settings", "is_open"),
261
- )
262
- def toggle_advanced_settings(n_clicks, is_open):
263
- if n_clicks:
264
- return not is_open
265
- return is_open
266
 
267
- @callback(
268
- Output("prompt", "value"),
269
- Output("script-output", "value"),
270
- Output("audio-output", "children"),
271
- Input("clear-btn", "n_clicks"),
272
- )
273
- def clear_outputs(n_clicks):
274
- if n_clicks:
275
- return "", "", html.Div("No audio generated yet.")
276
- return dash.no_update, dash.no_update, dash.no_update
277
 
278
  # Run the app
279
  if __name__ == '__main__':
 
95
  dcc.Store(id='generated-audio'),
96
  ])
97
 
98
+ # Combined callback
99
  @callback(
100
  Output("script-output", "value"),
101
+ Output("audio-output", "children"),
102
+ Output("advanced-settings", "is_open"),
103
+ Output("prompt", "value"),
104
  Input("generate-script-btn", "n_clicks"),
105
+ Input("generate-audio-btn", "n_clicks"),
106
+ Input("advanced-settings-toggle", "n_clicks"),
107
+ Input("clear-btn", "n_clicks"),
108
  State("host1-name", "value"),
109
  State("host2-name", "value"),
110
  State("podcast-name", "value"),
 
113
  State("upload-file", "contents"),
114
  State("duration", "value"),
115
  State("num-hosts", "value"),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  State("script-output", "value"),
117
  State("voice1", "value"),
118
  State("voice2", "value"),
 
120
  State("top-p", "value"),
121
  State("repetition-penalty", "value"),
122
  State("max-new-tokens", "value"),
123
+ State("advanced-settings", "is_open"),
124
  prevent_initial_call=True
125
  )
126
def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
                      host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
                      script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
    """Dispatch a click on any of the four buttons to the matching action.

    The component that fired is read from ``dash.callback_context`` and
    exactly one branch runs:

    * ``generate-script-btn``      -> build a prompt and ask Gemini for a script
    * ``generate-audio-btn``       -> synthesize speech for the current script
    * ``advanced-settings-toggle`` -> flip the advanced-settings open state
    * ``clear-btn``                -> reset prompt, script and audio

    Returns:
        A 4-tuple in the order of the callback's outputs: script text,
        audio-output children, advanced-settings ``is_open``, prompt text.
        Slots the triggered action does not own are ``dash.no_update``.
    """
    ctx = dash.callback_context
    if not ctx.triggered:
        return dash.no_update, dash.no_update, dash.no_update, dash.no_update

    trigger_id = ctx.triggered[0]['prop_id'].split('.')[0]

    if trigger_id == "generate-script-btn":
        script = _generate_script(host1_name, host2_name, podcast_name, podcast_topic,
                                  prompt, uploaded_file, duration, num_hosts)
        return script, dash.no_update, dash.no_update, dash.no_update

    if trigger_id == "generate-audio-btn":
        # The script value can be None (nothing generated or typed yet), so
        # normalise before calling .strip().
        if not (script_output or "").strip():
            return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update
        audio = _synthesize_audio(script_output, voice1, voice2, temperature, top_p,
                                  repetition_penalty, max_new_tokens, num_hosts)
        return dash.no_update, audio, dash.no_update, dash.no_update

    if trigger_id == "advanced-settings-toggle":
        return dash.no_update, dash.no_update, not is_advanced_open, dash.no_update

    if trigger_id == "clear-btn":
        return "", html.Div("No audio generated yet."), dash.no_update, ""

    return dash.no_update, dash.no_update, dash.no_update, dash.no_update


def _extract_upload_text(uploaded_file):
    """Decode an uploaded ``"<header>,<base64 payload>"`` string into plain text.

    Tries, in order: PDF (detected by the ``%PDF`` magic bytes), UTF-8 text,
    then DOCX.

    Raises:
        ValueError: if the string has no comma separator or the payload
            cannot be read as any of the supported formats.
    """
    # Split on the first comma only; everything after it is payload.
    _, content_string = uploaded_file.split(',', 1)
    decoded = base64.b64decode(content_string)

    if decoded[:4] == b'%PDF':
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(decoded))
        # extract_text() may yield None for pages without a text layer.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)

    try:
        return decoded.decode('utf-8')
    except UnicodeDecodeError:
        pass  # Not plain text; fall through and try DOCX.

    try:
        doc = Document(io.BytesIO(decoded))
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception as exc:
        raise ValueError("Unsupported file type or corrupted file") from exc


def _build_script_prompt(combined_content, host1_name, host2_name, podcast_name, podcast_topic, duration, num_hosts):
    """Render the instruction prompt sent to the script-writing model.

    ``num_hosts`` must already be an ``int``; it selects the monologue or
    dialogue wording.
    """
    return f"""
    Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
    {combined_content}

    Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
    Use speech fillers like um, ah. Vary emotional tone.

    Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
    Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
    If the number of {num_hosts } is 1 then each paragraph will be no more than 3 sentences each
    Only provide the dialog for text to speech.
    Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
    -Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
    Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
    Do not include speaker labels like "jane:" or "john:" before dialogue.
    The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
    The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
    Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...". Its just dialog.
    Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
    Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
    Maintain natural conversation flow and speech patterns within each monologue.
    Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
    Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
    Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
    Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
    {'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
    Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
    Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
    """


def _generate_script(host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts):
    """Generate a podcast script with Gemini.

    Returns:
        The generated text with every character outside
        ``[a-zA-Z0-9\\s.,?!<>]`` removed, or ``"Error: <message>"`` if
        anything fails. Failures are logged and never raised, so the message
        ends up in the script box.
    """
    try:
        # Get the Gemini API key from Hugging Face secrets
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("Gemini API key not found in environment variables")

        genai.configure(api_key=api_key)
        # Deliberately not called `model`: that name is the module-level
        # speech model used by the audio path, and rebinding it inside the
        # callback made the audio branch fail with UnboundLocalError.
        gemini_model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')

        combined_content = prompt or ""
        if uploaded_file:
            file_content = _extract_upload_text(uploaded_file)
            combined_content = f"{combined_content}\n{file_content}" if combined_content else file_content

        host_count = int(num_hosts) if num_hosts else 1
        prompt_template = _build_script_prompt(combined_content, host1_name, host2_name,
                                               podcast_name, podcast_topic, duration, host_count)

        response = gemini_model.generate_content(prompt_template)
        return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
    except Exception as e:
        logger.error(f"Error generating podcast script: {str(e)}")
        return f"Error: {str(e)}"


def _synthesize_audio(script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts):
    """Turn the script into speech, one paragraph at a time.

    Paragraphs are separated by blank lines. With a single host every
    paragraph uses ``voice1``; otherwise voices alternate, starting with
    ``voice1``.

    Returns:
        An ``html.Audio`` element on success, or an ``html.Div`` carrying
        the error message on failure (errors are logged, not raised).
    """
    # NOTE(review): the standalone speech callback this replaced was wrapped
    # in @spaces.GPU(); confirm whether GPU allocation is still needed here.
    try:
        # The dropdown value may arrive as int or str; compare on a common form.
        single_host = str(num_hosts).strip() == "1"

        # Drop blank paragraphs *before* numbering so a stray extra blank
        # line cannot flip which host gets which voice.
        paragraphs = [p for p in script_output.split('\n\n') if p.strip()]
        audio_samples = []

        for i, paragraph in enumerate(paragraphs):
            voice = voice1 if single_host or i % 2 == 0 else voice2

            input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)

            with torch.no_grad():
                generated_ids = model.generate(
                    input_ids,
                    attention_mask=attention_mask,
                    do_sample=True,
                    temperature=temperature,
                    top_p=top_p,
                    repetition_penalty=repetition_penalty,
                    max_new_tokens=max_new_tokens,
                    num_return_sequences=1,
                    eos_token_id=128258,
                )

            code_list = parse_output(generated_ids)
            paragraph_audio = redistribute_codes(code_list, snac_model)

            # Cut the clip at the end index of the last detected silence.
            silences = detect_silence(paragraph_audio)
            if silences:
                paragraph_audio = paragraph_audio[:silences[-1][1]]

            audio_samples.append(paragraph_audio)

        final_audio = np.concatenate(audio_samples)
        if final_audio.size == 0:
            return html.Div("No audio generated yet.")

        # Peak-normalise to the int16 range; skip the division for an
        # all-zero signal to avoid NaNs.
        peak = np.max(np.abs(final_audio))
        if peak > 0:
            final_audio = final_audio / peak
        final_audio = np.int16(final_audio * 32767)

        # Convert to base64 for audio playback
        # NOTE(review): these are raw int16 samples with no RIFF/WAV header
        # although the URI says audio/wav; confirm that browsers can play it.
        audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
        src = f"data:audio/wav;base64,{audio_base64}"

        return html.Audio(src=src, controls=True)
    except Exception as e:
        logger.error(f"Error generating speech: {str(e)}")
        return html.Div(f"Error generating audio: {str(e)}")
 
 
 
 
 
 
270
 
271
  # Run the app
272
  if __name__ == '__main__':