podcast-generator

Sleeping

App Files Files Community

bluenevus committed on Apr 25

Commit

ed4babd

verified ·

1 Parent(s): 873c9f4

Update app.py

Browse files

Files changed (1) hide show

app.py +143 -150

app.py CHANGED Viewed

@@ -95,10 +95,16 @@ app.layout = dbc.Container([
     dcc.Store(id='generated-audio'),
 ])
-# Callbacks
 @callback(
     Output("script-output", "value"),
     Input("generate-script-btn", "n_clicks"),
     State("host1-name", "value"),
     State("host2-name", "value"),
     State("podcast-name", "value"),
@@ -107,92 +113,6 @@ app.layout = dbc.Container([
     State("upload-file", "contents"),
     State("duration", "value"),
     State("num-hosts", "value"),
-    prevent_initial_call=True
-)
-def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts):
-    if n_clicks is None:
-        return ""
-    try:
-        # Get the Gemini API key from Hugging Face secrets
-        api_key = os.environ.get("GEMINI_API_KEY")
-        if not api_key:
-            raise ValueError("Gemini API key not found in environment variables")
-        genai.configure(api_key=api_key)
-        model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
-        combined_content = prompt or ""
-        if uploaded_file:
-            content_type, content_string = uploaded_file.split(',')
-            decoded = base64.b64decode(content_string)
-            file_bytes = io.BytesIO(decoded)
-            # Try to detect the file type based on content
-            file_bytes.seek(0)
-            if file_bytes.read(4) == b'%PDF':
-                # It's a PDF file
-                file_bytes.seek(0)
-                pdf_reader = PyPDF2.PdfReader(file_bytes)
-                file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
-            else:
-                # Try as text file first
-                file_bytes.seek(0)
-                try:
-                    file_content = file_bytes.read().decode('utf-8')
-                except UnicodeDecodeError:
-                    # If it's not a text file, try as a docx
-                    file_bytes.seek(0)
-                    try:
-                        doc = Document(file_bytes)
-                        file_content = "\n".join([para.text for para in doc.paragraphs])
-                    except:
-                        raise ValueError("Unsupported file type or corrupted file")
-            combined_content += "\n" + file_content if combined_content else file_content
-        num_hosts = int(num_hosts) if num_hosts else 1
-        prompt_template = f"""
-        Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
-        {combined_content}
-        Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
-        Use speech fillers like um, ah. Vary emotional tone.
-        Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
-        Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
-        If the number of {num_hosts } is 1 then each paragraph will be no more than 3 sentences each
-        Only provide the dialog for text to speech.
-        Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
-        -Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
-        Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
-        Do not include speaker labels like "jane:" or "john:" before dialogue.
-        The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
-        The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
-        Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...".  Its just dialog.
-        Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
-        Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
-        Maintain natural conversation flow and speech patterns within each monologue.
-        Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
-        Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
-        Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
-        Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
-        {'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
-        Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
-        Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
-        """
-        response = model.generate_content(prompt_template)
-        return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text)
-    except Exception as e:
-        logger.error(f"Error generating podcast script: {str(e)}")
-        return f"Error: {str(e)}"
-@callback(
-    Output("audio-output", "children"),
-    Input("generate-audio-btn", "n_clicks"),
     State("script-output", "value"),
     State("voice1", "value"),
     State("voice2", "value"),
@@ -200,80 +120,153 @@ def generate_podcast_script(n_clicks, host1_name, host2_name, podcast_name, podc
     State("top-p", "value"),
     State("repetition-penalty", "value"),
     State("max-new-tokens", "value"),
-    State("num-hosts", "value"),
     prevent_initial_call=True
 )
-@spaces.GPU()
-def generate_speech(n_clicks, text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts):
-    if n_clicks is None or not text.strip():
-        return html.Div("No audio generated yet.")
-    try:
-        paragraphs = text.split('\n\n')  # Split by double newline
-        audio_samples = []
-        for i, paragraph in enumerate(paragraphs):
-            if not paragraph.strip():
-                continue
-            voice = voice1 if num_hosts == "1" or i % 2 == 0 else voice2
-            input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
-            with torch.no_grad():
-                generated_ids = model.generate(
-                    input_ids,
-                    attention_mask=attention_mask,
-                    do_sample=True,
-                    temperature=temperature,
-                    top_p=top_p,
-                    repetition_penalty=repetition_penalty,
-                    max_new_tokens=max_new_tokens,
-                    num_return_sequences=1,
-                    eos_token_id=128258,
-                )
-            code_list = parse_output(generated_ids)
-            paragraph_audio = redistribute_codes(code_list, snac_model)
-            silences = detect_silence(paragraph_audio)
-            if silences:
-                paragraph_audio = paragraph_audio[:silences[-1][1]]
-            audio_samples.append(paragraph_audio)
-        final_audio = np.concatenate(audio_samples)
-        final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
-        # Convert to base64 for audio playback
-        audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
-        src = f"data:audio/wav;base64,{audio_base64}"
-        return html.Audio(src=src, controls=True)
-    except Exception as e:
-        logger.error(f"Error generating speech: {str(e)}")
-        return html.Div(f"Error generating audio: {str(e)}")
-@callback(
-    Output("advanced-settings", "is_open"),
-    Input("advanced-settings-toggle", "n_clicks"),
-    State("advanced-settings", "is_open"),
-)
-def toggle_advanced_settings(n_clicks, is_open):
-    if n_clicks:
-        return not is_open
-    return is_open
-@callback(
-    Output("prompt", "value"),
-    Output("script-output", "value"),
-    Output("audio-output", "children"),
-    Input("clear-btn", "n_clicks"),
-)
-def clear_outputs(n_clicks):
-    if n_clicks:
-        return "", "", html.Div("No audio generated yet.")
-    return dash.no_update, dash.no_update, dash.no_update
 # Run the app
 if __name__ == '__main__':

     dcc.Store(id='generated-audio'),
 ])
+# Combined callback
 @callback(
     Output("script-output", "value"),
+    Output("audio-output", "children"),
+    Output("advanced-settings", "is_open"),
+    Output("prompt", "value"),
     Input("generate-script-btn", "n_clicks"),
+    Input("generate-audio-btn", "n_clicks"),
+    Input("advanced-settings-toggle", "n_clicks"),
+    Input("clear-btn", "n_clicks"),
     State("host1-name", "value"),
     State("host2-name", "value"),
     State("podcast-name", "value"),
     State("upload-file", "contents"),
     State("duration", "value"),
     State("num-hosts", "value"),
     State("script-output", "value"),
     State("voice1", "value"),
     State("voice2", "value"),
     State("top-p", "value"),
     State("repetition-penalty", "value"),
     State("max-new-tokens", "value"),
+    State("advanced-settings", "is_open"),
     prevent_initial_call=True
 )
+def combined_callback(generate_script_clicks, generate_audio_clicks, advanced_settings_clicks, clear_clicks,
+                      host1_name, host2_name, podcast_name, podcast_topic, prompt, uploaded_file, duration, num_hosts,
+                      script_output, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, is_advanced_open):
+    ctx = dash.callback_context
+    if not ctx.triggered:
+        return dash.no_update, dash.no_update, dash.no_update, dash.no_update
+    trigger_id = ctx.triggered[0]['prop_id'].split('.')[0]
+    if trigger_id == "generate-script-btn":
+        try:
+            # Get the Gemini API key from Hugging Face secrets
+            api_key = os.environ.get("GEMINI_API_KEY")
+            if not api_key:
+                raise ValueError("Gemini API key not found in environment variables")
+            genai.configure(api_key=api_key)
+            model = genai.GenerativeModel('gemini-2.5-pro-preview-03-25')
+            combined_content = prompt or ""
+            if uploaded_file:
+                content_type, content_string = uploaded_file.split(',')
+                decoded = base64.b64decode(content_string)
+                file_bytes = io.BytesIO(decoded)
+                # Try to detect the file type based on content
+                file_bytes.seek(0)
+                if file_bytes.read(4) == b'%PDF':
+                    # It's a PDF file
+                    file_bytes.seek(0)
+                    pdf_reader = PyPDF2.PdfReader(file_bytes)
+                    file_content = "\n".join([page.extract_text() for page in pdf_reader.pages])
+                else:
+                    # Try as text file first
+                    file_bytes.seek(0)
+                    try:
+                        file_content = file_bytes.read().decode('utf-8')
+                    except UnicodeDecodeError:
+                        # If it's not a text file, try as a docx
+                        file_bytes.seek(0)
+                        try:
+                            doc = Document(file_bytes)
+                            file_content = "\n".join([para.text for para in doc.paragraphs])
+                        except:
+                            raise ValueError("Unsupported file type or corrupted file")
+                combined_content += "\n" + file_content if combined_content else file_content
+            num_hosts = int(num_hosts) if num_hosts else 1
+            prompt_template = f"""
+            Create a podcast script for {num_hosts} {'person' if num_hosts == 1 else 'people'} discussing:
+            {combined_content}
+            Duration: {duration} minutes. Include natural speech, humor, and occasional off-topic thoughts.
+            Use speech fillers like um, ah. Vary emotional tone.
+            Format: {'Monologue' if num_hosts == 1 else 'Alternating dialogue'} without speaker labels.
+            Separate {'paragraphs' if num_hosts == 1 else 'lines'} with blank lines.
+            If the number of {num_hosts } is 1 then each paragraph will be no more than 3 sentences each
+            Only provide the dialog for text to speech.
+            Only use these emotion tags in angle brackets: <laugh>, <sigh>, <chuckle>, <cough>, <sniffle>, <groan>, <yawn>, <gasp>.
+            -Example: "I can't believe I stayed up all night <yawn> only to find out the meeting was canceled <groan>."
+            Ensure content flows naturally and stays on topic. Match the script length to {duration} minutes.
+            Do not include speaker labels like "jane:" or "john:" before dialogue.
+            The intro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph.
+            The outro always includes the ({host1_name} and/or {host2_name}) if it exists and should be in the same paragraph
+            Do not include these types of transitions in the intro, outro or between paragraphs for example: "Intro Music fades in...".  Its just dialog.
+            Keep each speaker's entire monologue in a single paragraph, regardless of length if the number of hosts is not 1.
+            Start a new paragraph only when switching to a different speaker if the number of hosts is not 1.
+            Maintain natural conversation flow and speech patterns within each monologue.
+            Use context clues or subtle references to indicate who is speaking without explicit labels if the number of hosts is not 1.
+            Use speaker names ({host1_name} and/or {host2_name}) sparingly, only when necessary for clarity or emphasis. Avoid starting every line with the other person's name.
+            Rely more on context and speech patterns to indicate who is speaking, rather than always stating names.
+            Use names primarily for transitions sparingly, definitely with agreements, or to draw attention to a specific point, not as a constant form of address.
+            {'Make sure the script is a monologue for one person.' if num_hosts == 1 else f'Ensure the dialogue alternates between two distinct voices, with {host1_name} speaking on odd-numbered lines and {host2_name} on even-numbered lines.'}
+            Always include intro with the speaker name and its the podcast name "{podcast_name}" in intoduce the topic of the podcast with "{podcast_topic}".
+            Incorporate the podcast name and topic naturally into the intro and outro, and ensure the content stays relevant to the specified topic throughout the script.
+            """
+            response = model.generate_content(prompt_template)
+            return re.sub(r'[^a-zA-Z0-9\s.,?!<>]', '', response.text), dash.no_update, dash.no_update, dash.no_update
+        except Exception as e:
+            logger.error(f"Error generating podcast script: {str(e)}")
+            return f"Error: {str(e)}", dash.no_update, dash.no_update, dash.no_update
+    elif trigger_id == "generate-audio-btn":
+        if not script_output.strip():
+            return dash.no_update, html.Div("No audio generated yet."), dash.no_update, dash.no_update
+        try:
+            paragraphs = script_output.split('\n\n')  # Split by double newline
+            audio_samples = []
+            for i, paragraph in enumerate(paragraphs):
+                if not paragraph.strip():
+                    continue
+                voice = voice1 if num_hosts == "1" or i % 2 == 0 else voice2
+                input_ids, attention_mask = process_prompt(paragraph, voice, tokenizer, device)
+                with torch.no_grad():
+                    generated_ids = model.generate(
+                        input_ids,
+                        attention_mask=attention_mask,
+                        do_sample=True,
+                        temperature=temperature,
+                        top_p=top_p,
+                        repetition_penalty=repetition_penalty,
+                        max_new_tokens=max_new_tokens,
+                        num_return_sequences=1,
+                        eos_token_id=128258,
+                    )
+                code_list = parse_output(generated_ids)
+                paragraph_audio = redistribute_codes(code_list, snac_model)
+                silences = detect_silence(paragraph_audio)
+                if silences:
+                    paragraph_audio = paragraph_audio[:silences[-1][1]]
+                audio_samples.append(paragraph_audio)
+            final_audio = np.concatenate(audio_samples)
+            final_audio = np.int16(final_audio / np.max(np.abs(final_audio)) * 32767)
+            # Convert to base64 for audio playback
+            audio_base64 = base64.b64encode(final_audio.tobytes()).decode('utf-8')
+            src = f"data:audio/wav;base64,{audio_base64}"
+            return dash.no_update, html.Audio(src=src, controls=True), dash.no_update, dash.no_update
+        except Exception as e:
+            logger.error(f"Error generating speech: {str(e)}")
+            return dash.no_update, html.Div(f"Error generating audio: {str(e)}"), dash.no_update, dash.no_update
+    elif trigger_id == "advanced-settings-toggle":
+        return dash.no_update, dash.no_update, not is_advanced_open, dash.no_update
+    elif trigger_id == "clear-btn":
+        return "", html.Div("No audio generated yet."), dash.no_update, ""
+    return dash.no_update, dash.no_update, dash.no_update, dash.no_update
 # Run the app
 if __name__ == '__main__':