import json
import os
import re
import tempfile
import time
from io import BytesIO

import gradio as gr
from openai import OpenAI
from pydub import AudioSegment

def create_audio_for_segment(client, speaker, content):
    """Generate speech audio for a single transcript segment, in memory."""

    # Map each speaker role to an OpenAI TTS voice; unknown speakers fall
    # back to "nova" below.
    voice_mapping = {
        "Author": "echo",
        "Student": "alloy",
    }

    # Per-role delivery instructions for the TTS model.
    instructions_mapping = {
        "Author": "Speak as an expert researcher explaining technical concepts clearly and confidently.",
        "Student": "Speak with curiosity and enthusiasm, as someone eager to learn.",
    }

    voice = voice_mapping.get(speaker, "nova")
    instructions = instructions_mapping.get(speaker, "Speak naturally.")

    print(f"Generating audio for {speaker}...")

    # Stream the synthesized speech into an in-memory buffer (the speech
    # endpoint returns MP3 by default).
    with client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice=voice,
        input=content,
        instructions=instructions,
    ) as response:
        audio_data = BytesIO()
        for chunk in response.iter_bytes():
            audio_data.write(chunk)
        audio_data.seek(0)

    # Brief pause between requests to stay well under rate limits.
    time.sleep(0.5)

    return audio_data

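# A minimal, hypothetical standalone use of create_audio_for_segment (assumes
# OPENAI_API_KEY is set in the environment; "clip.mp3" is an arbitrary name):
#
#   client = OpenAI()
#   clip = create_audio_for_segment(client, "Author", "Welcome to the show.")
#   with open("clip.mp3", "wb") as f:
#       f.write(clip.getvalue())
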
def combine_audio_segments(audio_segments, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """Combine the per-segment clips into a single MP3, in memory."""
    combined = AudioSegment.empty()

    # Use the opening/closing jingles if they exist on disk; otherwise fall
    # back to one second of silence.
    if opening_sound_path and os.path.exists(opening_sound_path):
        opening_sound = AudioSegment.from_file(opening_sound_path)
    else:
        opening_sound = AudioSegment.silent(duration=1000)

    if closing_sound_path and os.path.exists(closing_sound_path):
        closing_sound = AudioSegment.from_file(closing_sound_path)
    else:
        closing_sound = AudioSegment.silent(duration=1000)

    # Half a second of silence between consecutive speakers.
    pause = AudioSegment.silent(duration=500)

    combined += opening_sound + pause

    for audio_data in audio_segments:
        audio_data.seek(0)
        segment = AudioSegment.from_file(audio_data, format="mp3")
        combined += segment + pause

    combined += closing_sound

    # Export the assembled episode to an in-memory MP3 buffer.
    output_buffer = BytesIO()
    combined.export(output_buffer, format="mp3")
    output_buffer.seek(0)

    print("Combined audio with opening/closing created in memory")
    return output_buffer.getvalue()

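# The transcript_data consumed by the function below matches the JSON schema
# requested from the model in generate_podcast, i.e. roughly:
#
#   {"segments": [{"speaker": "Author", "content": "..."},
#                 {"speaker": "Student", "content": "..."}]}
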
def generate_podcast_from_transcript(client, transcript_data, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """Generate a podcast from transcript data, entirely in memory."""
    segments = transcript_data['segments']
    audio_segments = []

    for segment in segments:
        speaker = segment['speaker']
        content = segment['content']

        # Skip segments too short to be worth synthesizing.
        if len(content.strip()) < 5:
            continue

        # Strip a leading "Author:" / "Student:" label if the model echoed it.
        content = re.sub(r'^\s*(Author|Student)\s*:\s*', '', content, flags=re.IGNORECASE)

        # Drop parenthetical stage directions, e.g. "(laughs)".
        content = re.sub(r'\([^()]*\)', '', content)

        # Drop bracketed annotations, e.g. "[crosstalk]".
        content = re.sub(r'\[[^\[\]]*\]', '', content)

        # Collapse whitespace left behind by the removals.
        content = re.sub(r'\s+', ' ', content).strip()

        audio_data = create_audio_for_segment(client, speaker, content)
        audio_segments.append(audio_data)

    audio_bytes = combine_audio_segments(audio_segments, opening_sound_path, closing_sound_path)
    return audio_bytes

def generate_podcast(file, client, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """
    Generate a podcast from an uploaded PDF, in memory.

    Args:
        file: Gradio file object whose .name attribute points to the file path
        client: OpenAI client instance
        opening_sound_path: Optional path to an opening sound file
        closing_sound_path: Optional path to a closing sound file

    Returns:
        tuple: (transcript, audio_bytes)
            - transcript: JSON string of the conversation transcript
            - audio_bytes: MP3 audio data as bytes
    """

    with open(file.name, "rb") as f:
        file_content = f.read()

    # Copy the upload into a temporary .pdf so it can be handed to the
    # OpenAI Files API; the temp file is removed in the finally block.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(file_content)
        temp_file_path = temp_file.name

    try:
        # Upload the paper so the model can read it.
        with open(temp_file_path, "rb") as f:
            file_obj = client.files.create(file=f, purpose="user_data")

        print("Generating conversation transcript...")

        # Ask the model for the dialogue as structured JSON (schema below),
        # with web search enabled for additional context.
        response = client.responses.create(
            model="gpt-4o",
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_file",
                            "file_id": file_obj.id,
                        },
                        {
                            "type": "input_text",
                            "text": "You are a podcast curator creating podcasts for scholars by generating scripts between a paper author and a hypothetical student. You will simulate the entire discussion.\n\nGiven a resource for a paper, generate a discussion between the paper's author, Bob, and a student, Alice, who wants to understand it. The discussion flows naturally and should be almost informal, with the author providing intuitive explanations, analogies, and simple takeaways. During the discussion segments, the student should reason with the author, creating \"aha!\" moments instead of just a Q&A session.\n\nThe roles should be clearly indicated in the script to facilitate parsing of the output. At the end, the student summarizes the entire paper, including its pros and cons.\n\n# Roles\n\n- **Author**: Provides explanations, analogies, and simple takeaways.\n- **Student**: Asks questions, reflects, and provides a summary of the paper.\n\n# Output Format\n\nThe output should clearly delineate each segment of the conversation by marking who is speaking.\n\nExample segment:\n- Author: [Author's explanation or dialogue]\n- Student: [Student's question, reasoning, or concluding summary]\n\n# Notes\n\n- Ensure the interaction is dynamic, with contributions from both the author and the student.\n- Focus on creating an educational yet engaging dialogue.\n- End with a clear, concise summary by the student, highlighting the paper's main points, pros, and cons"
                        }
                    ]
                }
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "conversation_schema",
                    "schema": {
                        "type": "object",
                        "required": ["segments"],
                        "properties": {
                            "segments": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "required": ["speaker", "content"],
                                    "properties": {
                                        "content": {
                                            "type": "string",
                                            "description": "The dialogue or content spoken by the speaker."
                                        },
                                        "speaker": {
                                            "type": "string",
                                            "description": "The name of the speaker in the segment."
                                        }
                                    },
                                    "additionalProperties": False
                                },
                                "description": "A collection of dialogue segments in the conversation."
                            }
                        },
                        "additionalProperties": False
                    },
                    "strict": True
                }
            },
            reasoning={},
            tools=[
                {
                    "type": "web_search_preview",
                    "user_location": {"type": "approximate"},
                    "search_context_size": "medium"
                }
            ],
            tool_choice={"type": "web_search_preview"},
            temperature=1.05,
            max_output_tokens=4096,
            top_p=1,
            store=False
        )

        # The SDK's output_text accessor concatenates the model's text output,
        # which is more robust than indexing into response.output directly
        # (the forced web search call also appears as an output item).
        transcript_json = response.output_text
        transcript_data = json.loads(transcript_json)

        print("Generating audio...")

        audio_bytes = generate_podcast_from_transcript(
            client,
            transcript_data,
            opening_sound_path,
            closing_sound_path
        )

        print("Podcast generation completed successfully!")
        return transcript_json, audio_bytes

    finally:
        # Always remove the temporary PDF.
        os.unlink(temp_file_path)

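# Hypothetical direct invocation outside Gradio: any object with a .name
# attribute pointing at a PDF works in place of the Gradio file object.
#
#   from types import SimpleNamespace
#   client = OpenAI()
#   transcript, audio = generate_podcast(SimpleNamespace(name="paper.pdf"), client)
#   with open("podcast.mp3", "wb") as f:
#       f.write(audio)
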
def gradio_interface(api_key, file):
    """Gradio interface function with proper error handling."""

    if not api_key or not api_key.strip():
        gr.Warning("⚠️ OpenAI API Key is required!")
        return "", None

    if not file:
        gr.Warning("⚠️ Please upload a PDF file!")
        return "", None

    try:
        client = OpenAI(api_key=api_key.strip())

        # Cheap call to validate the key before doing any real work.
        # Note: gr.Error is an exception and must be raised to be shown;
        # gr.Warning/gr.Info are plain function calls.
        try:
            client.models.list()
        except Exception as auth_error:
            if "authentication" in str(auth_error).lower() or "api key" in str(auth_error).lower():
                raise gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
            raise gr.Error(f"❌ OpenAI API Error: {str(auth_error)}")

        transcript, audio_bytes = generate_podcast(file, client)

        if audio_bytes:
            # Write the MP3 to a temp file so Gradio can serve it for download.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
                temp_audio.write(audio_bytes)
                temp_audio_path = temp_audio.name

            gr.Info("✅ Podcast generated successfully!")
            return transcript, temp_audio_path

        # Audio failed but a transcript exists: warn and return what we have.
        gr.Warning("❌ Failed to generate audio. Please try again.")
        return transcript, None

    except gr.Error:
        # Re-raise Gradio errors unchanged so they reach the UI.
        raise
    except Exception as e:
        error_msg = str(e)
        if "rate limit" in error_msg.lower():
            raise gr.Error("❌ OpenAI API rate limit exceeded. Please wait a moment and try again.")
        if "quota" in error_msg.lower():
            raise gr.Error("❌ OpenAI API quota exceeded. Please check your account billing.")
        if "authentication" in error_msg.lower() or "api key" in error_msg.lower():
            raise gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
        raise gr.Error(f"❌ An error occurred: {error_msg}")

with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🎙️ podXiv")
            gr.Markdown(
                """
                *⚠️ We need your OpenAI API key!*

                Welcome to **podXiv**. We convert a PDF paper into an audible podcast!

                ---

                **1. Upload & Analyze**
                - Upload your academic paper (any PDF works)
                - AI analyzes the content and structure

                **2. Generate Dialogue**
                - Creates a natural dialogue between author and student
                - Focuses on key insights and explanations

                **3. Create Audio**
                - Combines the dialogue into a final podcast

                **Note:** This process may take a few minutes (plus or minus about 80 seconds), as we generate high-quality audio for each segment.
                The final podcast is typically between 2:30 and 5:00 minutes long.
                Also, the web app might not work on mobile.
                """
            )
        with gr.Column(scale=2):
            with gr.Group():
                api_key_input = gr.Textbox(
                    label="🔑 Your OpenAI API Key",
                    type="password",
                    placeholder="sk-...",
                    info="Your API key is only used for this session and is not stored."
                )
                file_input = gr.File(
                    label="📄 Upload a paper",
                    file_types=[".pdf"]
                )
                submit_btn = gr.Button("🎬 Generate Podcast", variant="primary", size="lg")

            with gr.Accordion("📝 View Transcript", open=False):
                transcript_output = gr.Textbox(
                    label="Transcript JSON",
                    lines=10,
                    interactive=False,
                    info="Raw JSON transcript of the generated conversation"
                )

            audio_download = gr.File(
                label="🎵 Download Podcast Audio"
            )

    submit_btn.click(
        fn=gradio_interface,
        inputs=[api_key_input, file_input],
        outputs=[transcript_output, audio_download],
        show_progress=True
    )


if __name__ == "__main__":
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860
    )