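"""podXiv: turn an academic PDF into a short, two-voice podcast.

Pipeline: upload the PDF to OpenAI, generate an Author/Student dialogue as
structured JSON via the Responses API, synthesize each segment with the
gpt-4o-mini-tts model, and stitch the clips together with pydub (which needs
ffmpeg available on the PATH for MP3 encoding and decoding).
"""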
import gradio as gr
from openai import OpenAI
import re
import json
import time
from pydub import AudioSegment
from io import BytesIO
import tempfile
import os

def create_audio_for_segment(client, speaker, content):
    """Generate speech audio for a specific segment in memory"""
    # Map speakers to different voices
    voice_mapping = {
        "Author": "echo",    # Professional male voice for the author
        "Student": "alloy",  # Younger voice for the student
    }
    # Set speaking style based on speaker role
    instructions_mapping = {
        "Author": "Speak as an expert researcher explaining technical concepts clearly and confidently.",
        "Student": "Speak with curiosity and enthusiasm, as someone eager to learn.",
    }
    # Get the appropriate voice and instructions (unknown speakers fall back to a neutral voice)
    voice = voice_mapping.get(speaker, "nova")
    instructions = instructions_mapping.get(speaker, "Speak naturally.")
    print(f"Generating audio for {speaker}...")
    # Create audio in memory
    with client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice=voice,
        input=content,
        instructions=instructions,
    ) as response:
        # Read the audio data into memory
        audio_data = BytesIO()
        for chunk in response.iter_bytes():
            audio_data.write(chunk)
    audio_data.seek(0)
    # Small delay to avoid rate limiting
    time.sleep(0.5)
    return audio_data

def combine_audio_segments(audio_segments, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """Combine multiple audio segments into a single file in memory."""
    combined = AudioSegment.empty()
    # Load opening and closing sounds if provided, falling back to 1 s of silence
    if opening_sound_path and os.path.exists(opening_sound_path):
        opening_sound = AudioSegment.from_file(opening_sound_path)
    else:
        opening_sound = AudioSegment.silent(duration=1000)
    if closing_sound_path and os.path.exists(closing_sound_path):
        closing_sound = AudioSegment.from_file(closing_sound_path)
    else:
        closing_sound = AudioSegment.silent(duration=1000)
    # Add a short pause between segments
    pause = AudioSegment.silent(duration=500)  # 500 ms pause
    # Start with the opening sound
    combined += opening_sound + pause
    # Add each segment followed by a pause
    for audio_data in audio_segments:
        audio_data.seek(0)  # Reset read position
        segment = AudioSegment.from_file(audio_data, format="mp3")
        combined += segment + pause
    # End with the closing sound
    combined += closing_sound
    # Export to bytes
    output_buffer = BytesIO()
    combined.export(output_buffer, format="mp3")
    output_buffer.seek(0)
    print("Combined audio with opening/closing created in memory")
    return output_buffer.getvalue()
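
# The transcript data consumed below follows the JSON schema requested from the
# model in generate_podcast, e.g.:
#   {"segments": [{"speaker": "Author", "content": "..."},
#                 {"speaker": "Student", "content": "..."}]}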
def generate_podcast_from_transcript(client, transcript_data, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """Generate a podcast from transcript data in memory"""
    segments = transcript_data['segments']
    audio_segments = []
    for segment in segments:
        speaker = segment['speaker']
        content = segment['content']
        # Skip very short or empty segments
        if len(content.strip()) < 5:
            continue
        # Remove any leading "Speaker:" prefix, in case it leaked into the content
        content = re.sub(r'^\s*(Author|Student)\s*:\s*', '', content, flags=re.IGNORECASE)
        # Remove parenthesized text (innermost, non-nested parentheses only)
        content = re.sub(r'\([^()]*\)', '', content)
        # Remove any text in square brackets
        content = re.sub(r'\[[^\[\]]*\]', '', content)
        # Collapse extra whitespace left behind by the removals
        content = re.sub(r'\s+', ' ', content).strip()
        audio_data = create_audio_for_segment(client, speaker, content)
        audio_segments.append(audio_data)
    # Combine all audio segments
    audio_bytes = combine_audio_segments(audio_segments, opening_sound_path, closing_sound_path)
    return audio_bytes

def generate_podcast(file, client, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """
    Generate a podcast from a file in memory.

    Args:
        file: Gradio file object with a .name attribute pointing to the file path
        client: OpenAI client instance
        opening_sound_path: Optional path to an opening sound file
        closing_sound_path: Optional path to a closing sound file

    Returns:
        tuple: (transcript, audio_bytes)
            - transcript: JSON string of the conversation transcript
            - audio_bytes: MP3 audio data as bytes
    """
    # Read the file content from the Gradio file object
    with open(file.name, "rb") as f:
        file_content = f.read()
    # Write a temporary copy for the OpenAI upload
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(file_content)
        temp_file_path = temp_file.name
    try:
        # Upload the file to OpenAI so it can be referenced by file_id
        with open(temp_file_path, "rb") as f:
            file_obj = client.files.create(file=f, purpose="user_data")
        print("Generating conversation transcript...")
        # Generate the conversation
        response = client.responses.create(
            model="gpt-4o",
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_file",
                            "file_id": file_obj.id,
                        },
                        {
                            "type": "input_text",
                            "text": "You are a podcast curator creating podcasts for scholars by generating scripts between a paper author and a hypothetical student. You will simulate the entire discussion.\n\nGiven a resource for a paper, generate a discussion between the author of the paper, Bob, and a student who wants to understand it, Alice. The discussion flows naturally and should be almost informal, with the author providing intuitive explanations, analogies, and simple takeaways. During the discussion segments, the student should reason with the author, creating \"aha!\" moments instead of just a Q&A session.\n\nThe roles should be clearly indicated in the script to facilitate parsing of the output. At the end, the student summarizes the entire paper, including its pros and cons.\n\n# Roles\n\n- **Author**: Provides explanations, analogies, and simple takeaways.\n- **Student**: Asks questions, reflects, and provides a summary of the paper.\n\n# Output Format\n\nThe output should clearly delineate each segment of the conversation by marking who is speaking.\n\nExample segment:\n- Author: [Author's explanation or dialogue]\n- Student: [Student's question, reasoning, or concluding summary]\n\n# Notes\n\n- Ensure the interaction is dynamic, with contributions from both the author and the student.\n- Focus on creating an educational yet engaging dialogue.\n- End with a clear, concise summary by the student, highlighting the paper's main points, pros, and cons"
                        }
                    ]
                }
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "conversation_schema",
                    "schema": {
                        "type": "object",
                        "required": ["segments"],
                        "properties": {
                            "segments": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "required": ["speaker", "content"],
                                    "properties": {
                                        "content": {
                                            "type": "string",
                                            "description": "The dialogue or content spoken by the speaker."
                                        },
                                        "speaker": {
                                            "type": "string",
                                            "description": "The name of the speaker in the segment."
                                        }
                                    },
                                    "additionalProperties": False
                                },
                                "description": "A collection of dialogue segments in the conversation."
                            }
                        },
                        "additionalProperties": False
                    },
                    "strict": True
                }
            },
            reasoning={},
            tools=[
                {
                    "type": "web_search_preview",
                    "user_location": {"type": "approximate"},
                    "search_context_size": "medium"
                }
            ],
            tool_choice={"type": "web_search_preview"},
            temperature=1.05,
            max_output_tokens=4096,
            top_p=1,
            store=False
        )
        # Extract the transcript
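        # NOTE: tool_choice forces a web_search_preview call, so output[0] is the
        # tool call and output[1] is the assistant message; the JSON text therefore
        # lives at output[1].content[0].text. This indexing breaks if the response
        # layout ever changes.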
        transcript_json = response.model_dump()['output'][1]['content'][0]['text']
        transcript_data = json.loads(transcript_json)
        print("Generating audio...")
        # Generate the podcast audio
        audio_bytes = generate_podcast_from_transcript(
            client,
            transcript_data,
            opening_sound_path,
            closing_sound_path
        )
        print("Podcast generation completed successfully!")
        return transcript_json, audio_bytes
    finally:
        # Clean up the temporary file
        os.unlink(temp_file_path)
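
# Example of programmatic (non-Gradio) use -- a sketch, assuming a local
# "paper.pdf"; SimpleNamespace stands in for Gradio's file object:
#   from types import SimpleNamespace
#   client = OpenAI(api_key="sk-...")
#   transcript, audio = generate_podcast(SimpleNamespace(name="paper.pdf"), client)
#   with open("podcast.mp3", "wb") as out:
#       out.write(audio)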

def gradio_interface(api_key, file):
    """Gradio interface function with proper error handling"""
    # Check that an API key is provided
    if not api_key or not api_key.strip():
        gr.Warning("⚠️ OpenAI API Key is required!")
        return "", None
    # Check that a file is uploaded
    if not file:
        gr.Warning("⚠️ Please upload a PDF file!")
        return "", None
    try:
        # Initialize the OpenAI client
        client = OpenAI(api_key=api_key.strip())
        # Test API key validity with a simple request
        # (gr.Error must be raised, not just instantiated, to be displayed)
        try:
            client.models.list()
        except Exception as auth_error:
            if "authentication" in str(auth_error).lower() or "api key" in str(auth_error).lower():
                raise gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
            raise gr.Error(f"❌ OpenAI API Error: {auth_error}")
        # Generate the podcast
        transcript, audio_bytes = generate_podcast(file, client)
        if audio_bytes:
            # Write the audio to a temporary file so Gradio can serve it
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
                temp_audio.write(audio_bytes)
                temp_audio_path = temp_audio.name
            gr.Info("✅ Podcast generated successfully!")
            return transcript, temp_audio_path
        # Warn (rather than raise) so the transcript is still returned
        gr.Warning("❌ Failed to generate audio. Please try again.")
        return transcript, None
    except gr.Error:
        # Already formatted for display; let Gradio show it as-is
        raise
    except Exception as e:
        error_msg = str(e)
        if "rate limit" in error_msg.lower():
            raise gr.Error("❌ OpenAI API rate limit exceeded. Please wait a moment and try again.")
        if "quota" in error_msg.lower():
            raise gr.Error("❌ OpenAI API quota exceeded. Please check your account billing.")
        if "authentication" in error_msg.lower() or "api key" in error_msg.lower():
            raise gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
        raise gr.Error(f"❌ An error occurred: {error_msg}")

# Gradio Interface
with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# 🎙️ podXiv")
            gr.Markdown(
                """
                *⚠️ We need your OpenAI API Key!*

                Welcome to **podXiv**. We convert a PDF paper into an audible podcast!

                ---

                **1. Upload & Analyze**
                - Upload your academic paper (any PDF works)
                - The AI analyzes its content and structure

                **2. Generate Dialogue**
                - Creates a natural dialogue between the author and a student
                - Focuses on key insights and explanations

                **3. Create Audio**
                - Combines the dialogue segments into a final podcast

                **Note:** This process may take a few minutes (give or take ~80 seconds), as we generate high-quality audio for each segment.
                The finished podcast runs between 2:30 and 5:00 minutes.
                Also, the web app might not work on mobile.
                """
            )
        with gr.Column(scale=2):
            with gr.Group():
                api_key_input = gr.Textbox(
                    label="🤖 Your OpenAI API Key",
                    type="password",
                    placeholder="sk-...",
                    info="Your API key is only used for this session and is not stored."
                )
                file_input = gr.File(
                    label="📄 Upload a paper",
                    file_types=[".pdf"]
                )
                submit_btn = gr.Button("🎬 Generate Podcast", variant="primary", size="lg")
            # Output components
            with gr.Accordion("📝 View Transcript", open=False):
                transcript_output = gr.Textbox(
                    label="Transcript JSON",
                    lines=10,
                    interactive=False,
                    info="Raw JSON transcript of the generated conversation"
                )
            audio_download = gr.File(
                label="🎵 Download Podcast Audio"
            )
    # Connect the button to the function
    submit_btn.click(
        fn=gradio_interface,
        inputs=[api_key_input, file_input],
        outputs=[transcript_output, audio_download],
        show_progress=True
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,             # Creates a public link
        server_name="0.0.0.0",  # Allows external connections
        server_port=7860        # Default Gradio port
    )