Update app.py
app.py
CHANGED
@@ -46,7 +46,7 @@ def create_audio_for_segment(client, speaker, content):
 
     return audio_data
 
-def combine_audio_segments(audio_segments, opening_sound_path=
+def combine_audio_segments(audio_segments, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
     """Combine multiple audio segments into a single file in memory."""
     combined = AudioSegment.empty()
 
@@ -127,143 +127,165 @@ def generate_podcast(file, client, opening_sound_path=None, closing_sound_path=None):
         closing_sound_path: Optional path to closing sound file
 
     Returns:
-        tuple: (
-            - status: "success" or "error"
+        tuple: (transcript, audio_bytes)
             - transcript: JSON string of the conversation transcript
-            - audio_bytes: MP3 audio data as bytes
+            - audio_bytes: MP3 audio data as bytes
     """
-        file_content = f.read()
+    # Read file content from the Gradio file object
+    with open(file.name, "rb") as f:
+        file_content = f.read()
 
+    # Create temporary file for OpenAI API (it requires a file path)
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
+        temp_file.write(file_content)
+        temp_file_path = temp_file.name
 
+    try:
+        # Upload file to OpenAI
+        with open(temp_file_path, "rb") as f:
+            file_obj = client.files.create(file=f, purpose="user_data")
 
+        print("Generating conversation transcript...")
+
+        # Generate the conversation
+        response = client.responses.create(
+            model="gpt-4o",
+            input=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "input_file",
+                            "file_id": file_obj.id,
+                        },
+                        {
+                            "type": "input_text",
+                            "text": "You are a podcast curator creating podcasts for scholars by generating scripts between a paper author and a hypothetical student. You will simulate the entire discussion.\n\nGiven a resource for a paper, generate a discussion between the author of the paper Bob and a student who wants to understand it Alice. The discussion flows naturally and should be almost informal, with the author providing intuitive explanations, analogies, and simple takeaways. During the discussion segments, the student should reason with the author, creating \"aha!\" moments instead of just a Q&A session.\n\nThe roles should be clearly indicated in the script to facilitate parsing of the output. At the end, the student summarizes the entire paper, including its pros and cons.\n\n# Roles\n\n- **Author**: Provides explanations, analogies, and simple takeaways.\n- **Student**: Asks questions, reflects, and provides a summary of the paper.\n\n# Output Format\n\nThe output should clearly delineate each segment of the conversation by marking who is speaking. \n\nExample segment: \n- Author: [Author's explanation or dialogue]\n- Student: [Student's question, reasoning, or concluding summary]\n\n# Notes\n\n- Ensure the interaction is dynamic, with contributions from both the author and the student.\n- Focus on creating an educational yet engaging dialogue.\n- End with a clear, concise summary by the student, highlighting the paper's main points, pros, and cons"
+                        }
+                    ]
+                }
+            ],
+            text={
+                "format": {
+                    "type": "json_schema",
+                    "name": "conversation_schema",
+                    "schema": {
+                        "type": "object",
+                        "required": ["segments"],
+                        "properties": {
+                            "segments": {
+                                "type": "array",
+                                "items": {
+                                    "type": "object",
+                                    "required": ["speaker", "content"],
+                                    "properties": {
+                                        "content": {
+                                            "type": "string",
+                                            "description": "The dialogue or content spoken by the speaker."
-                },
-                "speaker": {
-                    "type": "string",
-                    "description": "The name of the speaker in the segment."
-                }
                                         },
+                                        "speaker": {
+                                            "type": "string",
+                                            "description": "The name of the speaker in the segment."
+                                        }
                                     },
+                                    "additionalProperties": False
+                                },
+                                "description": "A collection of dialogue segments in the conversation."
+                            }
                         },
+                        "additionalProperties": False
+                    },
+                    "strict": True
+                }
+            },
+            reasoning={},
+            tools=[
+                {
+                    "type": "web_search_preview",
+                    "user_location": {"type": "approximate"},
+                    "search_context_size": "medium"
+                }
+            ],
+            tool_choice={"type": "web_search_preview"},
+            temperature=1.05,
+            max_output_tokens=4096,
+            top_p=1,
+            store=False
+        )
 
+        # Extract transcript
+        transcript_json = response.model_dump()['output'][1]['content'][0]['text']
+        transcript_data = json.loads(transcript_json)
+
+        print("Generating audio...")
+
+        # Generate podcast audio
+        audio_bytes = generate_podcast_from_transcript(
+            client,
+            transcript_data,
+            opening_sound_path,
+            closing_sound_path
+        )
+
+        print("Podcast generation completed successfully!")
+        return transcript_json, audio_bytes
 
-    except Exception as e:
-        print(f"Error generating podcast: {str(e)}")
-        return "error", str(e), None
+    finally:
+        # Clean up temporary file
+        os.unlink(temp_file_path)
 
 def gradio_interface(api_key, file):
-    """Gradio interface function"""
-    if
+    """Gradio interface function with proper error handling"""
+    # Check if API key is provided
+    if not api_key or not api_key.strip():
+        gr.Warning("⚠️ OpenAI API Key is required!")
+        return "", None
 
+    # Check if file is uploaded
     if not file:
+        gr.Warning("⚠️ Please upload a PDF file!")
+        return "", None
 
     try:
+        # Initialize OpenAI client
+        client = OpenAI(api_key=api_key.strip())
+
+        # Test API key validity with a simple request
+        try:
+            client.models.list()
+        except Exception as auth_error:
+            if "authentication" in str(auth_error).lower() or "api key" in str(auth_error).lower():
+                gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
+            else:
+                gr.Error(f"❌ OpenAI API Error: {str(auth_error)}")
+            return "", None
 
+        # Generate podcast
+        transcript, audio_bytes = generate_podcast(file, client)
+
+        if audio_bytes:
             # Create a temporary file for Gradio to serve the audio
             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
                 temp_audio.write(audio_bytes)
                 temp_audio_path = temp_audio.name
 
+            gr.Info("✅ Podcast generated successfully!")
+            return transcript, temp_audio_path
         else:
+            gr.Error("❌ Failed to generate audio. Please try again.")
+            return transcript, None
 
     except Exception as e:
+        error_msg = str(e)
+        if "rate limit" in error_msg.lower():
+            gr.Error("❌ OpenAI API rate limit exceeded. Please wait a moment and try again.")
+        elif "quota" in error_msg.lower():
+            gr.Error("❌ OpenAI API quota exceeded. Please check your account billing.")
+        elif "authentication" in error_msg.lower() or "api key" in error_msg.lower():
+            gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
+        else:
+            gr.Error(f"❌ An error occurred: {error_msg}")
+        return "", None
 
 # Gradio Interface
 with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
@@ -304,8 +326,6 @@ with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
     submit_btn = gr.Button("🎬 Generate Podcast", variant="primary", size="lg")
 
     # Output components
-    status_output = gr.Textbox(label="📋 Status", interactive=False)
-
     with gr.Accordion("📄 View Transcript", open=False):
         transcript_output = gr.Textbox(
             label="Transcript JSON",
@@ -322,7 +342,7 @@ with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
     submit_btn.click(
         fn=gradio_interface,
         inputs=[api_key_input, file_input],
-        outputs=[
+        outputs=[transcript_output, audio_download],
         show_progress=True
     )
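A note on the transcript extraction: `response.model_dump()['output'][1]['content'][0]['text']` assumes the forced `web_search_preview` call always occupies `output[0]` and the model's message lands at `output[1]`. A less positional lookup is sketched below; it assumes the Responses API marks assistant messages with `type == "message"`, and the helper name `extract_transcript_json` is ours, not part of this commit.

def extract_transcript_json(response):
    """Return the first text payload found among message output items, else None."""
    for item in response.model_dump()["output"]:
        if item.get("type") == "message":
            for part in item.get("content", []):
                if "text" in part:
                    return part["text"]
    return None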
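`generate_podcast_from_transcript` itself is unchanged by this commit, so only its call site appears above. Given `conversation_schema`, the transcript it receives is a dict of the form `{"segments": [{"speaker": ..., "content": ...}]}`. A hypothetical sketch of the pipeline implied by the other function names in this file (`create_audio_for_segment`, `combine_audio_segments`); the real implementation lives elsewhere in app.py:

# Sketch only: not the actual generate_podcast_from_transcript from this repo.
def generate_podcast_from_transcript(client, transcript_data, opening_sound_path, closing_sound_path):
    # One TTS clip per schema-validated segment
    clips = [
        create_audio_for_segment(client, seg["speaker"], seg["content"])
        for seg in transcript_data["segments"]
    ]
    # Stitch the clips together, bracketed by the opening/closing sounds
    return combine_audio_segments(clips, opening_sound_path, closing_sound_path)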
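With `status_output` gone, `gradio_interface` returns exactly two values on every path: `(transcript, temp_audio_path)` on success, `(transcript, None)` when audio generation fails, and `("", None)` on validation or API errors. These map positionally onto `outputs=[transcript_output, audio_download]`, with status now reported through `gr.Info`, `gr.Warning`, and `gr.Error` toasts instead of a textbox. Two caveats worth noting: `audio_download` is defined elsewhere in the Blocks layout (not shown in this diff) and receives a filesystem path, so it is presumably a `gr.Audio` or `gr.File` component; and Gradio documents `gr.Error` as an exception to be raised, so calling it without `raise`, as this code does, may not actually surface the message in the UI.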
|