lakj7 commited on
Commit
54f9f2d
·
verified ·
1 Parent(s): 6a0c15f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -117
app.py CHANGED
@@ -46,7 +46,7 @@ def create_audio_for_segment(client, speaker, content):
46
 
47
  return audio_data
48
 
49
- def combine_audio_segments(audio_segments, opening_sound_path=None, closing_sound_path=None):
50
  """Combine multiple audio segments into a single file in memory."""
51
  combined = AudioSegment.empty()
52
 
@@ -127,143 +127,165 @@ def generate_podcast(file, client, opening_sound_path=None, closing_sound_path=N
127
  closing_sound_path: Optional path to closing sound file
128
 
129
  Returns:
130
- tuple: (status, transcript, audio_bytes)
131
- - status: "success" or "error"
132
  - transcript: JSON string of the conversation transcript
133
- - audio_bytes: MP3 audio data as bytes, or None if error
134
  """
135
- try:
136
- # Read file content from the Gradio file object
137
- with open(file.name, "rb") as f:
138
- file_content = f.read()
139
 
140
- # Create temporary file for OpenAI API (it requires a file path)
141
- with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
142
- temp_file.write(file_content)
143
- temp_file_path = temp_file.name
144
 
145
- try:
146
- # Upload file to OpenAI
147
- with open(temp_file_path, "rb") as f:
148
- file_obj = client.files.create(file=f, purpose="user_data")
149
 
150
- print("Generating conversation transcript...")
151
-
152
- # Generate the conversation
153
- response = client.responses.create(
154
- model="gpt-4o",
155
- input=[
156
- {
157
- "role": "user",
158
- "content": [
159
- {
160
- "type": "input_file",
161
- "file_id": file_obj.id,
162
- },
163
- {
164
- "type": "input_text",
165
- "text": "You are a podcast curator creating podcasts for scholars by generating scripts between a paper author and a hypothetical student. You will simulate the entire discussion.\n\nGiven a resource for a paper, generate a discussion between the author of the paper Bob and a student who wants to understand it Alice. The discussion flows naturally and should be almost informal, with the author providing intuitive explanations, analogies, and simple takeaways. During the discussion segments, the student should reason with the author, creating \"aha!\" moments instead of just a Q&A session.\n\nThe roles should be clearly indicated in the script to facilitate parsing of the output. At the end, the student summarizes the entire paper, including its pros and cons.\n\n# Roles\n\n- **Author**: Provides explanations, analogies, and simple takeaways.\n- **Student**: Asks questions, reflects, and provides a summary of the paper.\n\n# Output Format\n\nThe output should clearly delineate each segment of the conversation by marking who is speaking. \n\nExample segment: \n- Author: [Author's explanation or dialogue]\n- Student: [Student's question, reasoning, or concluding summary]\n\n# Notes\n\n- Ensure the interaction is dynamic, with contributions from both the author and the student.\n- Focus on creating an educational yet engaging dialogue.\n- End with a clear, concise summary by the student, highlighting the paper's main points, pros, and cons"
166
- }
167
- ]
168
- }
169
- ],
170
- text={
171
- "format": {
172
- "type": "json_schema",
173
- "name": "conversation_schema",
174
- "schema": {
175
- "type": "object",
176
- "required": ["segments"],
177
- "properties": {
178
- "segments": {
179
- "type": "array",
180
- "items": {
181
- "type": "object",
182
- "required": ["speaker", "content"],
183
- "properties": {
184
- "content": {
185
- "type": "string",
186
- "description": "The dialogue or content spoken by the speaker."
187
- },
188
- "speaker": {
189
- "type": "string",
190
- "description": "The name of the speaker in the segment."
191
- }
192
  },
193
- "additionalProperties": False
 
 
 
194
  },
195
- "description": "A collection of dialogue segments in the conversation."
196
- }
197
- },
198
- "additionalProperties": False
199
  },
200
- "strict": True
201
- }
202
- },
203
- reasoning={},
204
- tools=[
205
- {
206
- "type": "web_search_preview",
207
- "user_location": {"type": "approximate"},
208
- "search_context_size": "medium"
209
- }
210
- ],
211
- tool_choice={"type": "web_search_preview"},
212
- temperature=1.05,
213
- max_output_tokens=4096,
214
- top_p=1,
215
- store=False
216
- )
 
 
217
 
218
- # Extract transcript
219
- transcript_json = response.model_dump()['output'][1]['content'][0]['text']
220
- transcript_data = json.loads(transcript_json)
221
-
222
- print("Generating audio...")
223
-
224
- # Generate podcast audio
225
- audio_bytes = generate_podcast_from_transcript(
226
- client,
227
- transcript_data,
228
- opening_sound_path,
229
- closing_sound_path
230
- )
231
-
232
- print("Podcast generation completed successfully!")
233
- return "success", transcript_json, audio_bytes
234
-
235
- finally:
236
- # Clean up temporary file
237
- os.unlink(temp_file_path)
238
-
239
- except Exception as e:
240
- print(f"Error generating podcast: {str(e)}")
241
- return "error", str(e), None
242
 
243
  def gradio_interface(api_key, file):
244
- """Gradio interface function"""
245
- if not api_key:
246
- return "error: API key required", "", None
 
 
247
 
 
248
  if not file:
249
- return "error: Please upload a file", "", None
 
250
 
251
  try:
252
- client = OpenAI(api_key=api_key)
253
- status, transcript, audio_bytes = generate_podcast(file, client)
 
 
 
 
 
 
 
 
 
 
254
 
255
- if status == "success" and audio_bytes:
 
 
 
256
  # Create a temporary file for Gradio to serve the audio
257
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
258
  temp_audio.write(audio_bytes)
259
  temp_audio_path = temp_audio.name
260
 
261
- return status, transcript, temp_audio_path
 
262
  else:
263
- return status, transcript, None
 
264
 
265
  except Exception as e:
266
- return f"error: {str(e)}", "", None
 
 
 
 
 
 
 
 
 
267
 
268
  # Gradio Interface
269
  with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
@@ -304,8 +326,6 @@ with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
304
  submit_btn = gr.Button("🎬 Generate Podcast", variant="primary", size="lg")
305
 
306
  # Output components
307
- status_output = gr.Textbox(label="📊 Status", interactive=False)
308
-
309
  with gr.Accordion("📝 View Transcript", open=False):
310
  transcript_output = gr.Textbox(
311
  label="Transcript JSON",
@@ -322,7 +342,7 @@ with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
322
  submit_btn.click(
323
  fn=gradio_interface,
324
  inputs=[api_key_input, file_input],
325
- outputs=[status_output, transcript_output, audio_download],
326
  show_progress=True
327
  )
328
 
 
46
 
47
  return audio_data
48
 
49
+ def combine_audio_segments(audio_segments, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
50
  """Combine multiple audio segments into a single file in memory."""
51
  combined = AudioSegment.empty()
52
 
 
127
  closing_sound_path: Optional path to closing sound file
128
 
129
  Returns:
130
+ tuple: (transcript, audio_bytes)
 
131
  - transcript: JSON string of the conversation transcript
132
+ - audio_bytes: MP3 audio data as bytes
133
  """
134
+ # Read file content from the Gradio file object
135
+ with open(file.name, "rb") as f:
136
+ file_content = f.read()
 
137
 
138
+ # Create temporary file for OpenAI API (it requires a file path)
139
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
140
+ temp_file.write(file_content)
141
+ temp_file_path = temp_file.name
142
 
143
+ try:
144
+ # Upload file to OpenAI
145
+ with open(temp_file_path, "rb") as f:
146
+ file_obj = client.files.create(file=f, purpose="user_data")
147
 
148
+ print("Generating conversation transcript...")
149
+
150
+ # Generate the conversation
151
+ response = client.responses.create(
152
+ model="gpt-4o",
153
+ input=[
154
+ {
155
+ "role": "user",
156
+ "content": [
157
+ {
158
+ "type": "input_file",
159
+ "file_id": file_obj.id,
160
+ },
161
+ {
162
+ "type": "input_text",
163
+ "text": "You are a podcast curator creating podcasts for scholars by generating scripts between a paper author and a hypothetical student. You will simulate the entire discussion.\n\nGiven a resource for a paper, generate a discussion between the author of the paper Bob and a student who wants to understand it Alice. The discussion flows naturally and should be almost informal, with the author providing intuitive explanations, analogies, and simple takeaways. During the discussion segments, the student should reason with the author, creating \"aha!\" moments instead of just a Q&A session.\n\nThe roles should be clearly indicated in the script to facilitate parsing of the output. At the end, the student summarizes the entire paper, including its pros and cons.\n\n# Roles\n\n- **Author**: Provides explanations, analogies, and simple takeaways.\n- **Student**: Asks questions, reflects, and provides a summary of the paper.\n\n# Output Format\n\nThe output should clearly delineate each segment of the conversation by marking who is speaking. \n\nExample segment: \n- Author: [Author's explanation or dialogue]\n- Student: [Student's question, reasoning, or concluding summary]\n\n# Notes\n\n- Ensure the interaction is dynamic, with contributions from both the author and the student.\n- Focus on creating an educational yet engaging dialogue.\n- End with a clear, concise summary by the student, highlighting the paper's main points, pros, and cons"
164
+ }
165
+ ]
166
+ }
167
+ ],
168
+ text={
169
+ "format": {
170
+ "type": "json_schema",
171
+ "name": "conversation_schema",
172
+ "schema": {
173
+ "type": "object",
174
+ "required": ["segments"],
175
+ "properties": {
176
+ "segments": {
177
+ "type": "array",
178
+ "items": {
179
+ "type": "object",
180
+ "required": ["speaker", "content"],
181
+ "properties": {
182
+ "content": {
183
+ "type": "string",
184
+ "description": "The dialogue or content spoken by the speaker."
 
 
 
 
 
185
  },
186
+ "speaker": {
187
+ "type": "string",
188
+ "description": "The name of the speaker in the segment."
189
+ }
190
  },
191
+ "additionalProperties": False
192
+ },
193
+ "description": "A collection of dialogue segments in the conversation."
194
+ }
195
  },
196
+ "additionalProperties": False
197
+ },
198
+ "strict": True
199
+ }
200
+ },
201
+ reasoning={},
202
+ tools=[
203
+ {
204
+ "type": "web_search_preview",
205
+ "user_location": {"type": "approximate"},
206
+ "search_context_size": "medium"
207
+ }
208
+ ],
209
+ tool_choice={"type": "web_search_preview"},
210
+ temperature=1.05,
211
+ max_output_tokens=4096,
212
+ top_p=1,
213
+ store=False
214
+ )
215
 
216
+ # Extract transcript
217
+ transcript_json = response.model_dump()['output'][1]['content'][0]['text']
218
+ transcript_data = json.loads(transcript_json)
219
+
220
+ print("Generating audio...")
221
+
222
+ # Generate podcast audio
223
+ audio_bytes = generate_podcast_from_transcript(
224
+ client,
225
+ transcript_data,
226
+ opening_sound_path,
227
+ closing_sound_path
228
+ )
229
+
230
+ print("Podcast generation completed successfully!")
231
+ return transcript_json, audio_bytes
232
+
233
+ finally:
234
+ # Clean up temporary file
235
+ os.unlink(temp_file_path)
 
 
 
 
236
 
237
  def gradio_interface(api_key, file):
238
+ """Gradio interface function with proper error handling"""
239
+ # Check if API key is provided
240
+ if not api_key or not api_key.strip():
241
+ gr.Warning("⚠️ OpenAI API Key is required!")
242
+ return "", None
243
 
244
+ # Check if file is uploaded
245
  if not file:
246
+ gr.Warning("⚠️ Please upload a PDF file!")
247
+ return "", None
248
 
249
  try:
250
+ # Initialize OpenAI client
251
+ client = OpenAI(api_key=api_key.strip())
252
+
253
+ # Test API key validity with a simple request
254
+ try:
255
+ client.models.list()
256
+ except Exception as auth_error:
257
+ if "authentication" in str(auth_error).lower() or "api key" in str(auth_error).lower():
258
+ gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
259
+ else:
260
+ gr.Error(f"❌ OpenAI API Error: {str(auth_error)}")
261
+ return "", None
262
 
263
+ # Generate podcast
264
+ transcript, audio_bytes = generate_podcast(file, client)
265
+
266
+ if audio_bytes:
267
  # Create a temporary file for Gradio to serve the audio
268
  with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
269
  temp_audio.write(audio_bytes)
270
  temp_audio_path = temp_audio.name
271
 
272
+ gr.Info("✅ Podcast generated successfully!")
273
+ return transcript, temp_audio_path
274
  else:
275
+ gr.Error("❌ Failed to generate audio. Please try again.")
276
+ return transcript, None
277
 
278
  except Exception as e:
279
+ error_msg = str(e)
280
+ if "rate limit" in error_msg.lower():
281
+ gr.Error("❌ OpenAI API rate limit exceeded. Please wait a moment and try again.")
282
+ elif "quota" in error_msg.lower():
283
+ gr.Error("❌ OpenAI API quota exceeded. Please check your account billing.")
284
+ elif "authentication" in error_msg.lower() or "api key" in error_msg.lower():
285
+ gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
286
+ else:
287
+ gr.Error(f"❌ An error occurred: {error_msg}")
288
+ return "", None
289
 
290
  # Gradio Interface
291
  with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
 
326
  submit_btn = gr.Button("🎬 Generate Podcast", variant="primary", size="lg")
327
 
328
  # Output components
 
 
329
  with gr.Accordion("📝 View Transcript", open=False):
330
  transcript_output = gr.Textbox(
331
  label="Transcript JSON",
 
342
  submit_btn.click(
343
  fn=gradio_interface,
344
  inputs=[api_key_input, file_input],
345
+ outputs=[transcript_output, audio_download],
346
  show_progress=True
347
  )
348