lakj7 committed on
Commit
cabfef2
Β·
verified Β·
1 Parent(s): 671e5f9

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +335 -0
app.py ADDED
@@ -0,0 +1,335 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from openai import OpenAI
3
+ import re
4
+ import json
5
+ import time
6
+ from pydub import AudioSegment
7
+ from io import BytesIO
8
+ import tempfile
9
+ import os
10
+
11
def create_audio_for_segment(client, speaker, content):
    """Synthesize one dialogue segment to MP3 audio held in memory.

    Args:
        client: OpenAI client used for the text-to-speech request.
        speaker: Role name ("Author" or "Student") selecting voice and style.
        content: Text to be spoken.

    Returns:
        BytesIO positioned at 0 containing the MP3 audio data.
    """
    # Per-role voice selection; unknown speakers fall back to "nova".
    voices = {
        "Author": "echo",    # Professional male voice for the author
        "Student": "alloy",  # Younger voice for the student
    }
    # Per-role delivery instructions for the TTS model.
    styles = {
        "Author": "Speak as an expert researcher explaining technical concepts clearly and confidently.",
        "Student": "Speak with curiosity and enthusiasm, as someone eager to learn.",
    }

    chosen_voice = voices.get(speaker, "nova")
    chosen_style = styles.get(speaker, "Speak naturally.")

    print(f"Generating audio for {speaker}...")

    # Stream the synthesized audio straight into an in-memory buffer.
    buffer = BytesIO()
    with client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice=chosen_voice,
        input=content,
        instructions=chosen_style,
    ) as response:
        for chunk in response.iter_bytes():
            buffer.write(chunk)
    buffer.seek(0)

    # Brief pause between requests to stay under API rate limits.
    time.sleep(0.5)

    return buffer
48
+
49
def combine_audio_segments(audio_segments, opening_sound_path=None, closing_sound_path=None):
    """Stitch per-segment MP3 buffers into one podcast MP3, returned as bytes.

    Optional opening/closing sound files bracket the dialogue. When a path
    is missing or does not exist on disk, one second of silence is used in
    its place so the output structure stays the same.

    Args:
        audio_segments: Iterable of file-like MP3 buffers (e.g. BytesIO).
        opening_sound_path: Optional path to an intro sound file.
        closing_sound_path: Optional path to an outro sound file.

    Returns:
        The combined MP3 audio as bytes.
    """
    def _load_or_silence(path):
        # Fall back to 1s of silence when no usable file is provided.
        if path and os.path.exists(path):
            return AudioSegment.from_file(path)
        return AudioSegment.silent(duration=1000)

    gap = AudioSegment.silent(duration=500)  # 500 ms pause between segments

    # Intro, then each spoken segment separated by a short pause, then outro.
    track = AudioSegment.empty()
    track += _load_or_silence(opening_sound_path) + gap
    for buf in audio_segments:
        buf.seek(0)  # Rewind: the buffer may have been read already.
        track += AudioSegment.from_file(buf, format="mp3") + gap
    track += _load_or_silence(closing_sound_path)

    # Encode the assembled track to MP3 entirely in memory.
    out = BytesIO()
    track.export(out, format="mp3")
    out.seek(0)

    print("Combined audio with opening/closing created in memory")
    return out.getvalue()
86
+
87
def generate_podcast_from_transcript(client, transcript_data, opening_sound_path=None, closing_sound_path=None):
    """Render a full podcast MP3 (as bytes) from a structured transcript.

    Args:
        client: OpenAI client, passed through to the TTS helper.
        transcript_data: Dict with a "segments" list; each segment is a dict
            with "speaker" and "content" keys.
        opening_sound_path: Optional path to an intro sound file.
        closing_sound_path: Optional path to an outro sound file.

    Returns:
        MP3 audio data as bytes.
    """
    audio_segments = []

    for segment in transcript_data['segments']:
        speaker = segment['speaker']
        content = segment['content']

        # Skip very short segments or empty content up front.
        if len(content.strip()) < 5:
            continue

        # Remove any leading "Speaker:" prefix, just in case the model
        # echoed the speaker name inside the content.
        content = re.sub(r'^\s*(Author|Student)\s*:\s*', '', content, flags=re.IGNORECASE)

        # Drop stage directions in parentheses (one nesting level) and in
        # square brackets -- they should not be spoken aloud.
        content = re.sub(r'\([^()]*\)', '', content)
        content = re.sub(r'\[[^\[\]]*\]', '', content)

        # Collapse whitespace left behind by the removals.
        content = re.sub(r'\s+', ' ', content).strip()

        # BUGFIX: re-check length AFTER cleanup. A segment such as
        # "(laughs)" passes the first gate but is empty once parentheses
        # are stripped, and would otherwise be sent to the TTS API as
        # empty input.
        if len(content) < 5:
            continue

        audio_segments.append(create_audio_for_segment(client, speaker, content))

    # Stitch all spoken segments together with the intro/outro sounds.
    return combine_audio_segments(audio_segments, opening_sound_path, closing_sound_path)
118
+
119
def _extract_output_text(response):
    """Return the first text payload found in a Responses API result.

    The output list may contain tool-call items (e.g. the web-search call)
    before the assistant message, and how many varies per request -- so we
    scan the items instead of indexing a fixed position.

    Raises:
        ValueError: if no text content is present in the response output.
    """
    for item in response.model_dump().get('output', []):
        for part in item.get('content') or []:
            text = part.get('text')
            if text:
                return text
    raise ValueError("No text output found in model response")

def generate_podcast(file, client, opening_sound_path=None, closing_sound_path=None):
    """
    Generate a podcast from a file in memory.

    Args:
        file: Gradio file object with .name attribute pointing to the file path
        client: OpenAI client instance
        opening_sound_path: Optional path to opening sound file
        closing_sound_path: Optional path to closing sound file

    Returns:
        tuple: (status, transcript, audio_bytes)
            - status: "success" or "error"
            - transcript: JSON string of the conversation transcript
            - audio_bytes: MP3 audio data as bytes, or None if error
    """
    try:
        # Read file content from the Gradio file object.
        with open(file.name, "rb") as f:
            file_content = f.read()

        # The OpenAI upload API requires a real file path, so stage the
        # bytes in a temporary file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_content)
            temp_file_path = temp_file.name

        file_obj = None
        try:
            # Upload the paper to OpenAI so the model can read it.
            with open(temp_file_path, "rb") as f:
                file_obj = client.files.create(file=f, purpose="user_data")

            print("Generating conversation transcript...")

            # Ask the model for a structured (JSON-schema-constrained)
            # author/student dialogue about the uploaded paper.
            response = client.responses.create(
                model="gpt-4o",
                input=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "input_file",
                                "file_id": file_obj.id,
                            },
                            {
                                "type": "input_text",
                                "text": "You are a podcast curator creating podcasts for scholars by generating scripts between a paper author and a hypothetical student. You will simulate the entire discussion.\n\nGiven a resource for a paper, generate a discussion between the author of the paper Bob and a student who wants to understand it Alice. The discussion flows naturally and should be almost informal, with the author providing intuitive explanations, analogies, and simple takeaways. During the discussion segments, the student should reason with the author, creating \"aha!\" moments instead of just a Q&A session.\n\nThe roles should be clearly indicated in the script to facilitate parsing of the output. At the end, the student summarizes the entire paper, including its pros and cons.\n\n# Roles\n\n- **Author**: Provides explanations, analogies, and simple takeaways.\n- **Student**: Asks questions, reflects, and provides a summary of the paper.\n\n# Output Format\n\nThe output should clearly delineate each segment of the conversation by marking who is speaking. \n\nExample segment: \n- Author: [Author's explanation or dialogue]\n- Student: [Student's question, reasoning, or concluding summary]\n\n# Notes\n\n- Ensure the interaction is dynamic, with contributions from both the author and the student.\n- Focus on creating an educational yet engaging dialogue.\n- End with a clear, concise summary by the student, highlighting the paper's main points, pros, and cons"
                            }
                        ]
                    }
                ],
                text={
                    "format": {
                        "type": "json_schema",
                        "name": "conversation_schema",
                        "schema": {
                            "type": "object",
                            "required": ["segments"],
                            "properties": {
                                "segments": {
                                    "type": "array",
                                    "items": {
                                        "type": "object",
                                        "required": ["speaker", "content"],
                                        "properties": {
                                            "content": {
                                                "type": "string",
                                                "description": "The dialogue or content spoken by the speaker."
                                            },
                                            "speaker": {
                                                "type": "string",
                                                "description": "The name of the speaker in the segment."
                                            }
                                        },
                                        "additionalProperties": False
                                    },
                                    "description": "A collection of dialogue segments in the conversation."
                                }
                            },
                            "additionalProperties": False
                        },
                        "strict": True
                    }
                },
                reasoning={},
                tools=[
                    {
                        "type": "web_search_preview",
                        "user_location": {"type": "approximate"},
                        "search_context_size": "medium"
                    }
                ],
                tool_choice={"type": "web_search_preview"},
                temperature=1.05,
                max_output_tokens=4096,
                top_p=1,
                store=False
            )

            # BUGFIX: the original hardcoded output[1] for the transcript,
            # which breaks when the response contains a different number of
            # tool-call items before the message. Scan for the text instead.
            transcript_json = _extract_output_text(response)
            transcript_data = json.loads(transcript_json)

            print("Generating audio...")

            # Turn the transcript into a single MP3 in memory.
            audio_bytes = generate_podcast_from_transcript(
                client,
                transcript_data,
                opening_sound_path,
                closing_sound_path
            )

            print("Podcast generation completed successfully!")
            return "success", transcript_json, audio_bytes

        finally:
            # Clean up the local temporary file.
            os.unlink(temp_file_path)
            # BUGFIX: also remove the uploaded copy from OpenAI storage so
            # repeated runs don't accumulate files (best effort only).
            if file_obj is not None:
                try:
                    client.files.delete(file_obj.id)
                except Exception:
                    pass

    except Exception as e:
        print(f"Error generating podcast: {str(e)}")
        return "error", str(e), None
242
+
243
def gradio_interface(api_key, file):
    """Gradio callback: validate inputs, build the client, run the pipeline.

    Returns a (status, transcript, audio_path_or_None) triple matching the
    three output components the button is wired to.
    """
    # Guard clauses for missing inputs.
    if not api_key:
        return "error: API key required", "", None
    if not file:
        return "error: Please upload a file", "", None

    try:
        client = OpenAI(api_key=api_key)
        status, transcript, audio_bytes = generate_podcast(file, client)

        if status == "success" and audio_bytes:
            # Gradio serves audio by file path, so persist the MP3 bytes
            # to a temporary file it can reference.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp:
                tmp.write(audio_bytes)
                return status, transcript, tmp.name

        return status, transcript, None

    except Exception as e:
        return f"error: {str(e)}", "", None
267
+
268
# Gradio Interface -- page layout: a left column with usage instructions and
# a right column with the inputs, followed by status/transcript/audio outputs.
# NOTE(review): exact widget nesting reconstructed from a whitespace-mangled
# scrape -- confirm against the deployed app layout.
with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            # Static instructions panel (markdown only, no interactivity).
            gr.Markdown("# πŸŽ™οΈ podXiv")
            gr.Markdown(
                """
                *⚠️ We need your OpenAI API Key!*

                **1. Upload & Analyze**
                - Upload your academic paper (PDF or DOCX)
                - AI analyzes the content and structure

                **2. Generate Script**
                - Creates natural dialogue between author and student
                - Focuses on key insights and explanations

                **3. Create Audio**
                - Converts script to high-quality speech
                - Combines segments into final podcast

                **Note:** This process may take a few minutes as we generate high-quality audio for each segment.
                """
            )
        with gr.Column(scale=2):
            with gr.Group():
                # The API key is only held for the session; it is passed to
                # gradio_interface on each click and never persisted.
                api_key_input = gr.Textbox(
                    label="πŸ€– Your OpenAI API Key",
                    type="password",
                    placeholder="sk-...",
                    info="Your API key is only used for this session and is not stored."
                )
                # Only PDFs are accepted here, despite the instructions
                # mentioning DOCX -- the file_types filter governs.
                file_input = gr.File(
                    label="πŸ“„ Upload a paper",
                    file_types=[".pdf"]
                )
            submit_btn = gr.Button("🎬 Generate Podcast", variant="primary", size="lg")

    # Output components
    status_output = gr.Textbox(label="πŸ“Š Status", interactive=False)

    # Raw JSON transcript, collapsed by default.
    with gr.Accordion("πŸ“ View Transcript", open=False):
        transcript_output = gr.Textbox(
            label="Transcript JSON",
            lines=10,
            interactive=False,
            info="Raw JSON transcript of the generated conversation"
        )

    # Final MP3, offered as a downloadable file (path returned by callback).
    audio_download = gr.File(
        label="🎡 Download Podcast Audio"
    )

    # Connect the button to the function
    submit_btn.click(
        fn=gradio_interface,
        inputs=[api_key_input, file_input],
        outputs=[status_output, transcript_output, audio_download],
        show_progress=True
    )
328
+
329
# Launch the app only when run as a script (not on import).
if __name__ == "__main__":
    launch_options = {
        "share": True,              # creates a public link
        "server_name": "0.0.0.0",   # allows external connections
        "server_port": 7860,        # default Gradio port
    }
    demo.launch(**launch_options)
+ )