import gradio as gr
from openai import OpenAI
import re
import json
import time
from pydub import AudioSegment
from io import BytesIO
import tempfile
import os

def create_audio_for_segment(client, speaker, content):
    """Generate speech audio for a specific segment in memory"""
    # Map speakers to different voices
    voice_mapping = {
        "Author": "echo",     # Professional male voice for the author
        "Student": "alloy",   # Younger voice for the student
    }
    
    # Set speaking style based on speaker role
    instructions_mapping = {
        "Author": "Speak as an expert researcher explaining technical concepts clearly and confidently.",
        "Student": "Speak with curiosity and enthusiasm, as someone eager to learn.",
    }
    
    # Get the appropriate voice and instructions
    voice = voice_mapping.get(speaker, "nova")
    instructions = instructions_mapping.get(speaker, "Speak naturally.")
    
    print(f"Generating audio for {speaker}...")
    
    # Create audio in memory
    with client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice=voice,
        input=content,
        instructions=instructions,
    ) as response:
        # Read the audio data into memory
        audio_data = BytesIO()
        for chunk in response.iter_bytes():
            audio_data.write(chunk)
        audio_data.seek(0)
    
    # Small delay to avoid rate limiting
    time.sleep(0.5)
    
    return audio_data
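
# Minimal usage sketch (illustrative only; assumes a valid API key, and
# "segment.mp3" is a hypothetical output path not used elsewhere in this app):
#
#   client = OpenAI(api_key="sk-...")
#   audio = create_audio_for_segment(client, "Author", "Welcome to the show.")
#   with open("segment.mp3", "wb") as f:
#       f.write(audio.read())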

def combine_audio_segments(audio_segments, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """Combine multiple audio segments into a single file in memory."""
    combined = AudioSegment.empty()
    
    # Load opening and closing sounds if provided
    if opening_sound_path and os.path.exists(opening_sound_path):
        opening_sound = AudioSegment.from_file(opening_sound_path)
    else:
        opening_sound = AudioSegment.silent(duration=1000)  # 1 second silence as fallback
    
    if closing_sound_path and os.path.exists(closing_sound_path):
        closing_sound = AudioSegment.from_file(closing_sound_path)
    else:
        closing_sound = AudioSegment.silent(duration=1000)  # 1 second silence as fallback
    
    # Add a short pause between segments
    pause = AudioSegment.silent(duration=500)  # 500ms pause

    # Start with opening sound
    combined += opening_sound + pause

    # Add each segment with pause
    for audio_data in audio_segments:
        audio_data.seek(0)  # Reset position
        segment = AudioSegment.from_file(audio_data, format="mp3")
        combined += segment + pause

    # End with closing sound
    combined += closing_sound

    # Export to bytes
    output_buffer = BytesIO()
    combined.export(output_buffer, format="mp3")
    output_buffer.seek(0)
    
    print("Combined audio with opening/closing created in memory")
    return output_buffer.getvalue()
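
# Resulting timeline: opening sound, 0.5 s pause, segment 1, pause, ...,
# last segment, pause, closing sound, all exported as one MP3 byte string.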

def generate_podcast_from_transcript(client, transcript_data, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """Generate a podcast from transcript data in memory"""
    segments = transcript_data['segments']
    audio_segments = []

    for segment in segments:
        speaker = segment['speaker']
        content = segment['content']

        # Skip very short segments or empty content
        if len(content.strip()) < 5:
            continue

        # Remove any leading "Speaker:" prefix, just in case it's included in content
        content = re.sub(r'^\s*(Author|Student)\s*:\s*', '', content, flags=re.IGNORECASE)

        # Remove any text in parentheses (innermost level only; nested
        # parentheses would need repeated passes)
        content = re.sub(r'\([^()]*\)', '', content)

        # Remove any text in square brackets
        content = re.sub(r'\[[^\[\]]*\]', '', content)

        # Optionally, strip extra spaces after removing parentheses
        content = re.sub(r'\s+', ' ', content).strip()

        audio_data = create_audio_for_segment(client, speaker, content)
        audio_segments.append(audio_data)
        
    # Combine all audio segments
    audio_bytes = combine_audio_segments(audio_segments, opening_sound_path, closing_sound_path)
    return audio_bytes
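
# Expected transcript shape: a hypothetical example matching the
# conversation_schema defined in generate_podcast() below:
#
#   transcript_data = {
#       "segments": [
#           {"speaker": "Author", "content": "The core idea of the paper is ..."},
#           {"speaker": "Student", "content": "So it's a bit like ...?"},
#       ]
#   }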

def generate_podcast(file, client, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
    """
    Generate a podcast from a file in memory.
    
    Args:
        file: Gradio file object with .name attribute pointing to the file path
        client: OpenAI client instance
        opening_sound_path: Optional path to opening sound file
        closing_sound_path: Optional path to closing sound file
    
    Returns:
        tuple: (transcript, audio_bytes)
            - transcript: JSON string of the conversation transcript
            - audio_bytes: MP3 audio data as bytes
    """
    # Read file content from the Gradio file object
    with open(file.name, "rb") as f:
        file_content = f.read()

    # Create temporary file for OpenAI API (it requires a file path)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
        temp_file.write(file_content)
        temp_file_path = temp_file.name

    try:
        # Upload file to OpenAI
        with open(temp_file_path, "rb") as f:
            file_obj = client.files.create(file=f, purpose="user_data")

        print("Generating conversation transcript...")
        
        # Generate the conversation
        response = client.responses.create(
            model="gpt-4o",
            input=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "input_file",
                            "file_id": file_obj.id,
                        },
                        {
                            "type": "input_text",
                            "text": "You are a podcast curator creating podcasts for scholars by generating scripts between a paper author and a hypothetical student. You will simulate the entire discussion.\n\nGiven a resource for a paper, generate a discussion between the author of the paper Bob and a student who wants to understand it Alice. The discussion flows naturally and should be almost informal, with the author providing intuitive explanations, analogies, and simple takeaways. During the discussion segments, the student should reason with the author, creating \"aha!\" moments instead of just a Q&A session.\n\nThe roles should be clearly indicated in the script to facilitate parsing of the output. At the end, the student summarizes the entire paper, including its pros and cons.\n\n# Roles\n\n- **Author**: Provides explanations, analogies, and simple takeaways.\n- **Student**: Asks questions, reflects, and provides a summary of the paper.\n\n# Output Format\n\nThe output should clearly delineate each segment of the conversation by marking who is speaking. \n\nExample segment: \n- Author: [Author's explanation or dialogue]\n- Student: [Student's question, reasoning, or concluding summary]\n\n# Notes\n\n- Ensure the interaction is dynamic, with contributions from both the author and the student.\n- Focus on creating an educational yet engaging dialogue.\n- End with a clear, concise summary by the student, highlighting the paper's main points, pros, and cons"
                        }
                    ]
                }
            ],
            text={
                "format": {
                    "type": "json_schema",
                    "name": "conversation_schema",
                    "schema": {
                        "type": "object",
                        "required": ["segments"],
                        "properties": {
                            "segments": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "required": ["speaker", "content"],
                                    "properties": {
                                        "content": {
                                            "type": "string",
                                            "description": "The dialogue or content spoken by the speaker."
                                        },
                                        "speaker": {
                                            "type": "string",
                                            "description": "The name of the speaker in the segment."
                                        }
                                    },
                                    "additionalProperties": False
                                },
                                "description": "A collection of dialogue segments in the conversation."
                            }
                        },
                        "additionalProperties": False
                    },
                    "strict": True
                }
            },
            reasoning={},
            tools=[
                {
                    "type": "web_search_preview",
                    "user_location": {"type": "approximate"},
                    "search_context_size": "medium"
                }
            ],
            tool_choice={"type": "web_search_preview"},
            temperature=1.05,
            max_output_tokens=4096,
            top_p=1,
            store=False
        )

        # Extract the transcript. Because tool_choice forces a web search, the
        # first item in response.output is the tool call and the model's
        # message (the JSON transcript) is the second.
        transcript_json = response.model_dump()['output'][1]['content'][0]['text']
        transcript_data = json.loads(transcript_json)
        
        print("Generating audio...")
        
        # Generate podcast audio
        audio_bytes = generate_podcast_from_transcript(
            client, 
            transcript_data, 
            opening_sound_path, 
            closing_sound_path
        )
        
        print("Podcast generation completed successfully!")
        return transcript_json, audio_bytes
        
    finally:
        # Clean up temporary file
        os.unlink(temp_file_path)
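
# Standalone usage sketch (outside Gradio). Any object exposing a .name
# attribute that points at a local PDF will do; SimpleNamespace is a
# stand-in for Gradio's file object, and "paper.pdf" is a hypothetical path:
#
#   from types import SimpleNamespace
#   client = OpenAI(api_key="sk-...")
#   transcript, audio = generate_podcast(SimpleNamespace(name="paper.pdf"), client)
#   with open("podcast.mp3", "wb") as f:
#       f.write(audio)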

def gradio_interface(api_key, file):
    """Gradio interface function with proper error handling"""
    # Check if API key is provided
    if not api_key or not api_key.strip():
        gr.Warning("⚠️ OpenAI API Key is required!")
        return "", None
    
    # Check if file is uploaded
    if not file:
        gr.Warning("⚠️ Please upload a PDF file!")
        return "", None
    
    try:
        # Initialize OpenAI client
        client = OpenAI(api_key=api_key.strip())
        
        # Test API key validity with a simple request
        try:
            client.models.list()
        except Exception as auth_error:
            if "authentication" in str(auth_error).lower() or "api key" in str(auth_error).lower():
                gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
            else:
                gr.Error(f"❌ OpenAI API Error: {str(auth_error)}")
            return "", None
        
        # Generate podcast
        transcript, audio_bytes = generate_podcast(file, client)
        
        if audio_bytes:
            # Create a temporary file for Gradio to serve the audio
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
                temp_audio.write(audio_bytes)
                temp_audio_path = temp_audio.name
            
            gr.Info("✅ Podcast generated successfully!")
            return transcript, temp_audio_path
        else:
            gr.Error("❌ Failed to generate audio. Please try again.")
            return transcript, None
            
    except Exception as e:
        error_msg = str(e)
        if "rate limit" in error_msg.lower():
            gr.Error("❌ OpenAI API rate limit exceeded. Please wait a moment and try again.")
        elif "quota" in error_msg.lower():
            gr.Error("❌ OpenAI API quota exceeded. Please check your account billing.")
        elif "authentication" in error_msg.lower() or "api key" in error_msg.lower():
            gr.Error("❌ Invalid OpenAI API Key. Please check your key and try again.")
        else:
            gr.Error(f"❌ An error occurred: {error_msg}")
        return "", None

# Gradio Interface
with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("# πŸŽ™οΈ podXiv")
            gr.Markdown(
                """
                *⚠️ We need your OpenAI API Key!*

                Welcome to **podXiv**. We turn a PDF paper into a podcast you can listen to!

                ---
               
                **1. Upload & Analyze**
                - Upload your academic paper (could be any PDF)
                - AI analyzes the content and structure
               
                **2. Generate Dialogue**
                - Creates natural dialogue between author and student
                - Focuses on key insights and explanations
               
                **3. Create Audio**
                - Combines the dialogue into a final podcast
                
                **Note:** This process can take a few minutes (give or take about 80 seconds), since high-quality audio is generated for each segment.
                The finished podcast is typically between 2:30 and 5:00 minutes long.
                Also, the web app might not work on mobile.
                """
            )
        with gr.Column(scale=2):
            with gr.Group():
                api_key_input = gr.Textbox(
                    label="πŸ€– Your OpenAI API Key",
                    type="password",
                    placeholder="sk-...",
                    info="Your API key is only used for this session and is not stored."
                )
                file_input = gr.File(
                    label="πŸ“„ Upload a paper", 
                    file_types=[".pdf"]
                )
                submit_btn = gr.Button("🎬 Generate Podcast", variant="primary", size="lg")
                
                # Output components
                with gr.Accordion("📝 View Transcript", open=False):
                    transcript_output = gr.Textbox(
                        label="Transcript JSON", 
                        lines=10, 
                        interactive=False,
                        info="Raw JSON transcript of the generated conversation"
                    )
                
                audio_download = gr.File(
                    label="🎡 Download Podcast Audio"
                )
                
                # Connect the button to the function
                submit_btn.click(
                    fn=gradio_interface,
                    inputs=[api_key_input, file_input],
                    outputs=[transcript_output, audio_download],
                    show_progress=True
                )

# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,  # Creates a public link
        server_name="0.0.0.0",  # Allows external connections
        server_port=7860  # Default Gradio port
    )