Spaces:

lakj7
/

podXiv

Sleeping

App Files Files Community

podXiv / app.py

lakj7

Update app.py

d044e82 verified 4 months ago

raw

history blame contribute delete

15.2 kB

	import gradio as gr
	from openai import OpenAI
	import re
	import json
	import time
	from pydub import AudioSegment
	from io import BytesIO
	import tempfile
	import os

	# Voice used for each known speaker role.
	_VOICE_BY_SPEAKER = {
	    "Author": "echo",  # Professional male voice for the author
	    "Student": "alloy",  # Younger voice for the student
	}
	_DEFAULT_VOICE = "nova"

	# Speaking-style instruction sent to the TTS model for each known role.
	_INSTRUCTIONS_BY_SPEAKER = {
	    "Author": "Speak as an expert researcher explaining technical concepts clearly and confidently.",
	    "Student": "Speak with curiosity and enthusiasm, as someone eager to learn.",
	}
	_DEFAULT_INSTRUCTIONS = "Speak naturally."


	def create_audio_for_segment(client, speaker, content, delay=0.5):
	    """Generate speech audio for one transcript segment, entirely in memory.

	    Args:
	        client: OpenAI client; only
	            ``client.audio.speech.with_streaming_response.create`` is used.
	        speaker: Role label of the segment. "Author" and "Student" get
	            dedicated voices and styles; anything else uses the defaults.
	        content: Text to be spoken.
	        delay: Seconds to sleep after the request to stay clear of rate
	            limits. Pass 0 (or None) to disable the pause.

	    Returns:
	        io.BytesIO: The audio bytes, with the stream rewound to position 0.
	    """
	    voice = _VOICE_BY_SPEAKER.get(speaker, _DEFAULT_VOICE)
	    instructions = _INSTRUCTIONS_BY_SPEAKER.get(speaker, _DEFAULT_INSTRUCTIONS)

	    print(f"Generating audio for {speaker}...")

	    audio_data = BytesIO()
	    with client.audio.speech.with_streaming_response.create(
	        model="gpt-4o-mini-tts",
	        voice=voice,
	        input=content,
	        instructions=instructions,
	        # Requested explicitly because combine_audio_segments decodes the
	        # returned buffers with format="mp3".
	        response_format="mp3",
	    ) as response:
	        # Stream the response chunk by chunk into the in-memory buffer.
	        for chunk in response.iter_bytes():
	            audio_data.write(chunk)
	    audio_data.seek(0)

	    # Small delay to avoid rate limiting when segments are generated back to back.
	    if delay and delay > 0:
	        time.sleep(delay)

	    return audio_data

	def combine_audio_segments(audio_segments, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
	    """Join the spoken segments into one MP3, framed by intro and outro sounds.

	    Args:
	        audio_segments: Iterable of seekable binary buffers, each decoded as MP3.
	        opening_sound_path: File played first; one second of silence is used
	            when the path is empty or does not exist.
	        closing_sound_path: File played last; same silence fallback.

	    Returns:
	        bytes: The combined audio exported as MP3.
	    """

	    def _sound_or_silence(path):
	        # Missing or unset jingle -> 1 second of silence as fallback.
	        if not path or not os.path.exists(path):
	            return AudioSegment.silent(duration=1000)
	        return AudioSegment.from_file(path)

	    mix = AudioSegment.empty()
	    intro = _sound_or_silence(opening_sound_path)
	    outro = _sound_or_silence(closing_sound_path)

	    # 500 ms of silence inserted after the intro and after every segment.
	    gap = AudioSegment.silent(duration=500)

	    mix = mix + (intro + gap)

	    for buffer in audio_segments:
	        buffer.seek(0)  # the buffer may already have been read
	        spoken = AudioSegment.from_file(buffer, format="mp3")
	        mix = mix + (spoken + gap)

	    mix = mix + outro

	    # Render the final mix into an in-memory MP3.
	    rendered = BytesIO()
	    mix.export(rendered, format="mp3")
	    rendered.seek(0)

	    print("Combined audio with opening/closing created in memory")
	    return rendered.getvalue()

	# Leading "Author:" / "Student:" label, any case, optional surrounding spaces.
	_SPEAKER_PREFIX_RE = re.compile(r'^\s*(?:Author|Student)\s*:\s*', re.IGNORECASE)
	# Innermost "(...)" and "[...]" groups; applied repeatedly to peel nesting.
	_PARENTHETICAL_RE = re.compile(r'\([^()]*\)')
	_BRACKETED_RE = re.compile(r'\[[^\[\]]*\]')
	_WHITESPACE_RE = re.compile(r'\s+')
	# Segments with fewer spoken characters than this are not sent to TTS.
	_MIN_SPOKEN_CHARS = 5


	def _strip_enclosed(pattern, text):
	    """Remove every match of *pattern* until none is left (handles nesting)."""
	    previous = None
	    while previous != text:
	        previous = text
	        text = pattern.sub('', text)
	    return text


	def _clean_segment_text(content):
	    """Return the text that should actually be spoken for one segment."""
	    # Remove a leading "Speaker:" prefix, just in case it is included in content.
	    text = _SPEAKER_PREFIX_RE.sub('', content)
	    # Drop parenthesised and bracketed asides.
	    text = _strip_enclosed(_PARENTHETICAL_RE, text)
	    text = _strip_enclosed(_BRACKETED_RE, text)
	    # Collapse the whitespace left behind by the removals.
	    return _WHITESPACE_RE.sub(' ', text).strip()


	def generate_podcast_from_transcript(client, transcript_data, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
	    """Generate the podcast audio for a parsed transcript, in memory.

	    Args:
	        client: OpenAI client, forwarded to create_audio_for_segment.
	        transcript_data: Dict with a 'segments' list; every item provides
	            'speaker' and 'content'.
	        opening_sound_path: Forwarded to combine_audio_segments.
	        closing_sound_path: Forwarded to combine_audio_segments.

	    Returns:
	        Whatever combine_audio_segments returns for the generated segments.
	    """
	    audio_segments = []

	    for segment in transcript_data['segments']:
	        spoken_text = _clean_segment_text(segment['content'])

	        # Decide on the cleaned text: a segment that was only a stage
	        # direction is empty by now and must not reach the TTS request.
	        if len(spoken_text) < _MIN_SPOKEN_CHARS:
	            continue

	        audio_segments.append(
	            create_audio_for_segment(client, segment['speaker'], spoken_text)
	        )

	    # Combine all audio segments
	    return combine_audio_segments(audio_segments, opening_sound_path, closing_sound_path)

	# Instruction sent together with the uploaded paper.
	_PODCAST_PROMPT = "You are a podcast curator creating podcasts for scholars by generating scripts between a paper author and a hypothetical student. You will simulate the entire discussion.\n\nGiven a resource for a paper, generate a discussion between the author of the paper Bob and a student who wants to understand it Alice. The discussion flows naturally and should be almost informal, with the author providing intuitive explanations, analogies, and simple takeaways. During the discussion segments, the student should reason with the author, creating \"aha!\" moments instead of just a Q&A session.\n\nThe roles should be clearly indicated in the script to facilitate parsing of the output. At the end, the student summarizes the entire paper, including its pros and cons.\n\n# Roles\n\n- Author: Provides explanations, analogies, and simple takeaways.\n- Student: Asks questions, reflects, and provides a summary of the paper.\n\n# Output Format\n\nThe output should clearly delineate each segment of the conversation by marking who is speaking. \n\nExample segment: \n- Author: [Author's explanation or dialogue]\n- Student: [Student's question, reasoning, or concluding summary]\n\n# Notes\n\n- Ensure the interaction is dynamic, with contributions from both the author and the student.\n- Focus on creating an educational yet engaging dialogue.\n- End with a clear, concise summary by the student, highlighting the paper's main points, pros, and cons"

	# Structured-output format: {"segments": [{"speaker": ..., "content": ...}]}.
	_CONVERSATION_FORMAT = {
	    "format": {
	        "type": "json_schema",
	        "name": "conversation_schema",
	        "schema": {
	            "type": "object",
	            "required": ["segments"],
	            "properties": {
	                "segments": {
	                    "type": "array",
	                    "items": {
	                        "type": "object",
	                        "required": ["speaker", "content"],
	                        "properties": {
	                            "content": {
	                                "type": "string",
	                                "description": "The dialogue or content spoken by the speaker."
	                            },
	                            "speaker": {
	                                "type": "string",
	                                # Restricted to the role labels that the
	                                # per-speaker voice table is keyed on.
	                                "enum": ["Author", "Student"],
	                                "description": "The name of the speaker in the segment."
	                            }
	                        },
	                        "additionalProperties": False
	                    },
	                    "description": "A collection of dialogue segments in the conversation."
	                }
	            },
	            "additionalProperties": False
	        },
	        "strict": True
	    }
	}


	def generate_podcast(file, client, opening_sound_path="codecopen.wav", closing_sound_path="codecover.wav"):
	    """
	    Generate a podcast from a file in memory.

	    Args:
	        file: Gradio file object with .name attribute pointing to the file
	            path; a plain path string is accepted as well
	        client: OpenAI client instance
	        opening_sound_path: Optional path to opening sound file
	        closing_sound_path: Optional path to closing sound file

	    Returns:
	        tuple: (transcript, audio_bytes)
	        - transcript: JSON string of the conversation transcript
	        - audio_bytes: MP3 audio data as bytes

	    Raises:
	        RuntimeError: If the model response contains no transcript text.
	        json.JSONDecodeError: If the transcript text is not valid JSON
	            (for example when it was cut off by the output-token limit).
	    """
	    # Accept both an object exposing .name and a bare path.
	    source_path = getattr(file, "name", file)
	    with open(source_path, "rb") as f:
	        file_content = f.read()

	    # Copy into a temporary file with a .pdf suffix for the upload.
	    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
	    temp_file_path = temp_file.name
	    file_obj = None

	    try:
	        # The write happens inside the try so the temp file is removed even
	        # when writing fails.
	        with temp_file:
	            temp_file.write(file_content)

	        # Upload file to OpenAI
	        with open(temp_file_path, "rb") as f:
	            file_obj = client.files.create(file=f, purpose="user_data")

	        print("Generating conversation transcript...")

	        # Generate the conversation
	        response = client.responses.create(
	            model="gpt-4o",
	            input=[
	                {
	                    "role": "user",
	                    "content": [
	                        {
	                            "type": "input_file",
	                            "file_id": file_obj.id,
	                        },
	                        {
	                            "type": "input_text",
	                            "text": _PODCAST_PROMPT
	                        }
	                    ]
	                }
	            ],
	            text=_CONVERSATION_FORMAT,
	            reasoning={},
	            tools=[
	                {
	                    "type": "web_search_preview",
	                    "user_location": {"type": "approximate"},
	                    "search_context_size": "medium"
	                }
	            ],
	            tool_choice={"type": "web_search_preview"},
	            temperature=1.05,
	            max_output_tokens=4096,
	            top_p=1,
	            store=False
	        )

	        # Extract transcript. output_text gathers the text of the message
	        # items, so this does not depend on where the message sits among
	        # the tool-call items in response.output.
	        transcript_json = response.output_text
	        if not transcript_json:
	            raise RuntimeError("The model returned no transcript text.")
	        transcript_data = json.loads(transcript_json)

	        print("Generating audio...")

	        # Generate podcast audio
	        audio_bytes = generate_podcast_from_transcript(
	            client,
	            transcript_data,
	            opening_sound_path,
	            closing_sound_path
	        )

	        print("Podcast generation completed successfully!")
	        return transcript_json, audio_bytes

	    finally:
	        # Best-effort removal of the uploaded paper from the OpenAI account;
	        # a failure here must not mask the real outcome of the call.
	        if file_obj is not None:
	            try:
	                client.files.delete(file_obj.id)
	            except Exception as cleanup_error:
	                print(f"Warning: could not delete uploaded file {file_obj.id}: {cleanup_error}")

	        # Clean up temporary file
	        os.unlink(temp_file_path)

	def _friendly_error_message(error):
	    """Translate an exception into the message shown to the user."""
	    error_msg = str(error)
	    lowered = error_msg.lower()
	    if "rate limit" in lowered:
	        return "❌ OpenAI API rate limit exceeded. Please wait a moment and try again."
	    if "quota" in lowered:
	        return "❌ OpenAI API quota exceeded. Please check your account billing."
	    if "authentication" in lowered or "api key" in lowered:
	        return "❌ Invalid OpenAI API Key. Please check your key and try again."
	    return f"❌ An error occurred: {error_msg}"


	def gradio_interface(api_key, file):
	    """Gradio click handler: validate the inputs and build the podcast.

	    Args:
	        api_key: OpenAI API key typed by the user.
	        file: Uploaded file value from the Gradio file component.

	    Returns:
	        tuple: (transcript, audio_path). Both are empty ("" and None) when
	        an input is missing; audio_path is None when no audio was produced.

	    Raises:
	        gr.Error: When the key check or the podcast generation fails.
	            gr.Error is an exception and only reaches the user when it is
	            raised; merely constructing it shows nothing.
	    """
	    # Check if API key is provided
	    if not api_key or not api_key.strip():
	        gr.Warning("⚠️ OpenAI API Key is required!")
	        return "", None

	    # Check if file is uploaded
	    if not file:
	        gr.Warning("⚠️ Please upload a PDF file!")
	        return "", None

	    # Test API key validity with a simple request
	    try:
	        client = OpenAI(api_key=api_key.strip())
	        client.models.list()
	    except Exception as auth_error:
	        reason = str(auth_error).lower()
	        if "authentication" in reason or "api key" in reason:
	            message = "❌ Invalid OpenAI API Key. Please check your key and try again."
	        else:
	            message = f"❌ OpenAI API Error: {str(auth_error)}"
	        raise gr.Error(message) from auth_error

	    # Generate podcast. The gr.Error is raised from the handler of this
	    # try block, so it is not caught again and re-mapped.
	    try:
	        transcript, audio_bytes = generate_podcast(file, client)

	        temp_audio_path = None
	        if audio_bytes:
	            # Create a temporary file for Gradio to serve the audio
	            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
	                temp_audio.write(audio_bytes)
	                temp_audio_path = temp_audio.name
	    except Exception as e:
	        raise gr.Error(_friendly_error_message(e)) from e

	    if temp_audio_path is None:
	        # A warning instead of a raised error, so that the transcript is
	        # still returned to the UI.
	        gr.Warning("❌ Failed to generate audio. Please try again.")
	        return transcript, None

	    gr.Info("✅ Podcast generated successfully!")
	    return transcript, temp_audio_path

	# Gradio Interface
	# NOTE(review): the nesting of the layout containers below was reconstructed
	# from a whitespace-collapsed copy of this file; confirm it against the
	# intended page layout.

	# Introductory text shown next to the input form.
	_INTRO_MARKDOWN = """
	⚠️ We need your OpenAI API Key!

	Welcome to podXiv. We convert a PDF paper into an audible podcast!

	---

	1. Upload & Analyze
	- Upload your academic paper (could be any PDF)
	- AI analyzes the content and structure

	2. Generate Dialogue
	- Creates natural dialogue between author and student
	- Focuses on key insights and explanations

	3. Create Audio
	- Combines the dialogue into a final podcast

	Note: This process may take a few minutes (+/- 80 seconds) as we generate high-quality audio for each segment.
	The overall podcast has a length between 2.30 and 5.00 minutes.
	Also, the webapp might not work on mobile.
	"""

	with gr.Blocks(title="podXiv - Academic Paper to Podcast") as demo:
	    with gr.Row():
	        # Left column: title and usage notes.
	        with gr.Column(scale=1):
	            gr.Markdown("# 🎙️ podXiv")
	            gr.Markdown(_INTRO_MARKDOWN)

	        # Right column: inputs first, then the generated outputs.
	        with gr.Column(scale=2):
	            with gr.Group():
	                api_key_input = gr.Textbox(
	                    label="🤖 Your OpenAI API Key",
	                    type="password",
	                    placeholder="sk-...",
	                    info="Your API key is only used for this session and is not stored.",
	                )
	                file_input = gr.File(
	                    label="📄 Upload a paper",
	                    file_types=[".pdf"],
	                )
	                submit_btn = gr.Button("🎬 Generate Podcast", variant="primary", size="lg")

	            # Output components
	            with gr.Accordion("📝 View Transcript", open=False):
	                transcript_output = gr.Textbox(
	                    label="Transcript JSON",
	                    lines=10,
	                    interactive=False,
	                    info="Raw JSON transcript of the generated conversation",
	                )

	            audio_download = gr.File(label="🎵 Download Podcast Audio")

	    # Connect the button to the function: the two inputs are passed to
	    # gradio_interface and its two return values fill the two outputs.
	    submit_btn.click(
	        fn=gradio_interface,
	        inputs=[api_key_input, file_input],
	        outputs=[transcript_output, audio_download],
	        show_progress=True,
	    )

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	    launch_options = {
	        "share": True,  # Creates a public link
	        "server_name": "0.0.0.0",  # Allows external connections
	        "server_port": 7860,  # Default Gradio port
	    }
	    demo.launch(**launch_options)