"""Gradio front end that turns a web page into a narrated podcast.

The pipeline is: scrape a URL with a browser agent, turn the scraped text into
a script through OpenRouter, then synthesize audio through ElevenLabs.
"""

import asyncio
import os
import time
from dataclasses import dataclass
from typing import List, Optional, AsyncGenerator

import gradio as gr
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from rich.console import Console
from rich.panel import Panel
from rich.text import Text

from logger import setup_logger, log_execution_time, log_async_execution_time
from browser_use import Agent, Browser
from browser_use.browser.browser import BrowserContext
from api_clients import OpenRouterClient, ElevenLabsClient

load_dotenv()

console = Console()
logger = setup_logger("interface")


@dataclass
class ActionResult:
    """Record describing the outcome of a single agent action."""

    is_done: bool
    extracted_content: Optional[str]
    error: Optional[str]
    include_in_memory: bool


@dataclass
class AgentHistoryList:
    """Container pairing action results with the raw model outputs."""

    all_results: List[ActionResult]
    all_model_outputs: List[dict]


def parse_agent_history(history_str: str) -> None:
    """Pretty-print the extracted content of each step in an agent history.

    The history is expected to be the textual representation of the agent
    run, containing ``ActionResult(...)`` entries. Each entry that carries a
    non-empty ``extracted_content=`` value is rendered as a blue panel on the
    module-level rich console.

    Args:
        history_str: Stringified agent history to scan.
    """
    # Split the content into sections based on ActionResult entries
    sections = history_str.split('ActionResult(')

    for i, section in enumerate(sections[1:], 1):  # Skip first empty section
        # Extract relevant information
        content = ''
        if 'extracted_content=' in section:
            # NOTE(review): this keeps only the text up to the first comma, so
            # extracted content that itself contains a comma is truncated.
            content = section.split('extracted_content=')[1].split(',')[0].strip("'")

        if content:
            header = Text(f'Step {i}', style='bold blue')
            panel = Panel(content, title=header, border_style='blue')
            console.print(panel)
            console.print()


def _result_to_text(result: object) -> str:
    """Normalize the value returned by ``Agent.run()`` to plain text.

    Strings are returned unchanged. For any other object the helper prefers a
    callable ``final_result()`` (the accessor browser-use documents on its
    run history) and falls back to ``str(result)`` when that accessor is
    missing or yields nothing.
    """
    if isinstance(result, str):
        return result

    final_result = getattr(result, 'final_result', None)
    if callable(final_result):
        text = final_result()
        if text:
            return str(text)

    return str(result)


async def run_browser_task(
    task: str,
    api_key: str,
    provider: str = 'openai',
    model: str = 'gpt-4-vision',
    headless: bool = True,
) -> str:
    """Run a free-form browser task with the selected LLM provider.

    Args:
        task: Natural-language description of what the agent should do.
        api_key: API key for the chosen provider; exported to the matching
            environment variable before the LLM client is created.
        provider: ``'openai'``, ``'anthropic'``; any other value is treated
            as Google.
        model: Model identifier passed to the LLM client.
        headless: Whether the browser should run without a visible window.

    Returns:
        The agent's result, or a human-readable message when the key is
        missing or the run raises.
    """
    if not api_key.strip():
        return 'Please provide an API key'

    if provider == 'openai':
        os.environ['OPENAI_API_KEY'] = api_key
        llm = ChatOpenAI(model=model)
    elif provider == 'anthropic':
        os.environ['ANTHROPIC_API_KEY'] = api_key
        # NOTE(review): ChatAnthropic is not imported in this file; this
        # branch raises NameError unless the name is provided elsewhere.
        llm = ChatAnthropic(model=model)
    else:  # google
        os.environ['GOOGLE_API_KEY'] = api_key
        # NOTE(review): ChatGoogleGenerativeAI is not imported in this file
        # either; same caveat as the Anthropic branch.
        llm = ChatGoogleGenerativeAI(model=model)

    try:
        agent = Agent(
            task=task,
            llm=llm,
            # Honor the caller's choice instead of always forcing headless.
            browser=Browser(BrowserContext(headless=headless)),
        )
        result = await agent.run()
        # TODO: The result could be parsed better
        return result
    except Exception as e:
        return f'Error: {str(e)}'


@log_async_execution_time(logger)
async def scrape_content(url: str) -> str:
    """
    Scrape and summarize content from the given URL using browser automation

    This function performs the following steps:
    1. Validates the input URL
    2. Initializes the browser agent
    3. Extracts and summarizes the content

    Args:
        url: Target URL to scrape

    Returns:
        Summarized content suitable for podcast generation

    Raises:
        ValueError: If the URL does not start with http:// or https://
        Exception: Any error raised by the agent run is logged and re-raised
    """
    logger.info(f"Starting content scrape for URL: {url}")

    # Input validation
    if not url.startswith(('http://', 'https://')):
        logger.error(f"Invalid URL format: {url}")
        raise ValueError("URL must start with http:// or https://")

    try:
        logger.debug("Initializing LLM and browser agent")
        llm = ChatOpenAI(model="gpt-4")
        agent = Agent(
            task=f"Visit this URL: {url} and extract the main content. Summarize it in a clear and concise way.",
            llm=llm,
            browser=Browser(BrowserContext(headless=True)),
        )

        logger.info("Executing content extraction")
        # The agent may hand back a history object rather than a string;
        # normalize it so the length/preview logging and callers get text.
        content = _result_to_text(await agent.run())

        logger.debug(f"Content extraction successful. Length: {len(content)} chars")
        logger.debug(f"Content preview: {content[:200]}...")

        return content
    except Exception:
        logger.error(f"Content extraction failed for {url}", exc_info=True)
        raise


# NOTE(review): log_async_execution_time is applied to an async *generator*
# here; confirm the decorator supports that (a wrapper that awaits the call
# would not).
@log_async_execution_time(logger)
async def create_podcast(
    url: str,
    prompt: str,
    elevenlabs_key: str,
    voice_id: str,
    openrouter_key: str,
    model_id: str,
) -> AsyncGenerator[tuple[Optional[str], str], None]:
    """
    Create a podcast through a multi-step process:
    1. Content extraction from URL
    2. Script generation using AI
    3. Voice synthesis

    Progress updates are yielded at each step for UI feedback.

    Args:
        url: Page to scrape for source material.
        prompt: User instructions forwarded to script generation.
        elevenlabs_key: API key for the ElevenLabs client.
        voice_id: Voice passed to audio generation.
        openrouter_key: API key for the OpenRouter client.
        model_id: Model passed to script generation.

    Yields:
        ``(audio_path, status)`` tuples. ``audio_path`` is ``None`` for
        progress and error updates and the saved MP3 path on success.
    """
    logger.info(f"Starting podcast creation for URL: {url}")
    logger.debug(f"Parameters - Voice: {voice_id}, Model: {model_id}")
    logger.debug(f"Prompt length: {len(prompt)} chars")

    try:
        # Initialize clients with validation
        logger.debug("Initializing API clients")
        openrouter = OpenRouterClient(openrouter_key)
        elevenlabs = ElevenLabsClient(elevenlabs_key)

        # Phase 1: Content scraping
        logger.info("Phase 1/3: Content scraping")
        yield None, "Scraping website content..."
        content = await scrape_content(url)
        logger.debug(f"Scraped content length: {len(content)} chars")

        # Phase 2: Script generation
        logger.info("Phase 2/3: Script generation")
        yield None, "Generating podcast script..."
        script = await openrouter.generate_script(content, prompt, model_id)
        logger.debug(f"Generated script length: {len(script)} chars")

        # Phase 3: Audio synthesis
        logger.info("Phase 3/3: Audio generation")
        yield None, "Converting to audio..."
        audio = elevenlabs.generate_audio(script, voice_id)
        logger.debug(f"Generated audio size: {len(audio)} bytes")

        # Save output; the timestamp keeps successive runs from overwriting
        # each other (to one-second resolution).
        audio_path = f"podcast_{int(time.time())}.mp3"
        logger.debug(f"Saving audio to: {audio_path}")
        with open(audio_path, "wb") as f:
            f.write(audio)

        logger.info("Podcast creation completed successfully")
        yield audio_path, "Podcast created successfully!"

    except Exception as e:
        # Surface the failure in the status box instead of crashing the UI.
        logger.error("Podcast creation failed", exc_info=True)
        yield None, f"Error: {str(e)}"


def create_ui():
    """Build the Gradio Blocks interface and wire up its event handlers.

    Returns:
        The configured ``gr.Blocks`` instance, ready for ``launch()``.

    Raises:
        Exception: Re-raised after logging if registering a handler fails.
    """
    logger.info("Initializing Gradio interface")

    # Default choices for dropdowns. Gradio choice tuples are (name, value):
    # the hint is the visible name and the value is an empty string.
    default_voices = [("Enter API key to load voices", "")]
    default_models = [("Enter API key to load models", "")]

    with gr.Blocks(title='PodcastCreator', theme=gr.themes.Soft()) as interface:
        with gr.Row():
            with gr.Column(scale=2):
                url_input = gr.Textbox(label='Source URL', placeholder='Enter the URL...')
                prompt = gr.Textbox(label='Podcast Topic', lines=3)

                with gr.Row():
                    with gr.Column():
                        elevenlabs_key = gr.Textbox(
                            label='ElevenLabs API Key',
                            type='password',
                            placeholder='Enter key...'
                        )
                        voice = gr.Dropdown(
                            label='Voice',
                            choices=default_voices,
                            value=None,
                            allow_custom_value=True
                        )

                    with gr.Column():
                        openrouter_key = gr.Textbox(
                            label='OpenRouter API Key',
                            type='password',
                            placeholder='Enter key...'
                        )
                        model = gr.Dropdown(
                            label='AI Model',
                            choices=default_models,
                            value=None,
                            allow_custom_value=True
                        )

                submit_btn = gr.Button('Create Podcast', variant='primary')

            with gr.Column(scale=1):
                audio_output = gr.Audio(label="Generated Podcast")
                status = gr.Textbox(label='Status', interactive=False)

        # Event handlers
        def update_voices(key):
            """Refresh the voice dropdown whenever the ElevenLabs key changes."""
            if not key:
                return gr.Dropdown(choices=default_voices, value=default_voices[0][1])

            try:
                client = ElevenLabsClient(key)
                voices = client.get_voices()
                # NOTE(review): assumes element 0 of each returned tuple is the
                # value to select - confirm against ElevenLabsClient.get_voices.
                return gr.Dropdown(choices=voices, value=voices[0][0] if voices else None)
            except Exception as e:
                logger.error(f"Failed to load voices: {e}")
                # Show the error as the visible option name.
                return gr.Dropdown(choices=[(f"Error: {str(e)}", "")], value=None)

        async def update_models(key):
            """Refresh the model dropdown whenever the OpenRouter key changes."""
            if not key:
                return gr.Dropdown(choices=default_models, value=default_models[0][1])

            try:
                client = OpenRouterClient(key)
                models = await client.get_models()
                # NOTE(review): assumes element 0 of each returned tuple is the
                # value to select - confirm against OpenRouterClient.get_models.
                return gr.Dropdown(choices=models, value=models[0][0] if models else None)
            except Exception as e:
                logger.error(f"Failed to load models: {e}")
                # Show the error as the visible option name.
                return gr.Dropdown(choices=[(f"Error: {str(e)}", "")], value=None)

        # Add error handling for the event handlers
        try:
            elevenlabs_key.change(fn=update_voices, inputs=elevenlabs_key, outputs=voice)
            openrouter_key.change(fn=update_models, inputs=openrouter_key, outputs=model)

            submit_btn.click(
                fn=create_podcast,
                inputs=[url_input, prompt, elevenlabs_key, voice, openrouter_key, model],
                outputs=[audio_output, status]
            )
        except Exception as e:
            logger.error(f"Failed to set up event handlers: {e}")
            raise

    logger.info("Gradio interface initialized successfully")
    return interface


if __name__ == '__main__':
    demo = create_ui()
    demo.launch()