marks committed on
Commit
c405952
·
1 Parent(s): e94fe41
Files changed (4) hide show
  1. interface.py +127 -15
  2. models.py +0 -4
  3. podcast_generator.py +9 -0
  4. tts.py +23 -0
interface.py CHANGED
@@ -2,7 +2,7 @@ import asyncio
2
  import os
3
  import time
4
  from dataclasses import dataclass
5
- from typing import List, Optional, AsyncGenerator, Tuple
6
  import gradio as gr
7
  from dotenv import load_dotenv
8
  from langchain_openai import ChatOpenAI
@@ -10,6 +10,9 @@ from rich.console import Console
10
  from rich.panel import Panel
11
  from rich.text import Text
12
  from logger import setup_logger, log_execution_time, log_async_execution_time
 
 
 
13
  from api_clients import OpenRouterClient, ElevenLabsClient
14
 
15
  load_dotenv()
@@ -17,6 +20,116 @@ load_dotenv()
17
  console = Console()
18
  logger = setup_logger("interface")
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  @log_async_execution_time(logger)
21
  async def create_podcast(
22
  url: str,
@@ -25,12 +138,14 @@ async def create_podcast(
25
  voice_id: str,
26
  openrouter_key: str,
27
  model_id: str,
28
- ) -> Tuple[Optional[str], str]:
29
  """
30
  Create a podcast through a multi-step process:
31
  1. Content extraction from URL
32
  2. Script generation using AI
33
  3. Voice synthesis
 
 
34
  """
35
  logger.info(f"Starting podcast creation for URL: {url}")
36
  logger.debug(f"Parameters - Voice: {voice_id}, Model: {model_id}")
@@ -44,24 +159,21 @@ async def create_podcast(
44
 
45
  # Phase 1: Content scraping
46
  logger.info("Phase 1/3: Content scraping")
47
- if not url.startswith(('http://', 'https://')):
48
- raise ValueError("URL must start with http:// or https://")
49
-
50
- logger.debug("Initializing LLM and browser agent")
51
- llm = ChatOpenAI(model="gpt-4")
52
- task = f"Visit this URL: {url} and extract the main content. Summarize it in a clear and concise way."
53
- content = await llm.apredict(task)
54
  logger.debug(f"Scraped content length: {len(content)} chars")
55
 
56
  # Phase 2: Script generation
57
  logger.info("Phase 2/3: Script generation")
 
58
  script = await openrouter.generate_script(content, prompt, model_id)
59
  logger.debug(f"Generated script length: {len(script)} chars")
60
 
61
  # Phase 3: Audio synthesis
62
  logger.info("Phase 3/3: Audio generation")
63
- audio = await elevenlabs.generate_audio(script, voice_id)
64
- logger.debug(f"Generated audio data received")
 
65
 
66
  # Save output
67
  audio_path = f"podcast_{int(time.time())}.mp3"
@@ -70,11 +182,11 @@ async def create_podcast(
70
  f.write(audio)
71
 
72
  logger.info("Podcast creation completed successfully")
73
- return audio_path, "Podcast created successfully!"
74
 
75
  except Exception as e:
76
  logger.error("Podcast creation failed", exc_info=True)
77
- return None, f"Error: {str(e)}"
78
 
79
  def create_ui():
80
  logger.info("Initializing Gradio interface")
@@ -119,7 +231,7 @@ def create_ui():
119
  submit_btn = gr.Button('Create Podcast', variant='primary')
120
 
121
  with gr.Column(scale=1):
122
- audio_output = gr.Audio(label="Generated Podcast", type="filepath")
123
  status = gr.Textbox(label='Status', interactive=False)
124
 
125
  # Event handlers
@@ -164,4 +276,4 @@ def create_ui():
164
 
165
  if __name__ == '__main__':
166
  demo = create_ui()
167
- demo.queue().launch()
 
2
  import os
3
  import time
4
  from dataclasses import dataclass
5
+ from typing import List, Optional, AsyncGenerator
6
  import gradio as gr
7
  from dotenv import load_dotenv
8
  from langchain_openai import ChatOpenAI
 
10
  from rich.panel import Panel
11
  from rich.text import Text
12
  from logger import setup_logger, log_execution_time, log_async_execution_time
13
+
14
+ from browser_use import Agent, Browser
15
+ from browser_use.browser.browser import BrowserContext
16
  from api_clients import OpenRouterClient, ElevenLabsClient
17
 
18
  load_dotenv()
 
20
  console = Console()
21
  logger = setup_logger("interface")
22
 
23
@dataclass
class ActionResult:
    """Plain record holding the outcome of one agent action."""

    # Presumably True once the agent considers the task finished -- confirm
    # against the producer of these records.
    is_done: bool
    # Text pulled out by the action, or None when nothing was extracted.
    extracted_content: Optional[str]
    # Error description for the action, or None when there was no error.
    error: Optional[str]
    # NOTE(review): looks like a flag for keeping this result in the agent's
    # memory; its consumer is not visible here.
    include_in_memory: bool
29
+
30
+
31
@dataclass
class AgentHistoryList:
    """Container pairing the per-action results with the raw model outputs."""

    # One ActionResult per recorded action.
    all_results: List[ActionResult]
    # Model outputs as dictionaries; their schema is not defined in this file.
    all_model_outputs: List[dict]
35
+
36
+
37
def _extract_step_content(section: str) -> str:
    """Return the ``extracted_content`` value found in one ActionResult section.

    Args:
        section: Text that followed an ``ActionResult(`` marker.

    Returns:
        The text between the quotes of the ``extracted_content=`` value, an
        empty string when the field is missing or is ``None``.
    """
    import re  # local import keeps this block self-contained

    _, marker, tail = section.partition('extracted_content=')
    if not marker:
        return ''

    # Match the whole quoted literal (single- or double-quoted, honouring
    # backslash escapes) so that commas inside the content are preserved.
    single = r"'((?:[^'\\]|\\.)*)'"
    double = r'"((?:[^"\\]|\\.)*)"'
    quoted = re.match(f"{single}|{double}", tail, re.DOTALL)
    if quoted:
        first, second = quoted.groups()
        return first if first is not None else second

    # A bare None means the step produced no content; do not render it.
    if re.match(r"None\b", tail):
        return ''

    # Unquoted value: fall back to taking everything up to the next comma.
    return tail.split(',')[0].strip("'")


def parse_agent_history(history_str: str) -> None:
    """Pretty-print the extracted content of every step in an agent history.

    The history text is split on ``ActionResult(`` and, for each entry that
    carries non-empty ``extracted_content``, a blue ``Step N`` panel is
    printed to the module-level rich console followed by a blank line.

    Args:
        history_str: Textual form of an agent history containing
            ``ActionResult(...)`` entries.
    """
    # Split the content into sections based on ActionResult entries
    sections = history_str.split('ActionResult(')

    # The text before the first marker is not an ActionResult, so skip it;
    # numbering starts at 1 and counts entries without content as well.
    for step, section in enumerate(sections[1:], 1):
        content = _extract_step_content(section)
        if not content:
            continue

        header = Text(f'Step {step}', style='bold blue')
        console.print(Panel(content, title=header, border_style='blue'))
        console.print()
52
+
53
+
54
async def run_browser_task(
    task: str,
    api_key: str,
    provider: str = 'openai',
    model: str = 'gpt-4-vision',
    headless: bool = True,
) -> str:
    """Run a browser-agent task with the chosen LLM provider.

    Args:
        task: Natural-language instruction handed to the agent.
        api_key: Provider API key; exported through the provider's
            environment variable before the LLM client is built.
        provider: ``'openai'``, ``'anthropic'``; any other value is treated
            as Google.
        model: Model identifier passed to the LLM client.
        headless: Forwarded to the browser context.

    Returns:
        Whatever ``agent.run()`` returns on success, or a human-readable
        message (``'Please provide an API key'`` / ``'Error: ...'``) on
        failure. Failures are reported as strings, not raised.
    """
    if not api_key or not api_key.strip():
        return 'Please provide an API key'

    try:
        # Building the LLM client is inside the try block so that setup
        # failures are reported the same way as agent failures.
        if provider == 'openai':
            os.environ['OPENAI_API_KEY'] = api_key
            llm = ChatOpenAI(model=model)
        elif provider == 'anthropic':
            os.environ['ANTHROPIC_API_KEY'] = api_key
            # NOTE(review): ChatAnthropic is not among the imports visible in
            # this file -- confirm it is imported before enabling this path.
            llm = ChatAnthropic(model=model)
        else:  # google
            os.environ['GOOGLE_API_KEY'] = api_key
            # NOTE(review): ChatGoogleGenerativeAI is likewise not visibly
            # imported -- confirm.
            llm = ChatGoogleGenerativeAI(model=model)

        agent = Agent(
            task=task,
            llm=llm,
            browser=Browser(BrowserContext(headless=headless)),
        )
        result = await agent.run()
        # TODO: The result could be parsed better
        return result
    except Exception as e:
        return f'Error: {str(e)}'
85
+
86
+
87
def _agent_result_to_text(result) -> str:
    """Coerce the value returned by ``agent.run()`` into plain text."""
    if isinstance(result, str):
        return result

    # NOTE(review): browser_use history objects are expected to expose
    # final_result(); confirm against the installed version. When it is
    # missing or yields nothing, fall back to the object's string form.
    final_result = getattr(result, 'final_result', None)
    if callable(final_result):
        final_text = final_result()
        if final_text:
            return str(final_text)
    return str(result)


@log_async_execution_time(logger)
async def scrape_content(url: str) -> str:
    """
    Scrape and summarize content from the given URL using browser automation

    This function performs the following steps:
    1. Validates the input URL
    2. Initializes the browser agent
    3. Extracts and summarizes the content

    Args:
        url: Target URL to scrape

    Returns:
        The agent's output as text, suitable for podcast generation

    Raises:
        ValueError: If the URL does not start with http:// or https://
        Exception: Any error raised while building or running the agent is
            logged and re-raised unchanged
    """
    logger.info(f"Starting content scrape for URL: {url}")

    # Input validation
    if not url.startswith(('http://', 'https://')):
        logger.error(f"Invalid URL format: {url}")
        raise ValueError("URL must start with http:// or https://")

    try:
        logger.debug("Initializing LLM and browser agent")
        llm = ChatOpenAI(model="gpt-4")
        agent = Agent(
            task=f"Visit this URL: {url} and extract the main content. Summarize it in a clear and concise way.",
            llm=llm,
            browser=Browser(BrowserContext(headless=True)),
        )

        logger.info("Executing content extraction")
        result = await agent.run()

        # Normalise to text so the len()/slicing below and the declared
        # return type hold regardless of what the agent returned.
        content = _agent_result_to_text(result)

        logger.debug(f"Content extraction successful. Length: {len(content)} chars")
        logger.debug(f"Content preview: {content[:200]}...")

        return content
    except Exception:
        logger.error(f"Content extraction failed for {url}", exc_info=True)
        raise
132
+
133
  @log_async_execution_time(logger)
134
  async def create_podcast(
135
  url: str,
 
138
  voice_id: str,
139
  openrouter_key: str,
140
  model_id: str,
141
+ ) -> AsyncGenerator[tuple[Optional[str], str], None]:
142
  """
143
  Create a podcast through a multi-step process:
144
  1. Content extraction from URL
145
  2. Script generation using AI
146
  3. Voice synthesis
147
+
148
+ Progress updates are yielded at each step for UI feedback.
149
  """
150
  logger.info(f"Starting podcast creation for URL: {url}")
151
  logger.debug(f"Parameters - Voice: {voice_id}, Model: {model_id}")
 
159
 
160
  # Phase 1: Content scraping
161
  logger.info("Phase 1/3: Content scraping")
162
+ yield None, "Scraping website content..."
163
+ content = await scrape_content(url)
 
 
 
 
 
164
  logger.debug(f"Scraped content length: {len(content)} chars")
165
 
166
  # Phase 2: Script generation
167
  logger.info("Phase 2/3: Script generation")
168
+ yield None, "Generating podcast script..."
169
  script = await openrouter.generate_script(content, prompt, model_id)
170
  logger.debug(f"Generated script length: {len(script)} chars")
171
 
172
  # Phase 3: Audio synthesis
173
  logger.info("Phase 3/3: Audio generation")
174
+ yield None, "Converting to audio..."
175
+ audio = elevenlabs.generate_audio(script, voice_id)
176
+ logger.debug(f"Generated audio size: {len(audio)} bytes")
177
 
178
  # Save output
179
  audio_path = f"podcast_{int(time.time())}.mp3"
 
182
  f.write(audio)
183
 
184
  logger.info("Podcast creation completed successfully")
185
+ yield audio_path, "Podcast created successfully!"
186
 
187
  except Exception as e:
188
  logger.error("Podcast creation failed", exc_info=True)
189
+ yield None, f"Error: {str(e)}"
190
 
191
  def create_ui():
192
  logger.info("Initializing Gradio interface")
 
231
  submit_btn = gr.Button('Create Podcast', variant='primary')
232
 
233
  with gr.Column(scale=1):
234
+ audio_output = gr.Audio(label="Generated Podcast")
235
  status = gr.Textbox(label='Status', interactive=False)
236
 
237
  # Event handlers
 
276
 
277
  if __name__ == '__main__':
278
  demo = create_ui()
279
+ demo.launch()
models.py CHANGED
@@ -10,13 +10,9 @@ class OpenRouterRequest(BaseModel):
10
  messages: List[Message]
11
 
12
  class OpenRouterChoice(BaseModel):
13
- index: int = 0
14
  message: Message
15
- finish_reason: Optional[str] = None
16
 
17
  class OpenRouterResponse(BaseModel):
18
- id: str
19
- model: str
20
  choices: List[OpenRouterChoice]
21
 
22
  class OpenRouterModel(BaseModel):
 
10
  messages: List[Message]
11
 
12
  class OpenRouterChoice(BaseModel):
 
13
  message: Message
 
14
 
15
  class OpenRouterResponse(BaseModel):
 
 
16
  choices: List[OpenRouterChoice]
17
 
18
  class OpenRouterModel(BaseModel):
podcast_generator.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
class PodcastGenerator:
    """Turn scraped content into podcast text using a text-generation client."""

    def __init__(self, model_client):
        """Store the client.

        Args:
            model_client: Object exposing ``generate(prompt, max_length=...)``
                that returns a mapping with a ``'text'`` entry.
        """
        self.model_client = model_client

    def generate_podcast(self, scraped_content, *, max_length=300):
        """Generate podcast text for *scraped_content*.

        Args:
            scraped_content: Source material interpolated into the prompt.
            max_length: Length limit forwarded to the client. The default of
                300 follows the original assumption that 300 tokens is
                roughly 3 minutes.

        Returns:
            The generated text with surrounding whitespace removed, or an
            empty string when the client returns no text.
        """
        prompt = f"Create a podcast episode based on the following content: {scraped_content}"
        response = self.model_client.generate(prompt, max_length=max_length)
        # `or ''` also covers a present-but-None 'text' value, which would
        # otherwise fail on .strip().
        podcast_text = response.get('text') or ''
        return podcast_text.strip()
tts.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def text_to_speech(
    text,
    api_key,
    voice_id="21m00Tcm4TlvDq8ikWAM",
    output_path="podcast_episode.mp3",
    timeout=60,
):
    """Synthesize *text* with the ElevenLabs API and save it as an MP3 file.

    Args:
        text: Text to convert to speech.
        api_key: ElevenLabs API key.
        voice_id: Identifier of the voice to use; it is part of the request
            path. NOTE(review): the default is meant to be a stock
            ElevenLabs voice -- confirm it is available on the account.
        output_path: Where the audio is written.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The path of the written audio file.

    Raises:
        RuntimeError: If the API answers with a status other than 200.
    """
    import requests

    # The voice is selected through the URL path, not the JSON body.
    url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
    headers = {
        # ElevenLabs authenticates with this header, not a Bearer token.
        "xi-api-key": api_key,
        "Content-Type": "application/json",
        "Accept": "audio/mpeg",
    }
    # The output format is a query parameter of the endpoint.
    params = {"output_format": "mp3_44100_128"}
    payload = {"text": text}

    response = requests.post(
        url, headers=headers, params=params, json=payload, timeout=timeout
    )

    if response.status_code != 200:
        raise RuntimeError(f"Error: {response.status_code}, {response.text}")

    with open(output_path, "wb") as audio_file:
        audio_file.write(response.content)
    return output_path