Edmond7 committed on
Commit
ab3ef82
·
verified ·
1 Parent(s): 1749c38

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -197
app.py CHANGED
@@ -1,7 +1,5 @@
1
  import os
2
- import random
3
- from fastapi import FastAPI, HTTPException, UploadFile, Depends, Query, Request
4
- from fastapi.responses import JSONResponse
5
  from fastapi.security import APIKeyHeader
6
  import aiohttp
7
  import asyncio
@@ -14,9 +12,6 @@ import boto3
14
  from botocore.exceptions import NoCredentialsError
15
  from duckduckgo_search import DDGS
16
  from bs4 import BeautifulSoup
17
- import requests
18
- from innertube import InnerTube
19
- import ffmpeg
20
 
21
  app = FastAPI()
22
 
@@ -45,12 +40,6 @@ S3_REGION = os.environ.get("S3_REGION")
45
  if not all([API_KEY, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_BUCKET, S3_REGION]):
46
  raise ValueError("Missing required environment variables")
47
 
48
- # User agents for requests
49
- USER_AGENTS = [
50
- 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1',
51
- 'Mozilla/5.0 (iPad; CPU OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1',
52
- 'Mozilla/5.0 (iPod touch; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1'
53
- ]
54
 
55
  async def search_and_get_videos(query: str, num_videos: int = 2) -> List[Dict]:
56
  for instance in INVIDIOUS_INSTANCES:
@@ -76,97 +65,52 @@ async def search_and_get_videos(query: str, num_videos: int = 2) -> List[Dict]:
76
  logger.error("All Invidious instances failed")
77
  return []
78
 
79
- def get_youtube_audio(video_id):
80
- try:
81
- logger.info(f"Starting YouTube audio extraction for video ID: {video_id}")
82
-
83
- # Create an InnerTube client for iOS
84
- client = InnerTube("IOS")
85
- logger.info("InnerTube client created")
86
-
87
- # Fetch video info
88
- logger.info("Fetching video info")
89
- video_info = client.player(video_id)
90
- logger.info("Video info fetched successfully")
91
-
92
- # Check if the video has streaming data
93
- if 'streamingData' not in video_info:
94
- logger.error(f"No 'streamingData' found in video info for video ID {video_id}")
95
- return {'success': False, 'error': "No streaming data found for the video"}
96
-
97
- # Extract the audio streams
98
- streams = video_info["streamingData"]["adaptiveFormats"]
99
- audio_streams = [s for s in streams if s['mimeType'].startswith('audio/')]
100
-
101
- if not audio_streams:
102
- logger.warning(f"No audio streams found for video ID {video_id}")
103
- return {'success': False, 'error': "No audio streams found"}
104
-
105
- # Choose the highest quality audio stream
106
- audio_stream = max(audio_streams, key=lambda x: x.get('bitrate', 0))
107
- audio_url = audio_stream['url']
108
- logger.info(f"Selected audio stream URL: {audio_url[:100]}...") # Log first 100 chars of URL
109
-
110
- # Prepare headers
111
- headers = {
112
- 'User-Agent': random.choice(USER_AGENTS),
113
- 'Accept': '*/*',
114
- 'Accept-Encoding': 'gzip, deflate, br',
115
- 'Range': 'bytes=0-',
116
- 'Connection': 'keep-alive',
117
- }
118
-
119
- # Download the audio
120
- logger.info("Starting audio download")
121
- response = requests.get(audio_url, headers=headers, stream=True)
122
- logger.info(f"Download response status code: {response.status_code}")
123
-
124
- if response.status_code in [200, 206]: # 200 OK or 206 Partial Content
125
- # Create a temporary file for the downloaded audio
126
- with tempfile.NamedTemporaryFile(delete=False, suffix='.webm') as temp_file:
127
- temp_file_path = temp_file.name
128
- logger.info(f"Created temporary file: {temp_file_path}")
129
- # Write the audio data to the file
130
- total_size = 0
131
- for chunk in response.iter_content(chunk_size=8192):
132
- temp_file.write(chunk)
133
- total_size += len(chunk)
134
- if total_size % (1024 * 1024) == 0: # Log every 1MB
135
- logger.info(f"Downloaded {total_size / (1024 * 1024):.2f} MB")
136
-
137
- logger.info(f"Audio download completed. Total size: {total_size / (1024 * 1024):.2f} MB")
138
-
139
- # Convert the downloaded audio to MP3
140
- mp3_temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
141
- mp3_temp_file_path = mp3_temp_file.name
142
- mp3_temp_file.close()
143
-
144
- logger.info(f"Converting audio to MP3: {mp3_temp_file_path}")
145
-
146
  try:
147
- (
148
- ffmpeg
149
- .input(temp_file_path)
150
- .output(mp3_temp_file_path, acodec='libmp3lame', audio_bitrate='128k')
151
- .overwrite_output()
152
- .run(capture_stdout=True, capture_stderr=True)
153
- )
154
- logger.info("Audio conversion to MP3 completed successfully")
155
- except ffmpeg.Error as e:
156
- logger.error(f"Error during audio conversion: {e.stderr.decode()}")
157
- return {'success': False, 'error': "Failed to convert audio to MP3"}
158
- finally:
159
- # Remove the original downloaded file
160
- os.unlink(temp_file_path)
161
-
162
- return {'success': True, 'temp_file_path': mp3_temp_file_path}
163
- else:
164
- logger.error(f"Failed to download audio: HTTP {response.status_code}")
165
- return {'success': False, 'error': f"Download failed: HTTP {response.status_code}"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- except Exception as e:
168
- logger.exception(f"Error fetching YouTube audio for video ID {video_id}: {str(e)}")
169
- return {'success': False, 'error': f"Error: {str(e)}"}
170
 
171
  def extract_text_from_document(file: UploadFile) -> dict:
172
  try:
@@ -198,12 +142,7 @@ def upload_to_s3(local_file, s3_file):
198
  )
199
 
200
  try:
201
- s3_client.upload_file(
202
- local_file,
203
- S3_BUCKET,
204
- s3_file,
205
- ExtraArgs={'ContentDisposition': 'attachment'}
206
- )
207
  s3_url = f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com/{s3_file}"
208
  return s3_url
209
  except NoCredentialsError:
@@ -243,37 +182,6 @@ async def verify_api_key(api_key: str = Depends(api_key_header)):
243
  raise HTTPException(status_code=401, detail="Invalid API Key")
244
  return api_key
245
 
246
- def download_stream(url, output_path):
247
- headers = {
248
- 'User-Agent': random.choice(USER_AGENTS),
249
- 'Accept': '*/*',
250
- 'Accept-Encoding': 'gzip, deflate, br',
251
- 'Range': 'bytes=0-',
252
- 'Connection': 'keep-alive',
253
- }
254
-
255
- with requests.get(url, headers=headers, stream=True) as r:
256
- r.raise_for_status()
257
- with open(output_path, 'wb') as f:
258
- for chunk in r.iter_content(chunk_size=8192):
259
- f.write(chunk)
260
-
261
- def get_best_video_stream(streams):
262
- video_streams = [s for s in streams if s['mimeType'].startswith('video/')]
263
-
264
- # Try to get 720p or the highest quality below 720p
265
- preferred_stream = next((s for s in video_streams if s['qualityLabel'] == '720p'), None)
266
- if not preferred_stream:
267
- # If 720p is not available, get the highest quality below 720p
268
- below_720p = [s for s in video_streams if int(s['qualityLabel'][:-1]) < 720]
269
- preferred_stream = max(below_720p, key=lambda x: int(x['qualityLabel'][:-1])) if below_720p else None
270
-
271
- return preferred_stream
272
-
273
- def get_best_audio_stream(streams):
274
- audio_streams = [s for s in streams if s['mimeType'].startswith('audio/')]
275
- return max(audio_streams, key=lambda x: x['bitrate']) if audio_streams else None
276
-
277
  @app.get("/search-videos/")
278
  async def search_videos(
279
  query: str,
@@ -287,11 +195,11 @@ async def search_videos(
287
 
288
  @app.get("/get-audio/{video_id}")
289
  async def get_audio(video_id: str, api_key: str = Depends(verify_api_key)):
290
- result = get_youtube_audio(video_id)
291
  if not result['success']:
292
- raise HTTPException(status_code=500, detail=result['error'])
293
 
294
- s3_file_name = f"{video_id}.mp3"
295
  s3_url = upload_to_s3(result['temp_file_path'], s3_file_name)
296
 
297
  if s3_url:
@@ -300,62 +208,6 @@ async def get_audio(video_id: str, api_key: str = Depends(verify_api_key)):
300
  else:
301
  raise HTTPException(status_code=500, detail="Failed to upload audio to S3")
302
 
303
- @app.post("/get-video-streams")
304
- async def get_video_streams(video_id: str, api_key: str = Depends(verify_api_key)):
305
- try:
306
- # Create an InnerTube client
307
- client = InnerTube("WEB")
308
-
309
- # Fetch video info
310
- video_info = client.player(video_id)
311
-
312
- # Get the best video and audio streams
313
- streams = video_info["streamingData"]["adaptiveFormats"]
314
- video_stream = get_best_video_stream(streams)
315
- audio_stream = get_best_audio_stream(streams)
316
-
317
- if not video_stream or not audio_stream:
318
- raise HTTPException(status_code=404, detail="Could not find suitable video or audio stream")
319
-
320
- # Download video and audio
321
- with tempfile.NamedTemporaryFile(delete=False, suffix='.mp4') as video_file, \
322
- tempfile.NamedTemporaryFile(delete=False, suffix='.m4a') as audio_file:
323
-
324
- download_stream(video_stream['url'], video_file.name)
325
- download_stream(audio_stream['url'], audio_file.name)
326
-
327
- # Combine video and audio
328
- output_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp4')
329
-
330
- try:
331
- (
332
- ffmpeg
333
- .input(video_file.name)
334
- .input(audio_file.name)
335
- .output(output_file.name, vcodec='libx264', acodec='aac')
336
- .overwrite_output()
337
- .run(capture_stdout=True, capture_stderr=True)
338
- )
339
- except ffmpeg.Error as e:
340
- raise HTTPException(status_code=500, detail=f"Error combining video and audio: {e.stderr.decode()}")
341
-
342
- # Upload combined video to S3
343
- s3_file_name = f"{video_id}_combined.mp4"
344
- s3_url = upload_to_s3(output_file.name, s3_file_name)
345
-
346
- if s3_url:
347
- return {"video_url": s3_url}
348
- else:
349
- raise HTTPException(status_code=500, detail="Failed to upload video to S3")
350
-
351
- except Exception as e:
352
- raise HTTPException(status_code=500, detail=f"Error processing video: {str(e)}")
353
- finally:
354
- # Clean up temporary files
355
- for file in [video_file.name, audio_file.name, output_file.name]:
356
- if os.path.exists(file):
357
- os.unlink(file)
358
-
359
  @app.post("/extract-text/")
360
  async def extract_text(file: UploadFile, api_key: str = Depends(verify_api_key)):
361
  result = extract_text_from_document(file)
 
1
  import os
2
+ from fastapi import FastAPI, HTTPException, UploadFile, Depends, Query
 
 
3
  from fastapi.security import APIKeyHeader
4
  import aiohttp
5
  import asyncio
 
12
  from botocore.exceptions import NoCredentialsError
13
  from duckduckgo_search import DDGS
14
  from bs4 import BeautifulSoup
 
 
 
15
 
16
  app = FastAPI()
17
 
 
40
  if not all([API_KEY, S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_BUCKET, S3_REGION]):
41
  raise ValueError("Missing required environment variables")
42
 
 
 
 
 
 
 
43
 
44
  async def search_and_get_videos(query: str, num_videos: int = 2) -> List[Dict]:
45
  for instance in INVIDIOUS_INSTANCES:
 
65
  logger.error("All Invidious instances failed")
66
  return []
67
 
68
+ async def get_youtube_audio(video_id: str, max_retries: int = 3) -> Dict:
69
+ for instance in INVIDIOUS_INSTANCES:
70
+ for attempt in range(max_retries):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  try:
72
+ url = f"{instance}/api/v1/videos/{video_id}"
73
+
74
+ async with aiohttp.ClientSession() as session:
75
+ async with session.get(url) as response:
76
+ response.raise_for_status()
77
+ video_data = await response.json()
78
+
79
+ audio_format = next((format for format in video_data.get('adaptiveFormats', [])
80
+ if format.get('type', '').startswith('audio/mp4')), None)
81
+
82
+ if audio_format:
83
+ audio_url = audio_format.get('url')
84
+ if audio_url:
85
+ try:
86
+ async with session.get(audio_url) as audio_response:
87
+ audio_content = await audio_response.read()
88
+
89
+ with tempfile.NamedTemporaryFile(delete=False, suffix='.m4a') as temp_file:
90
+ temp_file.write(audio_content)
91
+ temp_file_path = temp_file.name
92
+
93
+ return {'success': True, 'temp_file_path': temp_file_path}
94
+ except aiohttp.ServerDisconnectedError:
95
+ if attempt == max_retries - 1:
96
+ logger.error(f"Max retries reached for video ID {video_id} on {instance}")
97
+ break
98
+ await asyncio.sleep(1 * (attempt + 1))
99
+ continue
100
+
101
+ logger.warning(f"No suitable audio format found for video ID {video_id} on {instance}")
102
+ break
103
+ except aiohttp.ClientError as e:
104
+ logger.error(f"Network error fetching YouTube audio for video ID {video_id} on {instance}: {e}")
105
+ except json.JSONDecodeError:
106
+ logger.error(f"Error decoding JSON response for video ID {video_id} on {instance}")
107
+ except Exception as e:
108
+ logger.error(f"Unexpected error fetching YouTube audio for video ID {video_id} on {instance}: {e}")
109
+ if attempt == max_retries - 1:
110
+ break
111
+ await asyncio.sleep(1 * (attempt + 1))
112
 
113
+ return {'success': False, 'error': "Failed to fetch audio after multiple attempts on all instances"}
 
 
114
 
115
  def extract_text_from_document(file: UploadFile) -> dict:
116
  try:
 
142
  )
143
 
144
  try:
145
+ s3_client.upload_file(local_file, S3_BUCKET, s3_file)
 
 
 
 
 
146
  s3_url = f"https://{S3_BUCKET}.s3.{S3_REGION}.amazonaws.com/{s3_file}"
147
  return s3_url
148
  except NoCredentialsError:
 
182
  raise HTTPException(status_code=401, detail="Invalid API Key")
183
  return api_key
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  @app.get("/search-videos/")
186
  async def search_videos(
187
  query: str,
 
195
 
196
  @app.get("/get-audio/{video_id}")
197
  async def get_audio(video_id: str, api_key: str = Depends(verify_api_key)):
198
+ result = await get_youtube_audio(video_id)
199
  if not result['success']:
200
+ raise HTTPException(status_code=404, detail=result['error'])
201
 
202
+ s3_file_name = f"{video_id}.m4a"
203
  s3_url = upload_to_s3(result['temp_file_path'], s3_file_name)
204
 
205
  if s3_url:
 
208
  else:
209
  raise HTTPException(status_code=500, detail="Failed to upload audio to S3")
210
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  @app.post("/extract-text/")
212
  async def extract_text(file: UploadFile, api_key: str = Depends(verify_api_key)):
213
  result = extract_text_from_document(file)