OmarHusseinZaki committed on
Commit
cd16225
·
1 Parent(s): d8cc3d7

Installed pydub and tried chunking the audio so I can transcribe it

Browse files
Files changed (1) hide show
  1. main.py +106 -20
main.py CHANGED
@@ -7,6 +7,7 @@ import requests # For making HTTP requests (to audio URLs)
7
  from fastapi import FastAPI, HTTPException, Request # The web framework
8
  from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
9
  from pydantic import BaseModel # For data validation
 
10
  from huggingface_hub import InferenceClient # HF API client
11
  from dotenv import load_dotenv # To load .env file locally
12
 
@@ -143,7 +144,8 @@ def download_audio_bytes(youtube_url: str) -> bytes:
143
  print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
144
  if not audio_bytes:
145
  raise ValueError("Downloaded audio data is empty.")
146
- return audio_bytes
 
147
 
148
  except yt_dlp.utils.DownloadError as e:
149
  print(f"ERROR: yt-dlp failed to download or process audio: {e}")
@@ -155,40 +157,64 @@ def download_audio_bytes(youtube_url: str) -> bytes:
155
  print(f"ERROR: Unexpected error during audio download: {e}")
156
  # Log the full traceback here in a real app: import traceback; traceback.print_exc()
157
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
 
159
 
160
def transcribe_audio(audio_bytes: bytes) -> str:
    """
    Posts audio bytes to the Hugging Face hosted inference endpoint for ASR_MODEL
    and returns the transcribed text.

    Args:
        audio_bytes (bytes): Encoded audio, sent as the raw request body with an
            'audio/mpeg' content type.

    Returns:
        str: The stripped value of the "text" field of the JSON response
            (an empty string if that field is missing or blank).

    Raises:
        ValueError: If audio_bytes is empty.
        HTTPException: 503 if the HTTP request fails or returns an error status.
    """
    if not audio_bytes:
        raise ValueError("Cannot transcribe empty audio data.")

    print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")

    # Assemble the request pieces up front so the try block only wraps the network call
    # and the handling of its response.
    endpoint = f"https://api-inference.huggingface.co/models/{ASR_MODEL}?return_timestamps=true"
    request_headers = {
        "Authorization": f"Bearer {HF_API_KEY}",
        "Content-Type": "audio/mpeg",  # Use 'audio/wav' if you're sending WAV data
    }

    try:
        response = requests.post(
            url=endpoint,
            headers=request_headers,
            data=audio_bytes,
            timeout=120,
        )
        response.raise_for_status()

        payload = response.json()
        text = payload.get("text", "").strip()
        if not text:
            print("Warning: Transcription result was empty.")
        print("Transcription successful.")
        return text
    except requests.exceptions.RequestException as e:
        print(f"ERROR: Request to Hugging Face failed: {e}")
        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}")
192
 
193
 
194
  def generate_notes_from_transcript(transcript: str) -> str:
@@ -236,4 +262,64 @@ def generate_notes_from_transcript(transcript: str) -> str:
236
 
237
  except Exception as e:
238
  print(f"ERROR: Hugging Face LLM API call failed: {e}")
239
- raise HTTPException(status_code=503, detail=f"Note generation service failed: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  from fastapi import FastAPI, HTTPException, Request # The web framework
8
  from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
9
  from pydantic import BaseModel # For data validation
10
+ from pydub import AudioSegment # For splitting audio into chunks
11
  from huggingface_hub import InferenceClient # HF API client
12
  from dotenv import load_dotenv # To load .env file locally
13
 
 
144
  print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
145
  if not audio_bytes:
146
  raise ValueError("Downloaded audio data is empty.")
147
+ audio_format = best_audio_format.get('ext', 'mp3') # e.g., "webm", "m4a", etc.
148
+ return audio_bytes, audio_format
149
 
150
  except yt_dlp.utils.DownloadError as e:
151
  print(f"ERROR: yt-dlp failed to download or process audio: {e}")
 
157
  print(f"ERROR: Unexpected error during audio download: {e}")
158
  # Log the full traceback here in a real app: import traceback; traceback.print_exc()
159
  raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
160
+
161
+
162
def chunk_audio(audio_bytes: bytes, audio_format: str = "mp3", chunk_length_ms: int = 30000) -> list[bytes]:
    """
    Splits raw audio bytes into consecutive fixed-length chunks, each re-encoded as MP3.

    Args:
        audio_bytes (bytes): Raw audio data.
        audio_format (str): The format of the input audio (e.g., 'mp3', 'webm', 'm4a'),
            passed to pydub as the decoding format.
        chunk_length_ms (int): Duration of each chunk in milliseconds. Must be positive.

    Returns:
        list[bytes]: MP3-encoded chunks in playback order. The last chunk may be
            shorter than chunk_length_ms.

    Raises:
        ValueError: If chunk_length_ms is not positive, or if the audio cannot be
            decoded with the given format.
    """
    # Validate before doing any decoding work: a zero step makes range() fail with an
    # unrelated message, and a negative step would silently produce no chunks at all.
    if chunk_length_ms <= 0:
        raise ValueError(f"chunk_length_ms must be a positive integer, got {chunk_length_ms}")

    try:
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format=audio_format)
    except Exception as e:
        # Chain the original error so the decoder's traceback is not lost.
        raise ValueError(f"Could not decode audio with format '{audio_format}': {e}") from e

    chunks = []
    # The segment is stepped through and sliced in units of chunk_length_ms (milliseconds).
    for start_ms in range(0, len(audio), chunk_length_ms):
        chunk_buffer = io.BytesIO()
        # Export to mp3 regardless of input format
        audio[start_ms:start_ms + chunk_length_ms].export(chunk_buffer, format="mp3")
        chunks.append(chunk_buffer.getvalue())

    return chunks
189
 
190
 
191
def transcribe_audio(audio_bytes: bytes) -> str:
    """
    Runs speech recognition on the given audio through the shared Hugging Face
    inference client, using the model named by ASR_MODEL.

    Args:
        audio_bytes (bytes): Raw audio data handed to the client as-is.

    Returns:
        str: The stripped 'text' field of the ASR result; may be an empty string.

    Raises:
        HTTPException: 503 if the inference client is not initialized, or if the
            ASR call (or handling of its result) fails.
        ValueError: If audio_bytes is empty.
    """
    if not hf_inference:
        raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
    if not audio_bytes:
        raise ValueError("Cannot transcribe empty audio data.")

    size_mb = len(audio_bytes) / (1024*1024)
    print(f"Transcribing {size_mb:.2f} MB using {ASR_MODEL}...")

    try:
        asr_output = hf_inference.automatic_speech_recognition(audio=audio_bytes, model=ASR_MODEL)
        # A missing 'text' key is treated the same as an empty transcription.
        text = asr_output.get('text', '').strip()
        if not text:
            # An empty result is reported but still returned to the caller.
            print("Warning: Transcription result was empty.")
        print("Transcription successful.")
        return text
    except Exception as e:
        print(f"ERROR: Hugging Face ASR API call failed: {e}")
        # Any failure here is surfaced as 503 Service Unavailable.
        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}")
218
 
219
 
220
  def generate_notes_from_transcript(transcript: str) -> str:
 
262
 
263
  except Exception as e:
264
  print(f"ERROR: Hugging Face LLM API call failed: {e}")
265
+ raise HTTPException(status_code=503, detail=f"Note generation service failed: {e}")
266
+
267
+
268
+ # --- API Endpoints ---
269
+
270
@app.get("/")
async def read_root():
    """ Health check endpoint: returns a fixed status message. Visit BASE_URL/ to see it. """
    status_payload = {"message": "YouTube Notes API is running!"}
    return status_payload
274
+
275
@app.get("/docs", include_in_schema=False)
async def custom_swagger_ui_html():
    """ Serves the Swagger UI page for the app's OpenAPI schema at BASE_URL/docs """
    from fastapi.openapi import docs as openapi_docs

    page_title = app.title + " - Docs"
    return openapi_docs.get_swagger_ui_html(openapi_url=app.openapi_url, title=page_title)
280
+
281
@app.get(app.openapi_url, include_in_schema=False)
async def get_open_api_endpoint():
    """ Builds and returns the OpenAPI schema from the app's title, version and routes """
    from fastapi.openapi import utils as openapi_utils

    schema_args = {
        "title": app.title,
        "version": app.version,
        "routes": app.routes,
    }
    return openapi_utils.get_openapi(**schema_args)
286
+
287
+
288
@app.post("/process-video/")
async def process_video(request: ProcessRequest):
    """
    The main endpoint: receives a YouTube URL, processes it, and returns notes.
    Accessible via POST request to BASE_URL/process-video/

    Pipeline: download the audio, split it into chunks, transcribe each chunk,
    then generate notes from the joined transcript.

    Returns:
        dict: {"notes": <generated notes>}.

    Raises:
        HTTPException: Re-raised unchanged when a helper raises one; 503 when every
            audio chunk fails to transcribe; 500 for any other unexpected error.
    """
    # The helpers below do blocking network / decoding work. Running them in the
    # threadpool keeps the event loop free to serve other requests meanwhile.
    from fastapi.concurrency import run_in_threadpool

    print(f"Received request for URL: {request.youtubeUrl}")
    try:
        # Step 1: Download Audio
        audio_bytes, audio_format = await run_in_threadpool(download_audio_bytes, request.youtubeUrl)

        # Step 2: Chunk and Transcribe Audio
        audio_chunks = await run_in_threadpool(chunk_audio, audio_bytes, audio_format)
        total_chunks = len(audio_chunks)
        transcript_parts = []
        failed_chunks = 0
        last_error = None
        for idx, chunk in enumerate(audio_chunks, start=1):
            print(f"Transcribing chunk {idx}/{total_chunks}...")
            try:
                transcript_parts.append(await run_in_threadpool(transcribe_audio, chunk))
            except Exception as e:
                # Best effort: one bad chunk should not discard the rest of the transcript.
                failed_chunks += 1
                last_error = e
                print(f"Skipping chunk {idx} due to error: {e}")

        # If nothing could be transcribed, fail loudly instead of asking the note
        # generator to summarize an empty transcript.
        if total_chunks and failed_chunks == total_chunks:
            raise HTTPException(
                status_code=503,
                detail=f"Transcription failed for all {total_chunks} audio chunks: {last_error}",
            )

        full_transcript = " ".join(transcript_parts).strip()

        # Step 3: Generate Notes
        notes = await run_in_threadpool(generate_notes_from_transcript, full_transcript)

        # Step 4: Return Notes
        print("Processing complete. Returning notes.")
        return {"notes": notes}

    except HTTPException as http_exc:
        # If an HTTPException was raised deliberately in helpers, re-raise it
        print(f"HTTP Exception occurred: {http_exc.detail}")
        raise
    except Exception as e:
        # Catch any other unexpected errors during the process
        print(f"ERROR: Unhandled exception in /process-video/ endpoint: {e}")
        # Log traceback here in production: traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}") from e