Commit
·
cd16225
1
Parent(s):
d8cc3d7
Installed pydub and tried chunking the audio so I can transcribe it
Browse files
main.py
CHANGED
@@ -7,6 +7,7 @@ import requests # For making HTTP requests (to audio URLs)
|
|
7 |
from fastapi import FastAPI, HTTPException, Request # The web framework
|
8 |
from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
|
9 |
from pydantic import BaseModel # For data validation
|
|
|
10 |
from huggingface_hub import InferenceClient # HF API client
|
11 |
from dotenv import load_dotenv # To load .env file locally
|
12 |
|
@@ -143,7 +144,8 @@ def download_audio_bytes(youtube_url: str) -> bytes:
|
|
143 |
print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
|
144 |
if not audio_bytes:
|
145 |
raise ValueError("Downloaded audio data is empty.")
|
146 |
-
|
|
|
147 |
|
148 |
except yt_dlp.utils.DownloadError as e:
|
149 |
print(f"ERROR: yt-dlp failed to download or process audio: {e}")
|
@@ -155,40 +157,64 @@ def download_audio_bytes(youtube_url: str) -> bytes:
|
|
155 |
print(f"ERROR: Unexpected error during audio download: {e}")
|
156 |
# Log the full traceback here in a real app: import traceback; traceback.print_exc()
|
157 |
raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
|
160 |
def transcribe_audio(audio_bytes: bytes) -> str:
|
161 |
"""
|
162 |
Sends audio bytes to the Hugging Face ASR (Automatic Speech Recognition) API.
|
163 |
"""
|
164 |
-
|
165 |
-
|
166 |
if not audio_bytes:
|
167 |
raise ValueError("Cannot transcribe empty audio data.")
|
168 |
|
169 |
print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")
|
170 |
try:
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
},
|
177 |
-
data=audio_bytes,
|
178 |
-
timeout=120
|
179 |
)
|
180 |
-
|
181 |
-
response.raise_for_status()
|
182 |
-
result = response.json()
|
183 |
-
transcript = result.get("text", "").strip()
|
184 |
if not transcript:
|
185 |
print("Warning: Transcription result was empty.")
|
|
|
186 |
print("Transcription successful.")
|
187 |
return transcript
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}")
|
192 |
|
193 |
|
194 |
def generate_notes_from_transcript(transcript: str) -> str:
|
@@ -236,4 +262,64 @@ def generate_notes_from_transcript(transcript: str) -> str:
|
|
236 |
|
237 |
except Exception as e:
|
238 |
print(f"ERROR: Hugging Face LLM API call failed: {e}")
|
239 |
-
raise HTTPException(status_code=503, detail=f"Note generation service failed: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
from fastapi import FastAPI, HTTPException, Request # The web framework
|
8 |
from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
|
9 |
from pydantic import BaseModel # For data validation
|
10 |
+
from pydub import AudioSegment # For splitting audio into chunks
|
11 |
from huggingface_hub import InferenceClient # HF API client
|
12 |
from dotenv import load_dotenv # To load .env file locally
|
13 |
|
|
|
144 |
print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
|
145 |
if not audio_bytes:
|
146 |
raise ValueError("Downloaded audio data is empty.")
|
147 |
+
audio_format = best_audio_format.get('ext', 'mp3') # e.g., "webm", "m4a", etc.
|
148 |
+
return audio_bytes, audio_format
|
149 |
|
150 |
except yt_dlp.utils.DownloadError as e:
|
151 |
print(f"ERROR: yt-dlp failed to download or process audio: {e}")
|
|
|
157 |
print(f"ERROR: Unexpected error during audio download: {e}")
|
158 |
# Log the full traceback here in a real app: import traceback; traceback.print_exc()
|
159 |
raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
|
160 |
+
|
161 |
+
|
162 |
+
def chunk_audio(audio_bytes: bytes, audio_format: str = "mp3", chunk_length_ms: int = 30000) -> list[bytes]:
    """
    Split encoded audio into consecutive fixed-length chunks, each re-encoded as MP3.

    Args:
        audio_bytes (bytes): Encoded audio data in the container named by `audio_format`.
        audio_format (str): Format hint passed to pydub for decoding (e.g., 'mp3', 'webm', 'm4a').
        chunk_length_ms (int): Duration of each chunk in milliseconds; must be positive.

    Returns:
        list[bytes]: MP3-encoded chunks in playback order. The final chunk holds
        whatever audio remains, so it may be shorter than `chunk_length_ms`.

    Raises:
        ValueError: If `chunk_length_ms` is not positive, or if the audio cannot
            be decoded with the given format.
    """
    # Validate up front: a zero step would only fail inside range() after the
    # (expensive) decode, and a negative step would silently produce no chunks.
    if chunk_length_ms <= 0:
        raise ValueError(f"chunk_length_ms must be a positive integer, got {chunk_length_ms}")

    try:
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format=audio_format)
    except Exception as e:
        # Keep the original exception attached so the decoder failure is traceable.
        raise ValueError(f"Could not decode audio with format '{audio_format}': {e}") from e

    chunks = []
    for start_ms in range(0, len(audio), chunk_length_ms):
        chunk_buffer = io.BytesIO()
        # Export to mp3 regardless of input format
        audio[start_ms:start_ms + chunk_length_ms].export(chunk_buffer, format="mp3")
        chunks.append(chunk_buffer.getvalue())

    return chunks
|
189 |
|
190 |
|
191 |
def transcribe_audio(audio_bytes: bytes) -> str:
    """
    Sends audio bytes to the Hugging Face ASR (Automatic Speech Recognition) API.

    Args:
        audio_bytes (bytes): Encoded audio to transcribe; must be non-empty.

    Returns:
        str: The transcribed text with surrounding whitespace removed. An empty
        string is returned (with a warning printed) when the service yields no text.

    Raises:
        HTTPException: 503 if the inference client is not initialized or the
            ASR call (or reading its result) fails.
        ValueError: If `audio_bytes` is empty.
    """
    if not hf_inference:
        raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
    if not audio_bytes:
        raise ValueError("Cannot transcribe empty audio data.")

    print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")
    try:
        # Use the InferenceClient for ASR task; it expects the raw audio bytes
        transcript_result = hf_inference.automatic_speech_recognition(
            audio=audio_bytes,
            model=ASR_MODEL
        )

        # The result shape has varied across huggingface_hub releases (plain
        # string, dict, or an output object with a `text` attribute), so accept
        # all of them instead of assuming a dict.
        if isinstance(transcript_result, str):
            text = transcript_result
        else:
            text = getattr(transcript_result, "text", None)
            if text is None and isinstance(transcript_result, dict):
                text = transcript_result.get("text")

        # `text` may be missing or None; treat both as an empty transcript.
        transcript = (text or "").strip()
        if not transcript:
            # Decide: return empty string or raise error? Return empty for now.
            print("Warning: Transcription result was empty.")
        else:
            print("Transcription successful.")
        return transcript
    except Exception as e:
        print(f"ERROR: Hugging Face ASR API call failed: {e}")
        # 503 Service Unavailable; chain the cause so the original error is not lost
        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}") from e
|
218 |
|
219 |
|
220 |
def generate_notes_from_transcript(transcript: str) -> str:
|
|
|
262 |
|
263 |
except Exception as e:
|
264 |
print(f"ERROR: Hugging Face LLM API call failed: {e}")
|
265 |
+
raise HTTPException(status_code=503, detail=f"Note generation service failed: {e}")
|
266 |
+
|
267 |
+
|
268 |
+
# --- API Endpoints ---
|
269 |
+
|
270 |
+
@app.get("/")
async def read_root():
    """Health check: a GET on the root path returns a fixed status message."""
    status_payload = {"message": "YouTube Notes API is running!"}
    return status_payload
|
274 |
+
|
275 |
+
@app.get("/docs", include_in_schema=False)
async def custom_swagger_ui_html():
    """Return the Swagger UI page that points at this app's OpenAPI schema URL."""
    from fastapi.openapi import docs as openapi_docs

    page_title = app.title + " - Docs"
    return openapi_docs.get_swagger_ui_html(openapi_url=app.openapi_url, title=page_title)
|
280 |
+
|
281 |
+
@app.get(app.openapi_url, include_in_schema=False)
async def get_open_api_endpoint():
    """Build the OpenAPI schema from the app's title, version and routes and return it."""
    from fastapi.openapi import utils as openapi_utils

    schema_kwargs = {"title": app.title, "version": app.version, "routes": app.routes}
    return openapi_utils.get_openapi(**schema_kwargs)
|
286 |
+
|
287 |
+
|
288 |
+
@app.post("/process-video/")
async def process_video(request: ProcessRequest):
    """
    The main endpoint: receives a YouTube URL, processes it, and returns notes.
    Accessible via POST request to BASE_URL/process-video/

    Pipeline: download the audio, split it into chunks, transcribe each chunk,
    join the partial transcripts, and generate notes from the result.

    Returns:
        dict: `{"notes": <generated notes>}`.

    Raises:
        HTTPException: Re-raised unchanged when a helper raised one; 503 when
            every audio chunk failed to transcribe; 500 for any other error.
    """
    # NOTE(review): the helpers below are called without `await`, so they appear
    # to run synchronously inside this async handler - confirm whether they
    # should be moved to a worker thread.
    print(f"Received request for URL: {request.youtubeUrl}")
    try:
        # Step 1: Download Audio
        audio_bytes, audio_format = download_audio_bytes(request.youtubeUrl)

        # Step 2: Chunk and Transcribe Audio
        audio_chunks = chunk_audio(audio_bytes, audio_format)
        total_chunks = len(audio_chunks)
        partial_transcripts = []
        failed_chunks = 0
        for idx, chunk in enumerate(audio_chunks, start=1):
            print(f"Transcribing chunk {idx}/{total_chunks}...")
            try:
                partial_transcripts.append(transcribe_audio(chunk))
            except Exception as e:
                # Best effort: one bad chunk should not discard the whole video.
                failed_chunks += 1
                print(f"Skipping chunk {idx} due to error: {e}")

        # If nothing could be transcribed at all, fail explicitly instead of
        # generating notes from an empty transcript.
        if total_chunks and failed_chunks == total_chunks:
            raise HTTPException(status_code=503, detail="Transcription failed for every audio chunk.")

        # Joining once avoids repeated string concatenation in the loop.
        full_transcript = " ".join(partial_transcripts).strip()

        # Step 3: Generate Notes
        notes = generate_notes_from_transcript(full_transcript)

        # Step 4: Return Notes
        print("Processing complete. Returning notes.")
        return {"notes": notes}

    except HTTPException as http_exc:
        # If an HTTPException was raised deliberately in helpers, re-raise it
        print(f"HTTP Exception occurred: {http_exc.detail}")
        raise
    except Exception as e:
        # Catch any other unexpected errors during the process
        print(f"ERROR: Unhandled exception in /process-video/ endpoint: {e}")
        # Log traceback here in production: traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}") from e
|