Spaces:

OmarHusseinZaki
/

vid-to-notes-backend

Sleeping

App Files Files Community

OmarHusseinZaki committed on Apr 18

Commit

cd16225

1 Parent(s): d8cc3d7

Installed pydub and tried chunking the audio so that I can transcribe it.

Browse files

Files changed (1) hide show

main.py +106 -20

main.py CHANGED Viewed

@@ -7,6 +7,7 @@ import requests               # For making HTTP requests (to audio URLs)
 from fastapi import FastAPI, HTTPException, Request # The web framework
 from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
 from pydantic import BaseModel # For data validation
 from huggingface_hub import InferenceClient # HF API client
 from dotenv import load_dotenv # To load .env file locally
@@ -143,7 +144,8 @@ def download_audio_bytes(youtube_url: str) -> bytes:
              print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
              if not audio_bytes:
                  raise ValueError("Downloaded audio data is empty.")
-             return audio_bytes
     except yt_dlp.utils.DownloadError as e:
         print(f"ERROR: yt-dlp failed to download or process audio: {e}")
@@ -155,40 +157,64 @@ def download_audio_bytes(youtube_url: str) -> bytes:
         print(f"ERROR: Unexpected error during audio download: {e}")
         # Log the full traceback here in a real app: import traceback; traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
 def transcribe_audio(audio_bytes: bytes) -> str:
     """
     Sends audio bytes to the Hugging Face ASR (Automatic Speech Recognition) API.
     """
-    # if not hf_inference:
-    #     raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
     if not audio_bytes:
         raise ValueError("Cannot transcribe empty audio data.")
     print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")
     try:
-        response = requests.post(
-            url=f"https://api-inference.huggingface.co/models/{ASR_MODEL}?return_timestamps=true",
-            headers={
-                "Authorization": f"Bearer {HF_API_KEY}",
-                "Content-Type": "audio/mpeg"  # Use 'audio/wav' if you're sending WAV data
-            },
-            data=audio_bytes,
-            timeout=120
         )
-        response.raise_for_status()
-        result = response.json()
-        transcript = result.get("text", "").strip()
         if not transcript:
             print("Warning: Transcription result was empty.")
         print("Transcription successful.")
         return transcript
-    except requests.exceptions.RequestException as e:
-        print(f"ERROR: Request to Hugging Face failed: {e}")
-        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}")
 def generate_notes_from_transcript(transcript: str) -> str:
@@ -236,4 +262,64 @@ def generate_notes_from_transcript(transcript: str) -> str:
     except Exception as e:
         print(f"ERROR: Hugging Face LLM API call failed: {e}")
-        raise HTTPException(status_code=503, detail=f"Note generation service failed: {e}")

 from fastapi import FastAPI, HTTPException, Request # The web framework
 from fastapi.middleware.cors import CORSMiddleware # For allowing frontend access
 from pydantic import BaseModel # For data validation
+from pydub import AudioSegment  # For splitting audio into chunks
 from huggingface_hub import InferenceClient # HF API client
 from dotenv import load_dotenv # To load .env file locally
              print(f"Audio downloaded successfully: {len(audio_bytes) / (1024*1024):.2f} MB")
              if not audio_bytes:
                  raise ValueError("Downloaded audio data is empty.")
+             audio_format = best_audio_format.get('ext', 'mp3')  # e.g., "webm", "m4a", etc.
+             return audio_bytes, audio_format
     except yt_dlp.utils.DownloadError as e:
         print(f"ERROR: yt-dlp failed to download or process audio: {e}")
         print(f"ERROR: Unexpected error during audio download: {e}")
         # Log the full traceback here in a real app: import traceback; traceback.print_exc()
         raise HTTPException(status_code=500, detail=f"An unexpected error occurred during audio processing: {e}")
def chunk_audio(audio_bytes: bytes, audio_format: str = "mp3", chunk_length_ms: int = 30000) -> list[bytes]:
    """
    Split encoded audio into fixed-duration chunks, each re-encoded as MP3.

    Args:
        audio_bytes (bytes): Encoded audio data.
        audio_format (str): Format hint passed to pydub when decoding
            (e.g. 'mp3', 'webm', 'm4a').
        chunk_length_ms (int): Duration of each chunk in milliseconds. Must be positive.

    Returns:
        list[bytes]: MP3-encoded chunks in playback order. The last chunk may be
        shorter than ``chunk_length_ms``.

    Raises:
        ValueError: If ``chunk_length_ms`` is not positive, or if the audio
            cannot be decoded with the given format.
    """
    # Validate before decoding: range() raises an unrelated error on a zero step
    # and yields nothing on a negative one, which would silently drop all audio.
    if chunk_length_ms <= 0:
        raise ValueError(f"chunk_length_ms must be a positive number of milliseconds, got {chunk_length_ms}.")
    try:
        audio = AudioSegment.from_file(io.BytesIO(audio_bytes), format=audio_format)
    except Exception as e:
        # Surface every decoding problem as ValueError, keeping the original cause attached.
        raise ValueError(f"Could not decode audio with format '{audio_format}': {e}") from e
    chunks = []
    # The segment is measured and sliced in the same unit as chunk_length_ms.
    for start_ms in range(0, len(audio), chunk_length_ms):
        chunk_buffer = io.BytesIO()
        # Export to mp3 regardless of input format
        audio[start_ms:start_ms + chunk_length_ms].export(chunk_buffer, format="mp3")
        chunks.append(chunk_buffer.getvalue())
    return chunks
 def transcribe_audio(audio_bytes: bytes) -> str:
     """
     Sends audio bytes to the Hugging Face ASR (Automatic Speech Recognition) API.
     """
+    if not hf_inference:
+        raise HTTPException(status_code=503, detail="Transcription service client not initialized.")
     if not audio_bytes:
         raise ValueError("Cannot transcribe empty audio data.")
     print(f"Transcribing {len(audio_bytes) / (1024*1024):.2f} MB using {ASR_MODEL}...")
     try:
+        # Use the InferenceClient for ASR task
+        # It expects the raw audio bytes
+        transcript_result = hf_inference.automatic_speech_recognition(
+            audio=audio_bytes,
+            model=ASR_MODEL
         )
+        transcript = transcript_result.get('text', '').strip() # Get text, default to '', remove whitespace
         if not transcript:
             print("Warning: Transcription result was empty.")
+            # Decide: return empty string or raise error? Let's return empty for now.
         print("Transcription successful.")
         return transcript
+    except Exception as e:
+        print(f"ERROR: Hugging Face ASR API call failed: {e}")
+        # Check for specific HF error types if possible
+        raise HTTPException(status_code=503, detail=f"Transcription service failed: {e}") # 503 Service Unavailable
 def generate_notes_from_transcript(transcript: str) -> str:
     except Exception as e:
         print(f"ERROR: Hugging Face LLM API call failed: {e}")
+        raise HTTPException(status_code=503, detail=f"Note generation service failed: {e}")
+# --- API Endpoints ---
@app.get("/")
async def read_root():
    """ Health check: a GET on BASE_URL/ returns a fixed status message. """
    status_payload = {"message": "YouTube Notes API is running!"}
    return status_payload
@app.get("/docs", include_in_schema=False)
async def custom_swagger_ui_html():
    """ Serve the Swagger UI page for the API at BASE_URL/docs (hidden from the schema). """
    # Imported lazily, only when the docs page is requested.
    from fastapi.openapi import docs as openapi_docs
    page_title = app.title + " - Docs"
    return openapi_docs.get_swagger_ui_html(openapi_url=app.openapi_url, title=page_title)
@app.get(app.openapi_url, include_in_schema=False)
async def get_open_api_endpoint():
    """ Build the OpenAPI schema from the app's title, version and routes, and return it. """
    # Imported lazily, only when the schema is requested.
    from fastapi.openapi import utils as openapi_utils
    schema = openapi_utils.get_openapi(title=app.title, version=app.version, routes=app.routes)
    return schema
@app.post("/process-video/")
async def process_video(request: ProcessRequest):
    """
    The main endpoint: receives a YouTube URL, processes it, and returns notes.
    Accessible via POST request to BASE_URL/process-video/

    Pipeline: download the audio, split it into chunks, transcribe each chunk,
    then generate notes from the joined transcript.

    Returns:
        dict: ``{"notes": <generated notes>}``.

    Raises:
        HTTPException: Re-raised unchanged when a helper raises one; 503 when
            every audio chunk fails to transcribe; 500 for any other error.
    """
    # NOTE(review): the helpers called below are plain synchronous functions, so this
    # coroutine blocks while they run - consider a thread pool if concurrency matters.
    print(f"Received request for URL: {request.youtubeUrl}")
    try:
        # Step 1: Download Audio
        audio_bytes, audio_format = download_audio_bytes(request.youtubeUrl)
        # Step 2: Chunk and Transcribe Audio
        audio_chunks = chunk_audio(audio_bytes, audio_format)
        partial_transcripts = []
        last_error = None
        for idx, chunk in enumerate(audio_chunks, start=1):
            print(f"Transcribing chunk {idx}/{len(audio_chunks)}...")
            try:
                partial_transcripts.append(transcribe_audio(chunk))
            except Exception as e:
                # Best effort: one bad chunk should not discard the rest of the video.
                print(f"Skipping chunk {idx} due to error: {e}")
                last_error = e
        if audio_chunks and not partial_transcripts:
            # Every chunk failed, so there is nothing to summarise. Report the failure
            # instead of generating notes from an empty transcript.
            reason = getattr(last_error, "detail", last_error)
            raise HTTPException(
                status_code=503,
                detail=f"Transcription failed for all {len(audio_chunks)} audio chunks: {reason}",
            ) from last_error
        full_transcript = " ".join(partial_transcripts).strip()
        # Step 3: Generate Notes
        notes = generate_notes_from_transcript(full_transcript)
        # Step 4: Return Notes
        print("Processing complete. Returning notes.")
        return {"notes": notes}
    except HTTPException as http_exc:
        # If an HTTPException was raised deliberately in helpers, re-raise it as-is
        print(f"HTTP Exception occurred: {http_exc.detail}")
        raise
    except Exception as e:
        # Catch any other unexpected errors during the process
        print(f"ERROR: Unhandled exception in /process-video/ endpoint: {e}")
        # Log traceback here in production: traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"An internal server error occurred: {e}") from e