TherapyNote / utils /youtube.py
abagherp's picture
Upload folder using huggingface_hub
6830eb0 verified
raw
history blame contribute delete
855 Bytes
from __future__ import annotations
from youtube_transcript_api import YouTubeTranscriptApi
from urllib.parse import urlparse, parse_qs
def extract_youtube_video_id(url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Recognized forms:

    * ``https://www.youtube.com/watch?v=<id>`` (also the bare, ``m.`` and
      ``music.`` hosts)
    * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>``,
      ``/live/<id>`` and ``/v/<id>``
    * ``https://youtu.be/<id>``

    Args:
        url: The URL to inspect.

    Returns:
        The non-empty video ID found in the URL.

    Raises:
        ValueError: If the host is not a recognized YouTube host or no
            video ID can be found in the URL.
    """
    parsed = urlparse(url)
    host = parsed.hostname or ""
    # Non-empty path segments, e.g. "/embed/abc/" -> ["embed", "abc"].
    segments = [part for part in parsed.path.split("/") if part]

    video_id = ""
    if host in ("www.youtube.com", "youtube.com", "m.youtube.com", "music.youtube.com"):
        # .get() instead of ['v'] so a missing or blank parameter falls
        # through to the ValueError below rather than leaking a KeyError.
        query_ids = parse_qs(parsed.query).get("v")
        if query_ids:
            video_id = query_ids[0]
        elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            video_id = segments[1]
    elif host == "youtu.be":
        # Only the first segment is the ID; ignore anything after it.
        if segments:
            video_id = segments[0]

    if not video_id:
        raise ValueError("Invalid YouTube URL")
    return video_id
def download_transcript(url: str) -> str:
    """Fetch the transcript of a YouTube video and return it as one string.

    The video ID is derived from ``url`` via ``extract_youtube_video_id``,
    a progress line is printed to stdout, and the ``'text'`` field of every
    transcript entry is joined with single spaces.

    Args:
        url: URL of the YouTube video.

    Returns:
        The transcript text, with entries separated by single spaces.
    """
    vid = extract_youtube_video_id(url)
    print(f"Downloading transcript for video ID: {vid}")
    entries = YouTubeTranscriptApi.get_transcript(vid)
    # Each entry is indexed by 'text'; other fields are not used here.
    return " ".join(entry['text'] for entry in entries)