Spaces:
Sleeping
Sleeping
from __future__ import annotations | |
from youtube_transcript_api import YouTubeTranscriptApi | |
from urllib.parse import urlparse, parse_qs | |
def extract_youtube_video_id(url: str) -> str:
    """Extract the video ID from a YouTube URL.

    Recognised forms:

    * ``https://youtube.com/watch?v=<id>`` (also ``www.``, ``m.`` and
      ``music.`` hosts)
    * ``https://youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``
      and ``/v/<id>``
    * ``https://youtu.be/<id>``

    Args:
        url: The URL to inspect.

    Returns:
        The video ID found in the URL.

    Raises:
        ValueError: If the host is not a known YouTube host or no video ID
            can be found in the URL.
    """
    youtube_hosts = (
        "www.youtube.com",
        "youtube.com",
        "m.youtube.com",
        "music.youtube.com",
    )
    # Path prefixes whose next segment is the video ID.
    id_path_prefixes = ("embed", "shorts", "live", "v")

    parsed = urlparse(url)
    host = parsed.hostname
    # Drop empty segments so leading/trailing/double slashes do not end up
    # inside the returned ID.
    segments = [part for part in parsed.path.split("/") if part]

    if host == "youtu.be":
        if segments:
            return segments[0]
    elif host in youtube_hosts:
        # .get() instead of ['v'] so a missing parameter falls through to
        # the ValueError below rather than leaking a KeyError.
        candidates = parse_qs(parsed.query).get("v")
        if candidates:
            return candidates[0]
        if len(segments) >= 2 and segments[0] in id_path_prefixes:
            return segments[1]

    raise ValueError("Invalid YouTube URL")
def download_transcript(url: str) -> str:
    """Download the transcript of a YouTube video as a single string.

    The video ID is taken from ``url`` with ``extract_youtube_video_id``,
    a progress line is printed to stdout, and the text of every transcript
    entry is joined with single spaces.

    Args:
        url: URL of the YouTube video.

    Returns:
        The transcript text, entries separated by one space.

    Raises:
        ValueError: Propagated from ``extract_youtube_video_id`` when the
            URL is not recognised.
        Exception: Whatever the transcript library raises when the
            transcript cannot be retrieved is propagated unchanged.
    """
    video_id = extract_youtube_video_id(url)
    print(f"Downloading transcript for video ID: {video_id}")

    # The static get_transcript() helper was deprecated in
    # youtube-transcript-api 1.0 and later removed. Prefer it while it
    # exists so behaviour on older installs is unchanged; otherwise use the
    # instance API, whose to_raw_data() returns the same list of dicts.
    legacy_get_transcript = getattr(YouTubeTranscriptApi, "get_transcript", None)
    if legacy_get_transcript is not None:
        entries = legacy_get_transcript(video_id)
    else:
        entries = YouTubeTranscriptApi().fetch(video_id).to_raw_data()

    return " ".join(entry["text"] for entry in entries)