yoon2566 committed on
Commit
453a2d3
·
verified ·
1 Parent(s): fcb165f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +99 -60
app.py CHANGED
@@ -1,75 +1,114 @@
1
  import gradio as gr
2
- from pytube import YouTube
3
  import re
4
- import json
5
- from html import unescape
6
 
7
def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Recognised URL shapes are ``youtube.com/watch?...v=<id>``,
    ``youtu.be/<id>``, ``youtube.com/embed/<id>`` and
    ``youtube.com/shorts/<id>``.

    Args:
        url: The URL text to inspect.

    Returns:
        The video ID string, or ``None`` when ``url`` is empty or no
        non-empty ID can be found.
    """
    if not url:
        return None

    patterns = (
        # Dots are escaped so that e.g. "youtuXbe/" is not accepted. The lazy
        # optional group lets "v=" appear after other query parameters while
        # still preferring the first "v=" in the query string.
        r'(?:youtube\.com/watch\?(?:[^#\s]*?&)??v=|youtu\.be/|youtube\.com/embed/)'
        r'([^&\n?#]+)',
        r'youtube\.com/shorts/([^&\n?#]+)',
    )

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            # "+" in the capture group guarantees the ID is never empty.
            return match.group(1)
    return None
19
 
20
def get_transcript(url):
    """Return a timestamped transcript of a YouTube video using pytube.

    Caption tracks are tried in this order: Korean, auto-generated Korean,
    English, auto-generated English.

    Args:
        url: URL of the YouTube video.

    Returns:
        str: The video title followed by one ``[MM:SS] text`` line per
        caption. If no caption track is available, or an error occurs, a
        human-readable Korean message is returned instead of raising.
    """
    try:
        yt = YouTube(url)
        captions = yt.captions

        # First available track in preference order wins.
        caption_track = None
        for code in ('ko', 'a.ko', 'en', 'a.en'):
            if code in captions:
                caption_track = captions[code]
                break

        if caption_track is None:
            return f"자막을 찾을 수 없습니다.\n제목: {yt.title}\n길이: {yt.length}초"

        xml_captions = caption_track.xml_captions

        # DOTALL is required: caption text may span several lines, and without
        # it "." stops at the newline so those captions would be dropped.
        caption_pattern = re.compile(
            r'<text start="(\d+(?:\.\d+)?)"[^>]*>(.*?)</text>',
            re.DOTALL,
        )

        lines = [f"제목: {yt.title}\n"]
        for match in caption_pattern.finditer(xml_captions):
            start_time = float(match.group(1))
            text = unescape(match.group(2)).replace('\n', ' ')
            minutes = int(start_time // 60)
            seconds = int(start_time % 60)
            lines.append(f"[{minutes:02d}:{seconds:02d}] {text}")

        # Title line, blank line, then one caption per line, newline-terminated.
        return "\n".join(lines) + "\n"

    except Exception as e:
        # Top-level UI boundary: report the failure as text instead of raising.
        error_msg = str(e)
        if "age restricted" in error_msg.lower():
            return "연령 제한이 있는 영상입니다."
        return f"자막 추출 중 오류가 발생했습니다: {error_msg}"
69
 
70
  # Create Gradio interface
71
  iface = gr.Interface(
72
- fn=get_transcript,
73
  inputs=gr.Textbox(
74
  label="YouTube URL",
75
  placeholder="https://www.youtube.com/watch?v=..."
@@ -78,7 +117,7 @@ iface = gr.Interface(
78
  label="์ถ”์ถœ๋œ ์Šคํฌ๋ฆฝํŠธ",
79
  lines=20
80
  ),
81
- title="YouTube ์ž๋ง‰ ์ถ”์ถœ๊ธฐ (pytube ๋ฒ„์ „)",
82
  description="YouTube ์˜์ƒ์˜ URL์„ ์ž…๋ ฅํ•˜๋ฉด ์ž๋ง‰์„ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค. (ํ•œ๊ตญ์–ด ์šฐ์„ , ์˜์–ด ์ฐจ์„ )",
83
  allow_flagging="never"
84
  )
 
1
  import gradio as gr
2
+ import yt_dlp
3
  import re
4
+ from datetime import timedelta
 
5
 
6
def format_timestamp(seconds):
    """Convert an offset in seconds to a zero-padded ``MM:SS`` string.

    Args:
        seconds: Offset in seconds (int or float), expected to be
            non-negative. Fractional seconds are truncated.

    Returns:
        str: The offset as ``"MM:SS"``. Minutes are not wrapped at 60, so an
        offset of an hour or more stays unambiguous (3725 -> ``"62:05"``).
    """
    minutes, secs = divmod(int(seconds), 60)
    return f"{minutes:02d}:{secs:02d}"
 
 
 
 
 
 
 
 
 
9
 
10
def _pick_vtt_url(info):
    """Return the URL of the preferred WebVTT caption track, or None."""
    subtitles = info.get('subtitles') or {}
    automatic_captions = info.get('automatic_captions') or {}

    # Preference order: manual Korean, auto-generated Korean,
    # manual English, auto-generated English.
    candidates = (
        subtitles.get('ko'),
        automatic_captions.get('ko'),
        subtitles.get('en'),
        automatic_captions.get('en'),
    )
    for tracks in candidates:
        for track in tracks or ():
            # Only WebVTT is accepted: the formatter below parses WebVTT cues,
            # so an XML (srv1/srv2/srv3) track would yield an empty transcript.
            if track.get('ext') == 'vtt' and track.get('url'):
                return track['url']
    return None


def _format_vtt(vtt_text, title):
    """Render WebVTT text as a title line plus ``[M:SS] text`` lines."""
    # Hours are optional in WebVTT timestamps; the end time is not needed.
    cue_start = re.compile(r'(?:(\d+):)?(\d{2}):(\d{2})\.\d{3}\s+-->\s+')
    # Inline markup such as <c> or <00:00:01.000> inside cue text.
    inline_tag = re.compile(r'<[^>]+>')

    cues = []  # (total_minutes, seconds, [text fragments])
    in_cue = False
    for raw_line in vtt_text.splitlines():
        line = raw_line.strip()
        match = cue_start.match(line)
        if match:
            hours, minutes, secs = match.groups()
            # Fold hours into minutes so times past 59:59 do not wrap around.
            total_minutes = int(hours or 0) * 60 + int(minutes)
            cues.append((total_minutes, int(secs), []))
            in_cue = True
        elif not line:
            # A blank line ends the cue; header lines, NOTE blocks and cue
            # identifiers that follow are not transcript text.
            in_cue = False
        elif in_cue:
            text = inline_tag.sub('', line).strip()
            if text:
                cues[-1][2].append(text)

    body = [
        f"[{mins}:{secs:02d}] {' '.join(parts)}\n"
        for mins, secs, parts in cues
        if parts
    ]
    return f"제목: {title}\n\n" + ''.join(body)


def extract_transcript(url):
    """Extract a timestamped transcript from a YouTube video using yt-dlp.

    Caption tracks are tried in this order: manual Korean, auto-generated
    Korean, manual English, auto-generated English. The chosen WebVTT track
    is downloaded through the same ``YoutubeDL`` instance.

    Args:
        url: URL of the YouTube video.

    Returns:
        str: The video title followed by one ``[M:SS] text`` line per cue.
        If no usable caption track exists, or an error occurs, a
        human-readable Korean message is returned instead of raising.
    """
    try:
        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['ko', 'en'],
            'skip_download': True,
            'quiet': True,
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            # Metadata only; the info dict lists the available caption tracks.
            info = ydl.extract_info(url, download=False)

            subtitle_text = None
            subtitle_url = _pick_vtt_url(info)
            if subtitle_url:
                # Fetch the caption document itself via yt-dlp's HTTP opener.
                response = ydl.urlopen(subtitle_url)
                subtitle_text = response.read().decode('utf-8', errors='replace')

        if not subtitle_text:
            return f"자막을 찾을 수 없습니다.\n제목: {info.get('title')}"

        return _format_vtt(subtitle_text, info.get('title'))

    except Exception as e:
        # Top-level UI boundary: report the failure as text instead of raising.
        return f"자막 추출 중 오류가 발생했습니다: {e}"
 
 
 
108
 
109
  # Create Gradio interface
110
  iface = gr.Interface(
111
+ fn=extract_transcript,
112
  inputs=gr.Textbox(
113
  label="YouTube URL",
114
  placeholder="https://www.youtube.com/watch?v=..."
 
117
  label="์ถ”์ถœ๋œ ์Šคํฌ๋ฆฝํŠธ",
118
  lines=20
119
  ),
120
+ title="YouTube ์ž๋ง‰ ์ถ”์ถœ๊ธฐ (yt-dlp ๋ฒ„์ „)",
121
  description="YouTube ์˜์ƒ์˜ URL์„ ์ž…๋ ฅํ•˜๋ฉด ์ž๋ง‰์„ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค. (ํ•œ๊ตญ์–ด ์šฐ์„ , ์˜์–ด ์ฐจ์„ )",
122
  allow_flagging="never"
123
  )