File size: 5,463 Bytes
de2cf2d
453a2d3
bc49d10
453a2d3
5a2df22
 
25549be
5a2df22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38b78b9
453a2d3
5a2df22
e2db192
5a2df22
 
 
453a2d3
 
 
 
 
 
 
8e48050
5a2df22
 
 
 
453a2d3
 
 
8e48050
453a2d3
 
 
8e48050
453a2d3
5a2df22
 
 
 
453a2d3
5a2df22
 
 
 
 
 
 
453a2d3
5a2df22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
453a2d3
5a2df22
 
453a2d3
5a2df22
 
 
 
453a2d3
8e48050
de2cf2d
5a2df22
 
 
 
 
 
25549be
bc49d10
e2db192
453a2d3
bc49d10
 
 
 
 
 
 
 
5a2df22
 
 
 
 
 
bc49d10
e2db192
de2cf2d
bc49d10
e2db192
bc49d10
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
import gradio as gr
import yt_dlp
import re
from datetime import timedelta
import os
import browser_cookie3

def get_cookies():
    """Export the local browser's YouTube cookies to a Netscape cookie file.

    Chrome is tried first and Firefox second. Only cookies whose domain is
    exactly ``.youtube.com`` are written.

    Returns:
        The path of the cookie file (``/tmp/youtube.txt``), or ``None`` when
        neither browser's cookie store could be read.
    """
    cookies = None
    for browser_name in ('chrome', 'firefox'):
        try:
            loader = getattr(browser_cookie3, browser_name)
            cookies = loader(domain_name='.youtube.com')
            break
        except Exception:
            # Best effort: a missing browser, locked profile or decryption
            # failure just means we try the next browser. Unlike a bare
            # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
            continue

    # Compare against None explicitly: an empty cookie jar is falsy but is
    # still a successful read.
    if cookies is None:
        return None

    cookie_path = '/tmp/youtube.txt'
    with open(cookie_path, 'w', encoding='utf-8') as f:
        # Mozilla-format cookie loaders reject files without this magic line.
        f.write("# Netscape HTTP Cookie File\n")
        for cookie in cookies:
            if cookie.domain != '.youtube.com':
                continue
            secure_flag = 'TRUE' if cookie.secure else 'FALSE'
            # Session cookies have no expiry; write 0 instead of "None" so
            # the expiry column stays numeric.
            expires = cookie.expires or 0
            f.write(
                f"{cookie.domain}\tTRUE\t{cookie.path}\t"
                f"{secure_flag}\t{expires}\t"
                f"{cookie.name}\t{cookie.value}\n"
            )
    return cookie_path

# Matches the HH:MM:SS prefix of a cue timing line such as
# "00:01:02.000 --> 00:01:05.000".
_CUE_START_RE = re.compile(r'(\d{2}):(\d{2}):(\d{2})')


def _load_subtitle_text(ydl, entries):
    """Return the raw subtitle text for one language track, or None."""
    dict_entries = [entry for entry in entries if isinstance(entry, dict)]

    # Inline text under 'src' is what the original implementation expected;
    # keep honouring it first for backward compatibility.
    for entry in dict_entries:
        if 'src' in entry:
            return entry['src']

    # yt-dlp describes each subtitle format with 'ext' plus either inline
    # 'data' or a 'url' to fetch. Only WebVTT is usable by the cue parser.
    for entry in dict_entries:
        if entry.get('ext') != 'vtt':
            continue
        if entry.get('data'):
            return entry['data']
        if entry.get('url'):
            payload = ydl.urlopen(entry['url']).read()
            return payload.decode('utf-8', errors='replace')
    return None


def _format_cues(raw_text):
    """Convert WebVTT-style text into a list of "[time] text" strings."""
    cue_lines = []
    current_time = None
    current_text = []

    for line in raw_text.splitlines():
        stamp = _CUE_START_RE.match(line)
        if stamp:
            # A new timing line closes the previous cue.
            if current_time and current_text:
                cue_lines.append(f"[{current_time}] {' '.join(current_text)}")
            hours, minutes, seconds = stamp.groups()
            # Keep the short MM:SS form, but show the hour once it is
            # non-zero so long videos do not get ambiguous timestamps.
            if hours == '00':
                current_time = f"{minutes}:{seconds}"
            else:
                current_time = f"{hours}:{minutes}:{seconds}"
            # Anything collected before the first timing line (file header
            # fields) is discarded here.
            current_text = []
        elif line.strip() and not line.startswith('WEBVTT'):
            current_text.append(line.strip())

    # Flush the final cue.
    if current_time and current_text:
        cue_lines.append(f"[{current_time}] {' '.join(current_text)}")
    return cue_lines


def extract_transcript(url):
    """Extract a timestamped transcript for a YouTube video via yt-dlp.

    Subtitle tracks are tried in this order: manual Korean, automatic
    Korean, manual English, automatic English. The first track that yields
    subtitle text is used.

    Args:
        url: URL of the video.

    Returns:
        A string with a title line followed by one ``[time] text`` line per
        cue, or a user-facing message when no subtitles are available or an
        error occurs. Exceptions are reported in the returned string rather
        than raised.
    """
    cookie_path = None
    try:
        cookie_path = get_cookies()

        ydl_opts = {
            'writesubtitles': True,
            'writeautomaticsub': True,
            'subtitleslangs': ['ko', 'en'],
            'skip_download': True,
            'quiet': True
        }

        # Add cookies if available
        if cookie_path:
            ydl_opts['cookiefile'] = cookie_path

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=False)

            subtitles = info.get('subtitles') or {}
            automatic_captions = info.get('automatic_captions') or {}

            subtitle_priorities = [
                ('ko', subtitles),           # Manual Korean
                ('ko', automatic_captions),  # Auto Korean
                ('en', subtitles),           # Manual English
                ('en', automatic_captions)   # Auto English
            ]

            cue_lines = None
            for lang, sub_dict in subtitle_priorities:
                subs = sub_dict.get(lang)
                if not isinstance(subs, list) or not subs:
                    continue
                raw_text = _load_subtitle_text(ydl, subs)
                if raw_text is None:
                    # Track is listed but has no usable text; fall through
                    # to the next priority instead of returning title-only.
                    continue
                cue_lines = _format_cues(raw_text)
                break

        if cue_lines is None:
            return "μžλ§‰μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€. (μžλ™ 생성 μžλ§‰ 포함)"

        header = f"제λͺ©: {info.get('title', '제λͺ© μ—†μŒ')}\n\n"
        return header + ''.join(f"{cue}\n" for cue in cue_lines)

    except Exception as e:
        error_msg = str(e)
        if "Sign in to confirm your age" in error_msg:
            return "μ—°λ Ή μ œν•œμ΄ μžˆλŠ” μ˜μƒμž…λ‹ˆλ‹€."
        elif "confirm you're not a bot" in error_msg:
            return "YouTubeκ°€ 봇 방지λ₯Ό μœ„ν•΄ 인증을 μš”κ΅¬ν•©λ‹ˆλ‹€. μž μ‹œ ν›„ λ‹€μ‹œ μ‹œλ„ν•΄μ£Όμ„Έμš”."
        return f"μžλ§‰ μΆ”μΆœ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {error_msg}"
    finally:
        # Remove the exported cookies on every exit path (success, "no
        # subtitles", or error). This runs after the YoutubeDL context has
        # closed, so a cookie jar written back on close (NOTE(review):
        # presumed yt-dlp behaviour, confirm) is removed as well.
        if cookie_path and os.path.exists(cookie_path):
            os.remove(cookie_path)

# Build the UI widgets first, then wire them to extract_transcript.
_url_box = gr.Textbox(
    label="YouTube URL",
    placeholder="https://www.youtube.com/watch?v=...",
)
_transcript_box = gr.Textbox(label="μΆ”μΆœλœ 슀크립트", lines=20)

# Description text shown under the title.
_ui_description = """
    YouTube μ˜μƒμ˜ URL을 μž…λ ₯ν•˜λ©΄ μžλ§‰μ„ μΆ”μΆœν•©λ‹ˆλ‹€.
    - ν•œκ΅­μ–΄ μžλ§‰ μš°μ„  (μˆ˜λ™ > μžλ™)
    - μ˜μ–΄ μžλ§‰ μ°¨μ„  (μˆ˜λ™ > μžλ™)
    """

iface = gr.Interface(
    fn=extract_transcript,
    inputs=_url_box,
    outputs=_transcript_box,
    title="YouTube μžλ§‰ μΆ”μΆœκΈ°",
    description=_ui_description,
    allow_flagging="never",
)

# Start the web server only when executed as a script; "0.0.0.0" is passed
# as the server name so the app is not bound to localhost only.
if __name__ == "__main__":
    _launch_options = {"server_name": "0.0.0.0"}
    iface.launch(**_launch_options)