yoon2566 committed on
Commit
bc49d10
·
verified ·
1 Parent(s): 8e48050

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -102
app.py CHANGED
@@ -1,126 +1,99 @@
1
  import gradio as gr
2
  from youtube_transcript_api import YouTubeTranscriptApi
3
- from youtube_transcript_api._errors import NoTranscriptFound, TranscriptsDisabled, NoTranscriptAvailable
4
- import requests
5
- import json
6
- import os
7
-
8
- def get_video_info(video_id):
9
- """YouTube μ˜μƒ 정보λ₯Ό κ°€μ Έμ˜€λŠ” ν•¨μˆ˜"""
10
- try:
11
- response = requests.get(f'https://www.youtube.com/oembed?url=http://www.youtube.com/watch?v={video_id}&format=json')
12
- return response.json()
13
- except:
14
- return None
15
 
16
  def extract_video_id(url):
17
- """YouTube URLμ—μ„œ λΉ„λ””μ˜€ IDλ₯Ό μΆ”μΆœν•˜λŠ” ν•¨μˆ˜"""
18
- video_id = None
19
- try:
20
- if 'youtube.com/watch?v=' in url:
21
- video_id = url.split('youtube.com/watch?v=')[1].split('&')[0][:11]
22
- elif 'youtu.be/' in url:
23
- video_id = url.split('youtu.be/')[1].split('?')[0][:11]
24
- return video_id
25
- except:
26
- return None
27
-
28
- def get_available_transcripts(video_id):
29
- """μ‚¬μš© κ°€λŠ₯ν•œ λͺ¨λ“  μžλ§‰ λͺ©λ‘μ„ κ°€μ Έμ˜€λŠ” ν•¨μˆ˜"""
30
- try:
31
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
32
- return transcript_list
33
- except:
34
- return None
35
 
36
  def get_transcript(url):
37
- """YouTube μ˜μƒμ˜ 슀크립트λ₯Ό μΆ”μΆœν•˜λŠ” ν•¨μˆ˜"""
38
  try:
39
- # λΉ„λ””μ˜€ ID μΆ”μΆœ
40
  video_id = extract_video_id(url)
41
  if not video_id:
42
- return "μ˜¬λ°”λ₯Έ YouTube URL을 μž…λ ₯ν•΄μ£Όμ„Έμš”."
43
-
44
- # μ˜μƒ 정보 확인
45
- video_info = get_video_info(video_id)
46
- if not video_info:
47
- return "μ˜μƒ 정보λ₯Ό κ°€μ Έμ˜¬ 수 μ—†μŠ΅λ‹ˆλ‹€. URL을 ν™•μΈν•΄μ£Όμ„Έμš”."
48
 
49
- # μžλ§‰ μΆ”μΆœ μ‹œλ„
50
- transcript_list = None
51
- transcript_data = None
52
 
 
 
53
  try:
54
- # λͺ¨λ“  κ°€λŠ₯ν•œ μžλ§‰ λͺ©λ‘ κ°€μ Έμ˜€κΈ°
55
- transcript_list = get_available_transcripts(video_id)
56
-
57
- if transcript_list:
58
- # 1. μˆ˜λ™ ν•œκ΅­μ–΄ μžλ§‰ μ‹œλ„
 
 
59
  try:
60
- transcript = transcript_list.find_manually_created_transcript(['ko'])
61
- transcript_data = transcript.fetch()
62
  except:
63
- # 2. μžλ™ μƒμ„±λœ ν•œκ΅­μ–΄ μžλ§‰ μ‹œλ„
64
- try:
65
- transcript = transcript_list.find_generated_transcript(['ko'])
66
- transcript_data = transcript.fetch()
67
- except:
68
- # 3. μ˜μ–΄ μžλ§‰ μ‹œλ„ (μˆ˜λ™ λ˜λŠ” μžλ™)
69
- try:
70
- transcript = transcript_list.find_transcript(['en'])
71
- transcript_data = transcript.fetch()
72
- except:
73
- return "μ§€μ›λ˜λŠ” μžλ§‰μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
74
-
75
- if not transcript_data:
76
- # 직접 API둜 μ‹œλ„
77
- transcript_data = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
78
 
79
- # 전체 슀크립트 ν…μŠ€νŠΈ ꡬ성
80
- full_transcript = f"제λͺ©: {video_info.get('title', '제λͺ© μ—†μŒ')}\n\n"
81
- for transcript in transcript_data:
82
- text = transcript['text']
83
- timestamp = transcript['start']
84
- minutes = int(timestamp // 60)
85
- seconds = int(timestamp % 60)
86
- full_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
87
 
88
- return full_transcript
89
-
90
- except NoTranscriptAvailable:
91
- return "이 μ˜μƒμ—λŠ” μžλ§‰μ΄ μ—†μŠ΅λ‹ˆλ‹€."
92
- except TranscriptsDisabled:
93
- return "이 μ˜μƒμ€ μžλ§‰μ΄ λΉ„ν™œμ„±ν™”λ˜μ–΄ μžˆμŠ΅λ‹ˆλ‹€."
94
- except Exception as e:
95
- return f"μžλ§‰ μΆ”μΆœ 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
96
 
97
  except Exception as e:
98
- return f"처리 쀑 였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- # Gradio μΈν„°νŽ˜μ΄μŠ€ 생성
101
  iface = gr.Interface(
102
  fn=get_transcript,
103
- inputs=[
104
- gr.Textbox(label="YouTube URL을 μž…λ ₯ν•˜μ„Έμš”", placeholder="https://www.youtube.com/watch?v=...")
105
- ],
106
- outputs=gr.Textbox(label="μΆ”μΆœλœ 슀크립트", lines=20),
107
- title="YouTube 슀크립트 μΆ”μΆœκΈ°",
108
- description="""
109
- YouTube μ˜μƒμ˜ URL을 μž…λ ₯ν•˜λ©΄ μžλ™μœΌλ‘œ 슀크립트λ₯Ό μΆ”μΆœν•©λ‹ˆλ‹€.
110
- - ν•œκ΅­μ–΄ μžλ§‰ (μˆ˜λ™/μžλ™)
111
- - μ˜μ–΄ μžλ§‰ (μˆ˜λ™/μžλ™)
112
- 을 순차적으둜 μ‹œλ„ν•©λ‹ˆλ‹€.
113
- """,
114
- examples=[
115
- ["https://www.youtube.com/watch?v=example1"],
116
- ["https://youtu.be/example2"]
117
- ]
118
  )
119
 
120
- # ν™˜κ²½λ³€μˆ˜ μ„€μ •
121
- os.environ['GRADIO_SERVER_NAME'] = "0.0.0.0"
122
- os.environ['GRADIO_SERVER_PORT'] = "7860"
123
-
124
- # μ• ν”Œλ¦¬μΌ€μ΄μ…˜ μ‹€ν–‰
125
  if __name__ == "__main__":
126
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
1
  import gradio as gr
2
  from youtube_transcript_api import YouTubeTranscriptApi
3
+ import re
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def extract_video_id(url):
6
+ """Extract video ID from YouTube URL"""
7
+ patterns = [
8
+ r'(?:youtube\.com\/watch\?v=|youtu.be\/|youtube.com\/embed\/)([^&\n?]*)',
9
+ r'(?:youtube\.com\/shorts\/)([^&\n?]*)'
10
+ ]
11
+
12
+ for pattern in patterns:
13
+ match = re.search(pattern, url)
14
+ if match:
15
+ return match.group(1)
16
+ return None
 
 
 
 
 
 
 
17
 
18
  def get_transcript(url):
19
+ """Get transcript from YouTube video"""
20
  try:
21
+ # 1. Extract video ID
22
  video_id = extract_video_id(url)
23
  if not video_id:
24
+ return "μœ νš¨ν•œ YouTube URL이 μ•„λ‹™λ‹ˆλ‹€. λ‹€μ‹œ ν™•μΈν•΄μ£Όμ„Έμš”."
 
 
 
 
 
25
 
26
+ # 2. Get transcript list
27
+ transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
 
28
 
29
+ # 3. Try to get transcript in preferred order
30
+ transcript = None
31
  try:
32
+ # Try manual Korean transcript first
33
+ transcript = transcript_list.find_manually_created_transcript(['ko'])
34
+ except:
35
+ try:
36
+ # Try auto-generated Korean transcript
37
+ transcript = transcript_list.find_generated_transcript(['ko'])
38
+ except:
39
  try:
40
+ # Try English transcript
41
+ transcript = transcript_list.find_transcript(['en'])
42
  except:
43
+ # Try any available transcript
44
+ transcript = transcript_list.find_transcript(['ko', 'en', 'ja', 'zh-Hans'])
45
+
46
+ # 4. Fetch and format transcript
47
+ if transcript:
48
+ transcript_data = transcript.fetch()
49
+ formatted_transcript = ""
 
 
 
 
 
 
 
 
50
 
51
+ for entry in transcript_data:
52
+ time = int(entry['start'])
53
+ minutes = time // 60
54
+ seconds = time % 60
55
+ text = entry['text'].replace('\n', ' ')
56
+ formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
 
 
57
 
58
+ return formatted_transcript if formatted_transcript else "μžλ§‰μ„ μΆ”μΆœν•  수 μ—†μŠ΅λ‹ˆλ‹€."
59
+ else:
60
+ return "μžλ§‰μ„ 찾을 수 μ—†μŠ΅λ‹ˆλ‹€."
 
 
 
 
 
61
 
62
  except Exception as e:
63
+ error_msg = str(e)
64
+ if "Subtitles are disabled" in error_msg:
65
+ # Try alternative method
66
+ try:
67
+ transcript_data = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
68
+ formatted_transcript = ""
69
+ for entry in transcript_data:
70
+ time = int(entry['start'])
71
+ minutes = time // 60
72
+ seconds = time % 60
73
+ text = entry['text'].replace('\n', ' ')
74
+ formatted_transcript += f"[{minutes:02d}:{seconds:02d}] {text}\n"
75
+ return formatted_transcript
76
+ except:
77
+ return "이 μ˜μƒμ—μ„œ μžλ§‰μ„ μΆ”μΆœν•  수 μ—†μŠ΅λ‹ˆλ‹€."
78
+ else:
79
+ return f"였λ₯˜κ°€ λ°œμƒν–ˆμŠ΅λ‹ˆλ‹€: {error_msg}"
80
 
81
+ # Create Gradio interface
82
  iface = gr.Interface(
83
  fn=get_transcript,
84
+ inputs=gr.Textbox(
85
+ label="YouTube URL",
86
+ placeholder="https://www.youtube.com/watch?v=..."
87
+ ),
88
+ outputs=gr.Textbox(
89
+ label="μΆ”μΆœλœ 슀크립트",
90
+ lines=20
91
+ ),
92
+ title="YouTube μžλ§‰ μΆ”μΆœκΈ°",
93
+ description="YouTube μ˜μƒμ˜ URL을 μž…λ ₯ν•˜λ©΄ μžλ§‰μ„ μΆ”μΆœν•©λ‹ˆλ‹€. (ν•œκ΅­μ–΄ μš°μ„ , μ˜μ–΄ μ°¨μ„ )",
94
+ allow_flagging="never"
 
 
 
 
95
  )
96
 
97
+ # Launch the app
 
 
 
 
98
  if __name__ == "__main__":
99
+ iface.launch(server_name="0.0.0.0")