Spaces:

JunyiAcademy
/

vaitor2

Running

App Files Community

youngtsai committed on Feb 4, 2024

Commit

f99c291

1 Parent(s): b300db2

transcript = process_transcript_and_screenshots(video_id)

Browse files

Files changed (1) hide show

app.py +54 -15

app.py CHANGED Viewed

@@ -136,6 +136,27 @@ def set_public_permission(service, file_id):
         fields='id',
     ).execute()
 def process_file(file):
     # 读取文件
     if file.name.endswith('.csv'):
@@ -193,33 +214,48 @@ def extract_youtube_id(url):
     else:
         return None
-def process_youtube_link(link):
-    # 使用 YouTube API 获取逐字稿
-    # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
-    video_id = extract_youtube_id(link)
     service = init_drive_service()
-    parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'  # youtube逐字稿圖檔的ID
-    # 检查/创建视频ID命名的子文件夹
     folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
-    file_name = f"{video_id}_transcript.txt"
     # 检查逐字稿是否存在
-    transcript = None
     exists, file_id = check_file_exists(service, folder_id, file_name)
     if not exists:
         transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
         transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
-        upload_content_directly(service, file_name, folder_id, transcript_text)
         print("逐字稿已上传到Google Drive")
     else:
         print("逐字稿已存在于Google Drive中")
         transcript_text = download_file_as_string(service, file_id)
         transcript = json.loads(transcript_text)
-    # 基于逐字稿生成其他所需的输出
-    questions = generate_questions(transcript)
-    df_summarise = generate_df_summarise(transcript)
     formatted_transcript = []
     screenshot_paths = []
@@ -228,8 +264,7 @@ def process_youtube_link(link):
         start_time = format_seconds_to_time(entry['start'])
         end_time = format_seconds_to_time(entry['start'] + entry['duration'])
         embed_url = get_embedded_youtube_link(video_id, entry['start'])
-        # 截圖
-        screenshot_path = screenshot_youtube_video(video_id, entry['start'])
         line = {
             "start_time": start_time,
             "end_time": end_time,
@@ -245,6 +280,10 @@ def process_youtube_link(link):
     print(html_content)
     print("=====html_content=====")
     # 确保返回与 UI 组件预期匹配的输出
     return questions[0] if len(questions) > 0 else "", \
             questions[1] if len(questions) > 1 else "", \

         fields='id',
     ).execute()
+def update_file_on_drive(service, file_id, file_content):
+    """
+    更新Google Drive上的文件内容。
+    参数:
+    - service: Google Drive API服务实例。
+    - file_id: 要更新的文件的ID。
+    - file_content: 新的文件内容，字符串格式。
+    """
+    # 将新的文件内容转换为字节流
+    fh = io.BytesIO(file_content.encode('utf-8'))
+    media = MediaIoBaseUpload(fh, mimetype='application/json', resumable=True)
+    # 更新文件
+    updated_file = service.files().update(
+        fileId=file_id,
+        media_body=media
+    ).execute()
+    print(f"文件已更新，文件ID: {updated_file['id']}")
 def process_file(file):
     # 读取文件
     if file.name.endswith('.csv'):
     else:
         return None
+def process_transcript_and_screenshots(video_id):
     service = init_drive_service()
+    parent_folder_id = '1GgI4YVs0KckwStVQkLa1NZ8IpaEMurkL'
     folder_id = create_folder_if_not_exists(service, video_id, parent_folder_id)
+    file_name = f'{video_id}_transcript.json'
     # 检查逐字稿是否存在
     exists, file_id = check_file_exists(service, folder_id, file_name)
     if not exists:
+        # 从YouTube获取逐字稿并上传
         transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['zh-TW'])
         transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
+        file_id = upload_content_directly(service, file_name, folder_id, transcript_text)
         print("逐字稿已上传到Google Drive")
     else:
+        # 逐字稿已存在，下载逐字稿内容
         print("逐字稿已存在于Google Drive中")
         transcript_text = download_file_as_string(service, file_id)
         transcript = json.loads(transcript_text)
+    # 处理逐字稿中的每个条目，检查并上传截图
+    for entry in transcript:
+        if 'img_src' not in entry:
+            screenshot_path = screenshot_youtube_video(video_id, entry['start'])
+            img_file_id = upload_img_directly(service, f"{video_id}_{entry['start']}.jpg", folder_id, screenshot_path)
+            img_src = f"https://drive.google.com/uc?export=view&id={img_file_id}"
+            entry['img_src'] = img_src
+            # 删除本地截图文件
+            os.remove(screenshot_path)
+    # 更新逐字稿文件
+    updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
+    update_file_on_drive(service, file_id, updated_transcript_text)
+    print("逐字稿已更新，包括截图链接")
+    return transcript
+def process_youtube_link(link):
+    # 使用 YouTube API 获取逐字稿
+    # 假设您已经获取了 YouTube 视频的逐字稿并存储在变量 `transcript` 中
+    video_id = extract_youtube_id(link)
+    transcript = process_transcript_and_screenshots(video_id)
     formatted_transcript = []
     screenshot_paths = []
         start_time = format_seconds_to_time(entry['start'])
         end_time = format_seconds_to_time(entry['start'] + entry['duration'])
         embed_url = get_embedded_youtube_link(video_id, entry['start'])
+        screenshot_path = entry['img_src']
         line = {
             "start_time": start_time,
             "end_time": end_time,
     print(html_content)
     print("=====html_content=====")
+    # 基于逐字稿生成其他所需的输出
+    questions = generate_questions(transcript)
+    df_summarise = generate_df_summarise(transcript)
     # 确保返回与 UI 组件预期匹配的输出
     return questions[0] if len(questions) > 0 else "", \
             questions[1] if len(questions) > 1 else "", \