Spaces:

JunyiAcademy
/

vaitor2

Running

App Files Files Community

youngtsai committed on Dec 17, 2024

Commit

5ec7e82

1 Parent(s): 6cdebb1

gemini to transcription

Browse files

Files changed (1) hide show

app.py +148 -25

app.py CHANGED Viewed

@@ -370,6 +370,125 @@ def get_transcript_by_yt_api(video_id):
             continue  # 當前語言的字幕沒有找到，繼續嘗試下一個語言
     return None  # 所有嘗試都失敗，返回None
 def generate_transcription_by_whisper(video_id):
     youtube_url = f'https://www.youtube.com/watch?v={video_id}'
     codec_name = "mp3"
@@ -447,35 +566,43 @@ def process_transcript_and_screenshots_on_gcs(video_id):
     print("====process_transcript_and_screenshots_on_gcs====")
     transcript, exists = get_transcript_from_gcs(video_id)
     if not exists:
-        print("Transcript file does not exist, creating new transcript...")
-        transcript = generate_transcription_by_whisper(video_id)
         upload_transcript_to_gcs(video_id, transcript)
     # 處理截圖
     is_new_transcript = False
     for entry in transcript:
         if 'img_file_id' not in entry:
             # 檢查 OUTPUT_PATH 是否存在 video_id.mp4
             video_path = f'{OUTPUT_PATH}/{video_id}.mp4'
-            if not os.path.exists(video_path):
-                # try 5 times 如果都失敗就 raise
-                for i in range(5):
-                    try:
-                        download_youtube_video(video_id)
-                        break
-                    except Exception as e:
-                        if i == 4:
-                            raise gr.Error(f"下载视频失败: {str(e)}")
-                        time.sleep(5)
-            try:
-                screenshot_path = screenshot_youtube_video(video_id, entry['start'])
-                screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
-                img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
-                entry['img_file_id'] = img_file_id
-                print(f"截图已上传到GCS: {img_file_id}")
                 is_new_transcript = True
-            except Exception as e:
-                print(f"Error processing screenshot: {str(e)}")
     if is_new_transcript:
         print("===更新逐字稿文件===")
@@ -3090,7 +3217,6 @@ Hi，我是【飛特音速】，說話比較快，但有什麼問題都可以問
 latex_delimiters = [{"left": "$", "right": "$", "display": False}]
 streaming_ai_chatbot = gr.Chatbot(
     show_share_button=False,
-    likeable=True,
     latex_delimiters=latex_delimiters,
     show_copy_button=True,
 )
@@ -3233,9 +3359,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
                     chatbot=streaming_ai_chatbot,
                     additional_inputs=additional_inputs,
                     submit_btn="送出",
-                    retry_btn=None,
-                    undo_btn="⏪ 上一步",
-                    clear_btn="🗑️ 清除全部",
                     stop_btn=None,
                     description=streaming_chat_greeting
                 )
@@ -3251,7 +3374,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
                     """,
                 ]]
                 with gr.Row():
-                    ai_chatbot = gr.Chatbot(label="ai_chatbot", show_share_button=False, likeable=True, show_label=False, latex_delimiters=latex_delimiters, value=ai_chatbot_greeting)
                 with gr.Row():
                     with gr.Accordion("你也有類似的問題想問嗎？ 請按下 ◀︎", open=False) as ask_questions_accordion_2:
                         ai_chatbot_question_1 = gr.Button("問題一")

             continue  # 當前語言的字幕沒有找到，繼續嘗試下一個語言
     return None  # 所有嘗試都失敗，返回None
+def generate_transcription_by_gemini(video_id):
+    """使用 Google Gemini 生成影片逐字稿"""
+    # 準備 YouTube 影片 URL
+    video_url = f"https://www.youtube.com/watch?v={video_id}"
+    # 初始化 Gemini Pro Vision 模型
+    model = vertexai.generative_models.GenerativeModel("gemini-2.0-flash-exp")
+    # 建立影片部分
+    video_part = Part.from_uri(
+        uri=video_url,
+        mime_type="video/*"
+    )
+    # 設定提示詞
+    prompt = "給我包含時間軸的逐字稿，只需要給我有講話的時間軸跟內容，其他時間軸不需要"
+    # 生成逐字稿
+    original_transcription = ""
+    try:
+        response = model.generate_content(
+            contents=[video_part, prompt],
+            generation_config=vertexai.generative_models.GenerationConfig(
+                temperature=1.0,
+                top_p=0.95,
+                max_output_tokens=8192,
+                candidate_count=1
+            ),
+            stream=False
+        )
+        original_transcription = response.candidates[0].content.parts[0].text
+        print("===original_transcription===")
+        print(original_transcription)
+        print("===original_transcription===")
+        # 轉換成 JSON 格式
+        transcript_json = convert_transcription_to_json(original_transcription)
+        if transcript_json:
+            return transcript_json
+        else:
+            raise Exception("無法轉換逐字稿格式")
+    except Exception as e:
+        print(f"生成逐字稿時發生錯誤：{str(e)}")
+        return None
+def convert_transcription_to_json(original_transcription):
+    """
+    將原始逐字稿轉換成指定的 JSON 格式
+    Args:
+        original_transcription (str): 原始逐字稿文本
+    Returns:
+        list: 包含逐字稿段落的列表，每個段落包含 text, start, end, duration
+    """
+    # 使用 Vertex AI 來處理轉換
+    model = vertexai.generative_models.GenerativeModel("gemini-2.0-flash-exp")
+    prompt = f"""
+    請將以下逐字稿轉換成 JSON 格式:
+    {original_transcription}
+    轉換規則:
+    1. 每個段落需包含 text, start, end, duration
+    2. 時間格式需轉換為秒數(例如 1:02 轉為 62 秒)
+    3. duration 為 end - start 的差值
+    4. 回傳格式為 JSON array
+    範例輸出格式:
+    [
+        {{
+            "text": "在一片無人的森林裡",
+            "start": 1,
+            "end": 2,
+            "duration": 1
+        }},
+        {{
+            "text": "你撿到一張羊皮紙",
+            "start": 2,
+            "end": 4,
+            "duration": 2
+        }}
+    ]
+    請直接返回 JSON 格式，不要加入任何說明文字。
+    """
+    try:
+        response = model.generate_content(prompt)
+        json_str = response.text
+        print("===json_str===")
+        print(json_str)
+        print("===json_str===")
+        # 移除可能的 markdown 標記
+        json_str = json_str.replace("```json", "").replace("```", "").strip()
+        # 解析 JSON
+        transcript_json = json.loads(json_str)
+        # 驗證格式
+        for entry in transcript_json:
+            if not all(k in entry for k in ["text", "start", "end", "duration"]):
+                raise ValueError("JSON 格式錯誤：缺少必要欄位")
+        return transcript_json
+    except Exception as e:
+        print(f"轉換逐字稿時發生錯誤：{str(e)}")
+        return None
 def generate_transcription_by_whisper(video_id):
     youtube_url = f'https://www.youtube.com/watch?v={video_id}'
     codec_name = "mp3"
     print("====process_transcript_and_screenshots_on_gcs====")
     transcript, exists = get_transcript_from_gcs(video_id)
     if not exists:
+        try:
+            transcript = generate_transcription_by_gemini(video_id)
+        except Exception as e:
+            print(f"generate_transcription_by_gemini Error generating transcription: {str(e)}")
+            transcript = generate_transcription_by_whisper(video_id)
         upload_transcript_to_gcs(video_id, transcript)
     # 處理截圖
     is_new_transcript = False
+    has_tried_download_video = False
     for entry in transcript:
         if 'img_file_id' not in entry:
             # 檢查 OUTPUT_PATH 是否存在 video_id.mp4
             video_path = f'{OUTPUT_PATH}/{video_id}.mp4'
+            # 沒有影片或是沒有下載過
+            if not os.path.exists(video_path) or not has_tried_download_video:
+                try:
+                    download_youtube_video(video_id)
+                except Exception as e:
+                    has_tried_download_video = True
+                    print(f"下载视频失败: {str(e)}")
+            if os.path.exists(video_path):
+                try:
+                    screenshot_path = screenshot_youtube_video(video_id, entry['start'])
+                    screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
+                    img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
+                    entry['img_file_id'] = img_file_id
+                    print(f"截图已上传到GCS: {img_file_id}")
+                    is_new_transcript = True
+                except Exception as e:
+                    print(f"Error processing screenshot: {str(e)}")
+            else:
+                entry['img_file_id'] = ""
+                print(f"截圖空白")
                 is_new_transcript = True
     if is_new_transcript:
         print("===更新逐字稿文件===")
 latex_delimiters = [{"left": "$", "right": "$", "display": False}]
 streaming_ai_chatbot = gr.Chatbot(
     show_share_button=False,
     latex_delimiters=latex_delimiters,
     show_copy_button=True,
 )
                     chatbot=streaming_ai_chatbot,
                     additional_inputs=additional_inputs,
                     submit_btn="送出",
                     stop_btn=None,
                     description=streaming_chat_greeting
                 )
                     """,
                 ]]
                 with gr.Row():
+                    ai_chatbot = gr.Chatbot(label="ai_chatbot", show_share_button=False, show_label=False, latex_delimiters=latex_delimiters, value=ai_chatbot_greeting)
                 with gr.Row():
                     with gr.Accordion("你也有類似的問題想問嗎？ 請按下 ◀︎", open=False) as ask_questions_accordion_2:
                         ai_chatbot_question_1 = gr.Button("問題一")