youngtsai committed on
Commit
5ec7e82
·
1 Parent(s): 6cdebb1

gemini to transcription

Browse files
Files changed (1) hide show
  1. app.py +148 -25
app.py CHANGED
@@ -370,6 +370,125 @@ def get_transcript_by_yt_api(video_id):
370
  continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
371
  return None # 所有嘗試都失敗,返回None
372
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  def generate_transcription_by_whisper(video_id):
374
  youtube_url = f'https://www.youtube.com/watch?v={video_id}'
375
  codec_name = "mp3"
@@ -447,35 +566,43 @@ def process_transcript_and_screenshots_on_gcs(video_id):
447
  print("====process_transcript_and_screenshots_on_gcs====")
448
  transcript, exists = get_transcript_from_gcs(video_id)
449
  if not exists:
450
- print("Transcript file does not exist, creating new transcript...")
451
- transcript = generate_transcription_by_whisper(video_id)
 
 
 
 
452
  upload_transcript_to_gcs(video_id, transcript)
453
 
454
  # 處理截圖
455
  is_new_transcript = False
 
456
  for entry in transcript:
457
  if 'img_file_id' not in entry:
458
  # 檢查 OUTPUT_PATH 是否存在 video_id.mp4
459
  video_path = f'{OUTPUT_PATH}/{video_id}.mp4'
460
- if not os.path.exists(video_path):
461
- # try 5 times 如果都失敗就 raise
462
- for i in range(5):
463
- try:
464
- download_youtube_video(video_id)
465
- break
466
- except Exception as e:
467
- if i == 4:
468
- raise gr.Error(f"下载视频失败: {str(e)}")
469
- time.sleep(5)
470
- try:
471
- screenshot_path = screenshot_youtube_video(video_id, entry['start'])
472
- screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
473
- img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
474
- entry['img_file_id'] = img_file_id
475
- print(f"截图已上传到GCS: {img_file_id}")
 
 
 
 
 
476
  is_new_transcript = True
477
- except Exception as e:
478
- print(f"Error processing screenshot: {str(e)}")
479
 
480
  if is_new_transcript:
481
  print("===更新逐字稿文件===")
@@ -3090,7 +3217,6 @@ Hi,我是【飛特音速】,說話比較快,但有什麼問題都可以問
3090
  latex_delimiters = [{"left": "$", "right": "$", "display": False}]
3091
  streaming_ai_chatbot = gr.Chatbot(
3092
  show_share_button=False,
3093
- likeable=True,
3094
  latex_delimiters=latex_delimiters,
3095
  show_copy_button=True,
3096
  )
@@ -3233,9 +3359,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3233
  chatbot=streaming_ai_chatbot,
3234
  additional_inputs=additional_inputs,
3235
  submit_btn="送出",
3236
- retry_btn=None,
3237
- undo_btn="⏪ 上一步",
3238
- clear_btn="🗑️ 清除全部",
3239
  stop_btn=None,
3240
  description=streaming_chat_greeting
3241
  )
@@ -3251,7 +3374,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3251
  """,
3252
  ]]
3253
  with gr.Row():
3254
- ai_chatbot = gr.Chatbot(label="ai_chatbot", show_share_button=False, likeable=True, show_label=False, latex_delimiters=latex_delimiters, value=ai_chatbot_greeting)
3255
  with gr.Row():
3256
  with gr.Accordion("你也有類似的問題想問嗎? 請按下 ◀︎", open=False) as ask_questions_accordion_2:
3257
  ai_chatbot_question_1 = gr.Button("問題一")
 
370
  continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
371
  return None # 所有嘗試都失敗,返回None
372
 
373
def generate_transcription_by_gemini(video_id):
    """Generate a timestamped transcript for a YouTube video with Google Gemini.

    The video is handed to the model by URL together with a prompt asking for
    a transcript with timestamps (spoken segments only). The raw text answer
    is then passed to ``convert_transcription_to_json`` to obtain structured
    entries.

    Failures are raised instead of being returned as ``None`` so that a
    caller wrapping this call in ``try/except`` (e.g. to fall back to another
    transcription backend) actually reaches its fallback, and so that ``None``
    is never passed on as if it were a transcript.

    Args:
        video_id (str): YouTube video ID.

    Returns:
        list: Non-empty list of transcript entries as produced by
            ``convert_transcription_to_json``.

    Raises:
        RuntimeError: If the model call fails, the response holds no text, or
            the text cannot be converted to the structured format.
    """
    # Build the YouTube watch URL the model will read the video from.
    video_url = f"https://www.youtube.com/watch?v={video_id}"

    # Gemini model used for the transcription request.
    model = vertexai.generative_models.GenerativeModel("gemini-2.0-flash-exp")

    # Wrap the video URL as a content part for the request.
    video_part = Part.from_uri(
        uri=video_url,
        mime_type="video/*"
    )

    # Prompt: ask for a timestamped transcript covering only spoken segments.
    prompt = "給我包含時間軸的逐字稿,只需要給我有講話的時間軸跟內容,其他時間軸不需要"

    try:
        response = model.generate_content(
            contents=[video_part, prompt],
            generation_config=vertexai.generative_models.GenerationConfig(
                temperature=1.0,
                top_p=0.95,
                max_output_tokens=8192,
                candidate_count=1
            ),
            stream=False
        )
        # Indexing raises if the response has no candidates/parts; that is
        # treated as a generation failure below.
        original_transcription = response.candidates[0].content.parts[0].text
    except Exception as e:
        print(f"生成逐字稿時發生錯誤:{str(e)}")
        raise RuntimeError(f"生成逐字稿時發生錯誤:{str(e)}") from e

    print("===original_transcription===")
    print(original_transcription)
    print("===original_transcription===")

    # Convert the free-form transcript text into the structured entry list.
    transcript_json = convert_transcription_to_json(original_transcription)

    # The converter signals failure with None; an empty list is equally
    # unusable as a transcript, so both are reported as errors.
    if not transcript_json:
        print("生成逐字稿時發生錯誤:無法轉換逐字稿格式")
        raise RuntimeError("無法轉換逐字稿格式")

    return transcript_json
423
+
424
def convert_transcription_to_json(original_transcription):
    """Convert a raw timestamped transcript into a list of structured entries.

    The raw text is sent to a Gemini model with instructions to return a JSON
    array. The answer is stripped of Markdown code fences, parsed, and
    validated before being returned.

    Args:
        original_transcription (str): Raw transcript text.

    Returns:
        list | None: List of dicts, each containing the keys ``text``,
            ``start``, ``end`` and ``duration``; ``None`` if the model call,
            JSON parsing, or validation fails (the error is printed).
    """
    required_keys = ("text", "start", "end", "duration")

    # Use a Vertex AI Gemini model to perform the conversion.
    model = vertexai.generative_models.GenerativeModel("gemini-2.0-flash-exp")

    prompt = f"""
    請將以下逐字稿轉換成 JSON 格式:
    {original_transcription}

    轉換規則:
    1. 每個段落需包含 text, start, end, duration
    2. 時間格式需轉換為秒數(例如 1:02 轉為 62 秒)
    3. duration 為 end - start 的差值
    4. 回傳格式為 JSON array

    範例輸出格式:
    [
    {{
    "text": "在一片無人的森林裡",
    "start": 1,
    "end": 2,
    "duration": 1
    }},
    {{
    "text": "你撿到一張羊皮紙",
    "start": 2,
    "end": 4,
    "duration": 2
    }}
    ]

    請直接返回 JSON 格式,不要加入任何說明文字。
    """

    try:
        response = model.generate_content(prompt)
        json_str = response.text

        print("===json_str===")
        print(json_str)
        print("===json_str===")

        # Strip Markdown code fences the model may wrap around the JSON.
        json_str = json_str.replace("```json", "").replace("```", "").strip()

        transcript_json = json.loads(json_str)

        # The payload must be a JSON array; iterating a dict or a string
        # would otherwise walk keys/characters and give misleading results.
        if not isinstance(transcript_json, list):
            raise ValueError("JSON 格式錯誤:預期為 JSON array")

        # Every entry must be an object with all required keys. The dict
        # check matters: `k in entry` on a string is a substring test and
        # could let non-object entries slip through.
        for entry in transcript_json:
            if not isinstance(entry, dict) or not all(k in entry for k in required_keys):
                raise ValueError("JSON 格式錯誤:缺少必要欄位")

        return transcript_json

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"轉換逐字稿時發生錯誤:{str(e)}")
        return None
491
+
492
  def generate_transcription_by_whisper(video_id):
493
  youtube_url = f'https://www.youtube.com/watch?v={video_id}'
494
  codec_name = "mp3"
 
566
  print("====process_transcript_and_screenshots_on_gcs====")
567
  transcript, exists = get_transcript_from_gcs(video_id)
568
  if not exists:
569
+ try:
570
+ transcript = generate_transcription_by_gemini(video_id)
571
+ except Exception as e:
572
+ print(f"generate_transcription_by_gemini Error generating transcription: {str(e)}")
573
+ transcript = generate_transcription_by_whisper(video_id)
574
+
575
  upload_transcript_to_gcs(video_id, transcript)
576
 
577
  # 處理截圖
578
  is_new_transcript = False
579
+ has_tried_download_video = False
580
  for entry in transcript:
581
  if 'img_file_id' not in entry:
582
  # 檢查 OUTPUT_PATH 是否存在 video_id.mp4
583
  video_path = f'{OUTPUT_PATH}/{video_id}.mp4'
584
+ # 沒有影片或是沒有下載過
585
+ if not os.path.exists(video_path) or not has_tried_download_video:
586
+ try:
587
+ download_youtube_video(video_id)
588
+ except Exception as e:
589
+ has_tried_download_video = True
590
+ print(f"下载视频失败: {str(e)}")
591
+
592
+ if os.path.exists(video_path):
593
+ try:
594
+ screenshot_path = screenshot_youtube_video(video_id, entry['start'])
595
+ screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
596
+ img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
597
+ entry['img_file_id'] = img_file_id
598
+ print(f"截图已上传到GCS: {img_file_id}")
599
+ is_new_transcript = True
600
+ except Exception as e:
601
+ print(f"Error processing screenshot: {str(e)}")
602
+ else:
603
+ entry['img_file_id'] = ""
604
+ print(f"截圖空白")
605
  is_new_transcript = True
 
 
606
 
607
  if is_new_transcript:
608
  print("===更新逐字稿文件===")
 
3217
  latex_delimiters = [{"left": "$", "right": "$", "display": False}]
3218
  streaming_ai_chatbot = gr.Chatbot(
3219
  show_share_button=False,
 
3220
  latex_delimiters=latex_delimiters,
3221
  show_copy_button=True,
3222
  )
 
3359
  chatbot=streaming_ai_chatbot,
3360
  additional_inputs=additional_inputs,
3361
  submit_btn="送出",
 
 
 
3362
  stop_btn=None,
3363
  description=streaming_chat_greeting
3364
  )
 
3374
  """,
3375
  ]]
3376
  with gr.Row():
3377
+ ai_chatbot = gr.Chatbot(label="ai_chatbot", show_share_button=False, show_label=False, latex_delimiters=latex_delimiters, value=ai_chatbot_greeting)
3378
  with gr.Row():
3379
  with gr.Accordion("你也有類似的問題想問嗎? 請按下 ◀︎", open=False) as ask_questions_accordion_2:
3380
  ai_chatbot_question_1 = gr.Button("問題一")