Spaces:
Running
Running
gemini to transcription
Browse files
app.py
CHANGED
@@ -370,6 +370,125 @@ def get_transcript_by_yt_api(video_id):
|
|
370 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
371 |
return None # 所有嘗試都失敗,返回None
|
372 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
373 |
def generate_transcription_by_whisper(video_id):
|
374 |
youtube_url = f'https://www.youtube.com/watch?v={video_id}'
|
375 |
codec_name = "mp3"
|
@@ -447,35 +566,43 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
447 |
print("====process_transcript_and_screenshots_on_gcs====")
|
448 |
transcript, exists = get_transcript_from_gcs(video_id)
|
449 |
if not exists:
|
450 |
-
|
451 |
-
|
|
|
|
|
|
|
|
|
452 |
upload_transcript_to_gcs(video_id, transcript)
|
453 |
|
454 |
# 處理截圖
|
455 |
is_new_transcript = False
|
|
|
456 |
for entry in transcript:
|
457 |
if 'img_file_id' not in entry:
|
458 |
# 檢查 OUTPUT_PATH 是否存在 video_id.mp4
|
459 |
video_path = f'{OUTPUT_PATH}/{video_id}.mp4'
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
|
|
|
|
|
|
|
|
|
|
476 |
is_new_transcript = True
|
477 |
-
except Exception as e:
|
478 |
-
print(f"Error processing screenshot: {str(e)}")
|
479 |
|
480 |
if is_new_transcript:
|
481 |
print("===更新逐字稿文件===")
|
@@ -3090,7 +3217,6 @@ Hi,我是【飛特音速】,說話比較快,但有什麼問題都可以問
|
|
3090 |
latex_delimiters = [{"left": "$", "right": "$", "display": False}]
|
3091 |
streaming_ai_chatbot = gr.Chatbot(
|
3092 |
show_share_button=False,
|
3093 |
-
likeable=True,
|
3094 |
latex_delimiters=latex_delimiters,
|
3095 |
show_copy_button=True,
|
3096 |
)
|
@@ -3233,9 +3359,6 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3233 |
chatbot=streaming_ai_chatbot,
|
3234 |
additional_inputs=additional_inputs,
|
3235 |
submit_btn="送出",
|
3236 |
-
retry_btn=None,
|
3237 |
-
undo_btn="⏪ 上一步",
|
3238 |
-
clear_btn="🗑️ 清除全部",
|
3239 |
stop_btn=None,
|
3240 |
description=streaming_chat_greeting
|
3241 |
)
|
@@ -3251,7 +3374,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
|
|
3251 |
""",
|
3252 |
]]
|
3253 |
with gr.Row():
|
3254 |
-
ai_chatbot = gr.Chatbot(label="ai_chatbot", show_share_button=False,
|
3255 |
with gr.Row():
|
3256 |
with gr.Accordion("你也有類似的問題想問嗎? 請按下 ◀︎", open=False) as ask_questions_accordion_2:
|
3257 |
ai_chatbot_question_1 = gr.Button("問題一")
|
|
|
370 |
continue # 當前語言的字幕沒有找到,繼續嘗試下一個語言
|
371 |
return None # 所有嘗試都失敗,返回None
|
372 |
|
373 |
+
def generate_transcription_by_gemini(video_id):
    """Produce a timestamped transcript for a YouTube video using Gemini.

    The video's watch URL is given to the model as a video part together
    with a prompt asking for a timestamped transcript of the spoken
    segments. The text of the reply is printed and then passed to
    ``convert_transcription_to_json``.

    Args:
        video_id: YouTube video id, interpolated into the watch URL.

    Returns:
        The truthy value returned by ``convert_transcription_to_json``, or
        ``None`` when generation or conversion fails.

    NOTE(review): every exception raised inside the ``try`` below is printed
    and converted into ``None``, so callers have to test the return value;
    no exception from the generation step reaches them.
    """
    gen_models = vertexai.generative_models
    gemini = gen_models.GenerativeModel("gemini-2.0-flash-exp")

    # The model receives the YouTube watch URL itself as the video input.
    youtube_part = Part.from_uri(
        uri=f"https://www.youtube.com/watch?v={video_id}",
        mime_type="video/*",
    )

    # Prompt: timestamped transcript, spoken segments only.
    instruction = "給我包含時間軸的逐字稿,只需要給我有講話的時間軸跟內容,其他時間軸不需要"

    try:
        sampling = gen_models.GenerationConfig(
            temperature=1.0,
            top_p=0.95,
            max_output_tokens=8192,
            candidate_count=1,
        )
        reply = gemini.generate_content(
            contents=[youtube_part, instruction],
            generation_config=sampling,
            stream=False,
        )

        # Only the first part of the first candidate is read.
        raw_text = reply.candidates[0].content.parts[0].text

        banner = "===original_transcription==="
        print(banner)
        print(raw_text)
        print(banner)

        # Turn the free-form transcript into the structured segment list.
        segments = convert_transcription_to_json(raw_text)
        if not segments:
            raise Exception("無法轉換逐字稿格式")
        return segments

    except Exception as err:
        print(f"生成逐字稿時發生錯誤:{str(err)}")
        return None
|
423 |
+
|
424 |
+
def convert_transcription_to_json(original_transcription):
    """Convert a raw timestamped transcript into a list of segment dicts.

    The raw transcript is embedded in a prompt that asks the Gemini model to
    rewrite it as a JSON array; the reply is then parsed and validated by
    ``_parse_transcript_json``.

    Args:
        original_transcription (str): Raw transcript text.

    Returns:
        list | None: A list of dicts, each containing ``text``, ``start``,
        ``end`` and ``duration``, or ``None`` if the model call, the JSON
        parsing, or the validation fails (the error is printed).
    """
    model = vertexai.generative_models.GenerativeModel("gemini-2.0-flash-exp")

    # The prompt body is kept flush-left so that no source indentation ends
    # up in the text sent to the model.
    prompt = f"""
請將以下逐字稿轉換成 JSON 格式:
{original_transcription}

轉換規則:
1. 每個段落需包含 text, start, end, duration
2. 時間格式需轉換為秒數(例如 1:02 轉為 62 秒)
3. duration 為 end - start 的差值
4. 回傳格式為 JSON array

範例輸出格式:
[
{{
"text": "在一片無人的森林裡",
"start": 1,
"end": 2,
"duration": 1
}},
{{
"text": "你撿到一張羊皮紙",
"start": 2,
"end": 4,
"duration": 2
}}
]

請直接返回 JSON 格式,不要加入任何說明文字。
"""

    try:
        response = model.generate_content(prompt)
        json_str = response.text

        print("===json_str===")
        print(json_str)
        print("===json_str===")

        return _parse_transcript_json(json_str)

    except Exception as e:
        # Best-effort conversion: report the problem and signal failure
        # with None instead of propagating.
        print(f"轉換逐字稿時發生錯誤:{str(e)}")
        return None


def _parse_transcript_json(raw_reply):
    """Extract and validate the JSON array of segments from a model reply.

    Args:
        raw_reply (str): Text returned by the model, possibly wrapped in
            markdown code fences or surrounded by extra text.

    Returns:
        list: The parsed list of segment dicts.

    Raises:
        ValueError: If the reply is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError`` subclass), is not a JSON array, or an entry
            is not an object with all required keys.
    """
    required_keys = ("text", "start", "end", "duration")

    # Remove markdown code-fence markers.
    cleaned = raw_reply.replace("```json", "").replace("```", "").strip()

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError:
        # If the reply carries text around the array, retry on the
        # outermost [...] span; re-raise when no such span exists.
        first = cleaned.find("[")
        last = cleaned.rfind("]")
        if first == -1 or last <= first:
            raise
        parsed = json.loads(cleaned[first:last + 1])

    # Reject non-array replies explicitly rather than failing later with an
    # unrelated error while iterating.
    if not isinstance(parsed, list):
        raise ValueError("JSON 格式錯誤:最外層必須是 array")

    for entry in parsed:
        if not isinstance(entry, dict) or not all(k in entry for k in required_keys):
            raise ValueError("JSON 格式錯誤:缺少必要欄位")

    return parsed
|
491 |
+
|
492 |
def generate_transcription_by_whisper(video_id):
|
493 |
youtube_url = f'https://www.youtube.com/watch?v={video_id}'
|
494 |
codec_name = "mp3"
|
|
|
566 |
print("====process_transcript_and_screenshots_on_gcs====")
|
567 |
transcript, exists = get_transcript_from_gcs(video_id)
|
568 |
if not exists:
|
569 |
+
try:
|
570 |
+
transcript = generate_transcription_by_gemini(video_id)
|
571 |
+
except Exception as e:
|
572 |
+
print(f"generate_transcription_by_gemini Error generating transcription: {str(e)}")
|
573 |
+
transcript = generate_transcription_by_whisper(video_id)
|
574 |
+
|
575 |
upload_transcript_to_gcs(video_id, transcript)
|
576 |
|
577 |
# 處理截圖
|
578 |
is_new_transcript = False
|
579 |
+
has_tried_download_video = False
|
580 |
for entry in transcript:
|
581 |
if 'img_file_id' not in entry:
|
582 |
# 檢查 OUTPUT_PATH 是否存在 video_id.mp4
|
583 |
video_path = f'{OUTPUT_PATH}/{video_id}.mp4'
|
584 |
+
# 沒有影片或是沒有下載過
|
585 |
+
if not os.path.exists(video_path) or not has_tried_download_video:
|
586 |
+
try:
|
587 |
+
download_youtube_video(video_id)
|
588 |
+
except Exception as e:
|
589 |
+
has_tried_download_video = True
|
590 |
+
print(f"下载视频失败: {str(e)}")
|
591 |
+
|
592 |
+
if os.path.exists(video_path):
|
593 |
+
try:
|
594 |
+
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
595 |
+
screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
|
596 |
+
img_file_id = GCS_SERVICE.upload_image_and_get_public_url('video_ai_assistant', screenshot_blob_name, screenshot_path)
|
597 |
+
entry['img_file_id'] = img_file_id
|
598 |
+
print(f"截图已上传到GCS: {img_file_id}")
|
599 |
+
is_new_transcript = True
|
600 |
+
except Exception as e:
|
601 |
+
print(f"Error processing screenshot: {str(e)}")
|
602 |
+
else:
|
603 |
+
entry['img_file_id'] = ""
|
604 |
+
print(f"截圖空白")
|
605 |
is_new_transcript = True
|
|
|
|
|
606 |
|
607 |
if is_new_transcript:
|
608 |
print("===更新逐字稿文件===")
|
|
|
3217 |
latex_delimiters = [{"left": "$", "right": "$", "display": False}]
|
3218 |
streaming_ai_chatbot = gr.Chatbot(
|
3219 |
show_share_button=False,
|
|
|
3220 |
latex_delimiters=latex_delimiters,
|
3221 |
show_copy_button=True,
|
3222 |
)
|
|
|
3359 |
chatbot=streaming_ai_chatbot,
|
3360 |
additional_inputs=additional_inputs,
|
3361 |
submit_btn="送出",
|
|
|
|
|
|
|
3362 |
stop_btn=None,
|
3363 |
description=streaming_chat_greeting
|
3364 |
)
|
|
|
3374 |
""",
|
3375 |
]]
|
3376 |
with gr.Row():
|
3377 |
+
ai_chatbot = gr.Chatbot(label="ai_chatbot", show_share_button=False, show_label=False, latex_delimiters=latex_delimiters, value=ai_chatbot_greeting)
|
3378 |
with gr.Row():
|
3379 |
with gr.Accordion("你也有類似的問題想問嗎? 請按下 ◀︎", open=False) as ask_questions_accordion_2:
|
3380 |
ai_chatbot_question_1 = gr.Button("問題一")
|