youngtsai committed on
Commit
0af8b29
·
1 Parent(s): 5324cd6
Files changed (1) hide show
  1. app.py +120 -154
app.py CHANGED
@@ -503,7 +503,7 @@ def upload_transcript_to_gcs(video_id, transcript):
503
  GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
504
  print("Transcript uploaded successfully.")
505
 
506
- def process_youtube_link(password, link):
507
  verify_password(password)
508
  video_id = extract_youtube_id(link)
509
 
@@ -545,21 +545,21 @@ def process_youtube_link(password, link):
545
 
546
  # 基于逐字稿生成其他所需的输出
547
  source = "gcs"
548
- questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
549
  questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
550
- summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source)
551
  summary_text = summary_json["summary"]
552
  summary = summary_json["summary"]
553
- key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source)
554
  key_moments = key_moments_json["key_moments"]
555
  key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
556
  key_moments_html = get_key_moments_html(key_moments)
557
  html_content = format_transcript_to_html(formatted_transcript)
558
  simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
559
- mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
560
  mind_map = mind_map_json["mind_map"]
561
  mind_map_html = get_mind_map_html(mind_map)
562
- reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source)
563
  reading_passage_text = reading_passage_json["reading_passage"]
564
  reading_passage = reading_passage_json["reading_passage"]
565
  meta_data = get_meta_data(video_id)
@@ -703,70 +703,75 @@ def split_data(df_string, word_base=100000):
703
 
704
  return segments
705
 
706
def generate_content_by_LLM(sys_content, user_content, response_format=None):
    """Generate text from a system/user prompt pair, trying OpenAI first.

    The prompts are sent to OpenAI (``gpt-4-turbo``). If that attempt raises
    for any reason, the error is printed and the same prompts are sent to
    Claude 3 Sonnet through ``BEDROCK_CLIENT`` instead.

    Args:
        sys_content: System prompt.
        user_content: User prompt.
        response_format: Optional OpenAI ``response_format`` value, e.g.
            ``{"type": "json_object"}``. It is forwarded to OpenAI only; the
            Bedrock fallback request does not include it.

    Returns:
        The generated text. The OpenAI result is stripped of surrounding
        whitespace; the Bedrock result is returned as received.

    Raises:
        Exception: Anything raised by the Bedrock fallback. Errors from the
            OpenAI attempt are printed and trigger the fallback instead of
            propagating.
    """
    try:
        request_payload = {
            "model": "gpt-4-turbo",
            "messages": [
                {"role": "system", "content": sys_content},
                {"role": "user", "content": user_content},
            ],
            "max_tokens": 4000,
        }

        # Add response_format only when the caller supplied one, so that a
        # default of None is omitted from the request instead of being sent
        # as an explicit null.
        if response_format is not None:
            request_payload["response_format"] = response_format

        response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
        content = response.choices[0].message.content.strip()
    except Exception as e:
        # This helper serves every kind of generated content, so the message
        # names the failing backend rather than one specific content type.
        print(f"Error generating content by OpenAI: {str(e)}")
        print("using REDROCK")
        request_body = {
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 4000,
            "system": sys_content,
            "messages": [
                {"role": "user", "content": user_content},
            ],
        }
        response = BEDROCK_CLIENT.invoke_model(
            modelId="anthropic.claude-3-sonnet-20240229-v1:0",
            contentType="application/json",
            accept="application/json",
            body=json.dumps(request_body),
        )
        response_body = json.loads(response.get('body').read())
        content = response_body.get('content')[0].get('text')

    print("=====content=====")
    print(content)
    print("=====content=====")

    return content
758
 
759
- def get_reading_passage(video_id, df_string, source):
760
  if source == "gcs":
761
  print("===get_reading_passage on gcs===")
762
- gcs_client = GCS_CLIENT
763
  bucket_name = 'video_ai_assistant'
764
  file_name = f'{video_id}_reading_passage_latex.json'
765
  blob_name = f"{video_id}/{file_name}"
766
  # 检查 reading_passage 是否存在
767
  is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
768
  if not is_file_exists:
769
- reading_passage = generate_reading_passage(df_string)
770
  reading_passage_json = {"reading_passage": str(reading_passage)}
771
  reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
772
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
@@ -799,7 +804,7 @@ def get_reading_passage(video_id, df_string, source):
799
 
800
  return reading_passage_json
801
 
802
- def generate_reading_passage(df_string):
803
  print("===generate_reading_passage===")
804
  segments = split_data(df_string, word_base=100000)
805
  all_content = []
@@ -818,7 +823,7 @@ def generate_reading_passage(df_string):
818
  加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
819
  請直接給出文章,不用介紹怎麼處理的或是文章字數等等
820
  """
821
- content = generate_content_by_LLM(sys_content, user_content)
822
  all_content.append(content + "\n")
823
 
824
  # 將所有生成的閱讀理解段落合併成一個完整的文章
@@ -831,7 +836,7 @@ def text_to_speech(video_id, text):
831
  tts.save(filename)
832
  return filename
833
 
834
- def get_mind_map(video_id, df_string, source):
835
  if source == "gcs":
836
  print("===get_mind_map on gcs===")
837
  gcs_client = GCS_CLIENT
@@ -841,7 +846,7 @@ def get_mind_map(video_id, df_string, source):
841
  # 检查檔案是否存在
842
  is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
843
  if not is_file_exists:
844
- mind_map = generate_mind_map(df_string)
845
  mind_map_json = {"mind_map": str(mind_map)}
846
  mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
847
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
@@ -862,7 +867,7 @@ def get_mind_map(video_id, df_string, source):
862
  # 检查檔案是否存在
863
  exists, file_id = check_file_exists(service, folder_id, file_name)
864
  if not exists:
865
- mind_map = generate_mind_map(df_string)
866
  mind_map_json = {"mind_map": str(mind_map)}
867
  mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
868
  upload_content_directly(service, file_name, folder_id, mind_map_text)
@@ -875,7 +880,7 @@ def get_mind_map(video_id, df_string, source):
875
 
876
  return mind_map_json
877
 
878
- def generate_mind_map(df_string):
879
  print("===generate_mind_map===")
880
  segments = split_data(df_string, word_base=100000)
881
  all_content = []
@@ -887,7 +892,7 @@ def generate_mind_map(df_string):
887
  注意:不需要前後文敘述,直接給出 markdown 文本即可
888
  這對我很重要
889
  """
890
- content = generate_content_by_LLM(sys_content, user_content)
891
  all_content.append(content + "\n")
892
 
893
  # 將所有生成的閱讀理解段落合併成一個完整的文章
@@ -906,10 +911,9 @@ def get_mind_map_html(mind_map):
906
  """
907
  return mind_map_html
908
 
909
- def get_video_id_summary(video_id, df_string, source):
910
  if source == "gcs":
911
  print("===get_video_id_summary on gcs===")
912
- gcs_client = GCS_CLIENT
913
  bucket_name = 'video_ai_assistant'
914
  file_name = f'{video_id}_summary_markdown.json'
915
  summary_file_blob_name = f"{video_id}/{file_name}"
@@ -917,7 +921,7 @@ def get_video_id_summary(video_id, df_string, source):
917
  is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
918
  if not is_summary_file_exists:
919
  meta_data = get_meta_data(video_id)
920
- summary = generate_summarise(df_string, meta_data)
921
  summary_json = {"summary": str(summary)}
922
  summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
923
  GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
@@ -939,7 +943,7 @@ def get_video_id_summary(video_id, df_string, source):
939
  exists, file_id = check_file_exists(service, folder_id, file_name)
940
  if not exists:
941
  meta_data = get_meta_data(video_id)
942
- summary = generate_summarise(df_string, meta_data)
943
  summary_json = {"summary": str(summary)}
944
  summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
945
 
@@ -960,7 +964,7 @@ def get_video_id_summary(video_id, df_string, source):
960
 
961
  return summary_json
962
 
963
- def generate_summarise(df_string, metadata=None):
964
  print("===generate_summarise===")
965
  # 使用 OpenAI 生成基于上传数据的问题
966
  if metadata:
@@ -1008,7 +1012,7 @@ def generate_summarise(df_string, metadata=None):
1008
  ## ❓ 延伸小問題
1009
  - (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
1010
  """
1011
- content = generate_content_by_LLM(sys_content, user_content)
1012
  all_content.append(content + "\n")
1013
 
1014
  if len(all_content) > 1:
@@ -1047,13 +1051,13 @@ def generate_summarise(df_string, metadata=None):
1047
  ## ❓ 延伸小問題
1048
  - ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
1049
  """
1050
- final_content = generate_content_by_LLM(sys_content, user_content)
1051
  else:
1052
  final_content = all_content[0]
1053
 
1054
  return final_content
1055
 
1056
- def get_questions(video_id, df_string, source="gcs"):
1057
  if source == "gcs":
1058
  # 去 gcs 確認是有有 video_id_questions.json
1059
  print("===get_questions on gcs===")
@@ -1064,7 +1068,7 @@ def get_questions(video_id, df_string, source="gcs"):
1064
  # 检查檔案是否存在
1065
  is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1066
  if not is_questions_exists:
1067
- questions = generate_questions(df_string)
1068
  questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
1069
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
1070
  print("questions已上傳到GCS")
@@ -1085,7 +1089,7 @@ def get_questions(video_id, df_string, source="gcs"):
1085
  # 检查檔案是否存在
1086
  exists, file_id = check_file_exists(service, folder_id, file_name)
1087
  if not exists:
1088
- questions = generate_questions(df_string)
1089
  questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
1090
  upload_content_directly(service, file_name, folder_id, questions_text)
1091
  print("questions已上傳到Google Drive")
@@ -1105,7 +1109,7 @@ def get_questions(video_id, df_string, source="gcs"):
1105
  print("=====get_questions=====")
1106
  return q1, q2, q3
1107
 
1108
- def generate_questions(df_string):
1109
  print("===generate_questions===")
1110
  # 使用 OpenAI 生成基于上传数据的问题
1111
  if isinstance(df_string, str):
@@ -1128,69 +1132,26 @@ def generate_questions(df_string):
1128
  [q1的敘述text, q2的敘述text, q3的敘述text]
1129
  }}
1130
  """
1131
-
1132
- try:
1133
- model = "gpt-4-turbo"
1134
- messages = [
1135
- {"role": "system", "content": sys_content},
1136
- {"role": "user", "content": user_content}
1137
- ]
1138
- response_format = { "type": "json_object" }
1139
-
1140
- print("=====messages=====")
1141
- print(messages)
1142
- print("=====messages=====")
1143
-
1144
-
1145
- request_payload = {
1146
- "model": model,
1147
- "messages": messages,
1148
- "max_tokens": 4000,
1149
- "response_format": response_format
1150
- }
1151
-
1152
- response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
1153
- questions = json.loads(response.choices[0].message.content)["questions"]
1154
- except:
1155
- messages = [
1156
- {"role": "user", "content": user_content}
1157
- ]
1158
- model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
1159
- # model_id = "anthropic.claude-3-haiku-20240307-v1:0"
1160
- kwargs = {
1161
- "modelId": model_id,
1162
- "contentType": "application/json",
1163
- "accept": "application/json",
1164
- "body": json.dumps({
1165
- "anthropic_version": "bedrock-2023-05-31",
1166
- "max_tokens": 4000,
1167
- "system": sys_content,
1168
- "messages": messages
1169
- })
1170
- }
1171
- response = BEDROCK_CLIENT.invoke_model(**kwargs)
1172
- response_body = json.loads(response.get('body').read())
1173
- response_completion = response_body.get('content')[0].get('text')
1174
- questions = json.loads(response_completion)["questions"]
1175
-
1176
  print("=====json_response=====")
1177
- print(questions)
1178
  print("=====json_response=====")
1179
 
1180
- return questions
1181
 
1182
- def get_questions_answers(video_id, df_string, source="gcs"):
1183
  if source == "gcs":
1184
  try:
1185
  print("===get_questions_answers on gcs===")
1186
- gcs_client = GCS_CLIENT
1187
  bucket_name = 'video_ai_assistant'
1188
  file_name = f'{video_id}_questions_answers.json'
1189
  blob_name = f"{video_id}/{file_name}"
1190
  # 检查檔案是否存在
1191
  is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1192
  if not is_questions_answers_exists:
1193
- questions_answers = generate_questions_answers(df_string)
1194
  questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
1195
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
1196
  print("questions_answers已上傳到GCS")
@@ -1201,12 +1162,12 @@ def get_questions_answers(video_id, df_string, source="gcs"):
1201
  questions_answers = json.loads(questions_answers_text)
1202
  except Exception as e:
1203
  print(f"Error getting questions_answers: {str(e)}")
1204
- questions = get_questions(video_id, df_string, source)
1205
- questions_answers = [{"question": q, "answer": ""} for q in questions]
1206
 
1207
  return questions_answers
1208
 
1209
- def generate_questions_answers(df_string):
1210
  print("===generate_questions_answers===")
1211
  segments = split_data(df_string, word_base=100000)
1212
  all_content = []
@@ -1232,7 +1193,7 @@ def generate_questions_answers(df_string):
1232
  }}
1233
  """
1234
  response_format = { "type": "json_object" }
1235
- content = generate_content_by_LLM(sys_content, user_content, response_format)
1236
  content_json = json.loads(content)["questions_answers"]
1237
  all_content += content_json
1238
 
@@ -1256,7 +1217,7 @@ def change_questions(password, df_string):
1256
  print("=====get_questions=====")
1257
  return q1, q2, q3
1258
 
1259
- def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source):
1260
  if source == "gcs":
1261
  print("===get_key_moments on gcs===")
1262
  gcs_client = GCS_CLIENT
@@ -1266,7 +1227,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1266
  # 检查檔案是否存在
1267
  is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1268
  if not is_key_moments_exists:
1269
- key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
1270
  key_moments_json = {"key_moments": key_moments}
1271
  key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1272
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
@@ -1282,7 +1243,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1282
  for key_moment in key_moments_json["key_moments"]:
1283
  if "keywords" not in key_moment:
1284
  transcript = key_moment["transcript"]
1285
- key_moment["keywords"] = generate_key_moments_keywords(transcript)
1286
  print("===keywords===")
1287
  print(key_moment["keywords"])
1288
  print("===keywords===")
@@ -1303,7 +1264,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1303
  # 检查檔案是否存在
1304
  exists, file_id = check_file_exists(service, folder_id, file_name)
1305
  if not exists:
1306
- key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
1307
  key_moments_json = {"key_moments": key_moments}
1308
  key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1309
  upload_content_directly(service, file_name, folder_id, key_moments_text)
@@ -1316,7 +1277,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
1316
 
1317
  return key_moments_json
1318
 
1319
- def generate_key_moments(formatted_simple_transcript, formatted_transcript):
1320
  print("===generate_key_moments===")
1321
  segments = split_data(formatted_simple_transcript, word_base=100000)
1322
  all_content = []
@@ -1343,7 +1304,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
1343
  }}
1344
  """
1345
  response_format = { "type": "json_object" }
1346
- content = generate_content_by_LLM(sys_content, user_content, response_format)
1347
  key_moments = json.loads(content)["key_moments"]
1348
 
1349
  # "transcript": get text from formatted_simple_transcript
@@ -1371,7 +1332,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
1371
 
1372
  return all_content
1373
 
1374
- def generate_key_moments_keywords(transcript):
1375
  print("===generate_key_moments_keywords===")
1376
  segments = split_data(transcript, word_base=100000)
1377
  all_content = []
@@ -1384,7 +1345,7 @@ def generate_key_moments_keywords(transcript):
1384
  不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
1385
  transcript:{segment}
1386
  """
1387
- content = generate_content_by_LLM(sys_content, user_content)
1388
  keywords = content.strip().split(",")
1389
  all_content += keywords
1390
 
@@ -1665,7 +1626,6 @@ def delete_LLM_content(video_id, kind):
1665
 
1666
  def update_LLM_content(video_id, new_content, kind):
1667
  print(f"===upfdate kind on gcs===")
1668
- gcs_client = GCS_CLIENT
1669
  bucket_name = 'video_ai_assistant'
1670
  file_name = f'{video_id}_{kind}.json'
1671
  blob_name = f"{video_id}/{file_name}"
@@ -1739,16 +1699,16 @@ def update_LLM_content(video_id, new_content, kind):
1739
  print(f"{kind} 已更新到GCS")
1740
  return gr.update(value=updated_content, interactive=False)
1741
 
1742
- def create_LLM_content(video_id, df_string, kind):
1743
  print(f"===create_{kind}===")
1744
  print(f"video_id: {video_id}")
1745
 
1746
  if kind == "reading_passage_latex":
1747
- content = generate_reading_passage(df_string)
1748
  update_LLM_content(video_id, content, kind)
1749
  elif kind == "summary_markdown":
1750
  meta_data = get_meta_data(video_id)
1751
- content = generate_summarise(df_string, meta_data)
1752
  update_LLM_content(video_id, content, kind)
1753
  elif kind == "mind_map":
1754
  content = generate_mind_map(df_string)
@@ -1760,7 +1720,7 @@ def create_LLM_content(video_id, df_string, kind):
1760
  transcript = df_string
1761
  formatted_simple_transcript = create_formatted_simple_transcript(transcript)
1762
  formatted_transcript = create_formatted_transcript(video_id, transcript)
1763
- gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript)
1764
  update_LLM_content(video_id, gen_content, kind)
1765
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1766
  elif kind == "transcript":
@@ -1768,7 +1728,7 @@ def create_LLM_content(video_id, df_string, kind):
1768
  update_LLM_content(video_id, gen_content, kind)
1769
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1770
  elif kind == "questions":
1771
- gen_content = generate_questions(df_string)
1772
  update_LLM_content(video_id, gen_content, kind)
1773
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1774
  elif kind == "questions_answers":
@@ -1777,7 +1737,7 @@ def create_LLM_content(video_id, df_string, kind):
1777
  else:
1778
  transcript = df_string
1779
  formatted_simple_transcript = create_formatted_simple_transcript(transcript)
1780
- gen_content = generate_questions_answers(formatted_simple_transcript)
1781
  update_LLM_content(video_id, gen_content, kind)
1782
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1783
 
@@ -2690,14 +2650,20 @@ HEAD = """
2690
 
2691
  with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
2692
  with gr.Row() as admin:
2693
- password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
2694
- youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
2695
- video_id = gr.Textbox(label="video_id", visible=True)
2696
- # file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
2697
- # web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
2698
- user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
2699
- youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
2700
- is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
 
 
 
 
 
 
2701
  with gr.Row() as data_state:
2702
  content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
2703
  content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
@@ -3170,7 +3136,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3170
  )
3171
 
3172
  # 当输入 YouTube 链接时触发
3173
- process_youtube_link_inputs = [password, youtube_link]
3174
  process_youtube_link_outputs = [
3175
  video_id,
3176
  questions_answers_json,
@@ -3251,7 +3217,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3251
  {
3252
  'button': transcript_create_button,
3253
  'action': create_LLM_content,
3254
- 'inputs': [video_id, df_string_output, transcript_kind],
3255
  'outputs': [df_string_output]
3256
  },
3257
  {
@@ -3282,7 +3248,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3282
  {
3283
  'button': reading_passage_create_button,
3284
  'action': create_LLM_content,
3285
- 'inputs': [video_id, df_string_output, reading_passage_kind],
3286
  'outputs': [reading_passage_text]
3287
  },
3288
  {
@@ -3313,7 +3279,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3313
  {
3314
  'button': summary_create_button,
3315
  'action': create_LLM_content,
3316
- 'inputs': [video_id, df_string_output, summary_kind],
3317
  'outputs': [summary_text]
3318
  },
3319
  {
@@ -3344,7 +3310,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3344
  {
3345
  'button': key_moments_create_button,
3346
  'action': create_LLM_content,
3347
- 'inputs': [video_id, df_string_output, key_moments_kind],
3348
  'outputs': [key_moments]
3349
  },
3350
  {
@@ -3375,7 +3341,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3375
  {
3376
  'button': questions_create_button,
3377
  'action': create_LLM_content,
3378
- 'inputs': [video_id, df_string_output, questions_kind],
3379
  'outputs': [questions_json]
3380
  },
3381
  {
@@ -3406,7 +3372,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3406
  {
3407
  'button': questions_answers_create_button,
3408
  'action': create_LLM_content,
3409
- 'inputs': [video_id, df_string_output, questions_answers_kind],
3410
  'outputs': [questions_answers_json]
3411
  },
3412
  {
@@ -3437,7 +3403,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
3437
  {
3438
  'button': worksheet_create_button,
3439
  'action': create_LLM_content,
3440
- 'inputs': [video_id, df_string_output, worksheet_kind],
3441
  'outputs': [worksheet_json]
3442
  },
3443
  {
 
503
  GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
504
  print("Transcript uploaded successfully.")
505
 
506
+ def process_youtube_link(password, link, LLM_model=None):
507
  verify_password(password)
508
  video_id = extract_youtube_id(link)
509
 
 
545
 
546
  # 基于逐字稿生成其他所需的输出
547
  source = "gcs"
548
+ questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source, LLM_model)
549
  questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
550
+ summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source, LLM_model)
551
  summary_text = summary_json["summary"]
552
  summary = summary_json["summary"]
553
+ key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model)
554
  key_moments = key_moments_json["key_moments"]
555
  key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
556
  key_moments_html = get_key_moments_html(key_moments)
557
  html_content = format_transcript_to_html(formatted_transcript)
558
  simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
559
+ mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source, LLM_model)
560
  mind_map = mind_map_json["mind_map"]
561
  mind_map_html = get_mind_map_html(mind_map)
562
+ reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source, LLM_model)
563
  reading_passage_text = reading_passage_json["reading_passage"]
564
  reading_passage = reading_passage_json["reading_passage"]
565
  meta_data = get_meta_data(video_id)
 
703
 
704
  return segments
705
 
706
def generate_content_by_open_ai(sys_content, user_content, response_format=None):
    """Send a system/user prompt pair to OpenAI and return the reply text.

    Args:
        sys_content: System prompt.
        user_content: User prompt.
        response_format: Optional ``response_format`` value for the chat
            completion request, e.g. ``{"type": "json_object"}``. It is left
            out of the request entirely when ``None``.

    Returns:
        The first choice's message content with surrounding whitespace
        stripped.

    Raises:
        ValueError: If the first choice carries no text content.
        Exception: Anything raised by the OpenAI client call itself.
    """
    print("LLM using OPEN AI")
    request_payload = {
        "model": "gpt-4-turbo",
        "messages": [
            {"role": "system", "content": sys_content},
            {"role": "user", "content": user_content},
        ],
        "max_tokens": 4000,
    }
    if response_format is not None:
        request_payload["response_format"] = response_format

    response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
    choice = response.choices[0]
    content = choice.message.content
    if content is None:
        # Without this guard the failure surfaces as an AttributeError on
        # None.strip(), which hides why no text came back.
        finish_reason = getattr(choice, "finish_reason", None)
        raise ValueError(
            f"OpenAI returned no text content (finish_reason={finish_reason!r})"
        )
    return content.strip()
 
 
 
725
 
726
def generate_content_by_bedrock(sys_content, user_content):
    """Send a system/user prompt pair to Claude 3 Sonnet via Bedrock.

    Args:
        sys_content: System prompt, sent as the request's ``system`` field.
        user_content: User prompt. A fixed hint about JSON quoting is
            appended to it before sending.

    Returns:
        The ``text`` of the first entry in the response body's ``content``
        list.
    """
    print("LLM using REDROCK")

    # Hint appended to every prompt. In English: "(If the output is JSON, use
    # single quotes, or backslash + double quote, for quotes inside values,
    # to avoid a JSON decoder error)".
    json_quote_hint = "(如果是 JSON 格式,value 的引號,請用單引號,或是用反斜線+雙引號,避免 JSON Decoder error )"

    request_body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 4000,
        "system": sys_content,
        "messages": [
            {"role": "user", "content": user_content + json_quote_hint},
        ],
    }

    # Alternative model id noted in the original code:
    # "anthropic.claude-3-haiku-20240307-v1:0"
    response = BEDROCK_CLIENT.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        contentType="application/json",
        accept="application/json",
        body=json.dumps(request_body),
    )

    response_body = json.loads(response.get('body').read())
    first_block = response_body.get('content')[0]
    return first_block.get('text')
748
 
749
def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None):
    """Generate text with the backend selected by ``LLM_model``.

    Args:
        sys_content: System prompt.
        user_content: User prompt.
        response_format: Optional response format. It is passed to the OpenAI
            helper only; the Bedrock helper is called without it.
        LLM_model: ``"anthropic-claude-3-sonnet"`` selects the Bedrock helper.
            Any other value, including ``None``, selects the OpenAI helper.

    Returns:
        The text returned by the selected helper.
    """
    use_bedrock = LLM_model == "anthropic-claude-3-sonnet"

    # Printed once here, before the helper runs, so the output order is
    # unchanged whichever backend is chosen.
    print(f"LLM: {LLM_model}")
    if use_bedrock:
        content = generate_content_by_bedrock(sys_content, user_content)
    else:
        content = generate_content_by_open_ai(sys_content, user_content, response_format)

    banner = "=====content====="
    print(banner)
    print(content)
    print(banner)

    return content
764
 
765
+ def get_reading_passage(video_id, df_string, source, LLM_model=None):
766
  if source == "gcs":
767
  print("===get_reading_passage on gcs===")
 
768
  bucket_name = 'video_ai_assistant'
769
  file_name = f'{video_id}_reading_passage_latex.json'
770
  blob_name = f"{video_id}/{file_name}"
771
  # 检查 reading_passage 是否存在
772
  is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
773
  if not is_file_exists:
774
+ reading_passage = generate_reading_passage(df_string, LLM_model)
775
  reading_passage_json = {"reading_passage": str(reading_passage)}
776
  reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
777
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
 
804
 
805
  return reading_passage_json
806
 
807
+ def generate_reading_passage(df_string, LLM_model=None):
808
  print("===generate_reading_passage===")
809
  segments = split_data(df_string, word_base=100000)
810
  all_content = []
 
823
  加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
824
  請直接給出文章,不用介紹怎麼處理的或是文章字數等等
825
  """
826
+ content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
827
  all_content.append(content + "\n")
828
 
829
  # 將所有生成的閱讀理解段落合併成一個完整的文章
 
836
  tts.save(filename)
837
  return filename
838
 
839
+ def get_mind_map(video_id, df_string, source, LLM_model=None):
840
  if source == "gcs":
841
  print("===get_mind_map on gcs===")
842
  gcs_client = GCS_CLIENT
 
846
  # 检查檔案是否存在
847
  is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
848
  if not is_file_exists:
849
+ mind_map = generate_mind_map(df_string, LLM_model)
850
  mind_map_json = {"mind_map": str(mind_map)}
851
  mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
852
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
 
867
  # 检查檔案是否存在
868
  exists, file_id = check_file_exists(service, folder_id, file_name)
869
  if not exists:
870
+ mind_map = generate_mind_map(df_string, LLM_model)
871
  mind_map_json = {"mind_map": str(mind_map)}
872
  mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
873
  upload_content_directly(service, file_name, folder_id, mind_map_text)
 
880
 
881
  return mind_map_json
882
 
883
+ def generate_mind_map(df_string, LLM_model=None):
884
  print("===generate_mind_map===")
885
  segments = split_data(df_string, word_base=100000)
886
  all_content = []
 
892
  注意:不需要前後文敘述,直接給出 markdown 文本即可
893
  這對我很重要
894
  """
895
+ content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
896
  all_content.append(content + "\n")
897
 
898
  # 將所有生成的閱讀理解段落合併成一個完整的文章
 
911
  """
912
  return mind_map_html
913
 
914
+ def get_video_id_summary(video_id, df_string, source, LLM_model=None):
915
  if source == "gcs":
916
  print("===get_video_id_summary on gcs===")
 
917
  bucket_name = 'video_ai_assistant'
918
  file_name = f'{video_id}_summary_markdown.json'
919
  summary_file_blob_name = f"{video_id}/{file_name}"
 
921
  is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
922
  if not is_summary_file_exists:
923
  meta_data = get_meta_data(video_id)
924
+ summary = generate_summarise(df_string, meta_data, LLM_model)
925
  summary_json = {"summary": str(summary)}
926
  summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
927
  GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
 
943
  exists, file_id = check_file_exists(service, folder_id, file_name)
944
  if not exists:
945
  meta_data = get_meta_data(video_id)
946
+ summary = generate_summarise(df_string, meta_data, LLM_model)
947
  summary_json = {"summary": str(summary)}
948
  summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
949
 
 
964
 
965
  return summary_json
966
 
967
+ def generate_summarise(df_string, metadata=None, LLM_model=None):
968
  print("===generate_summarise===")
969
  # 使用 OpenAI 生成基于上传数据的问题
970
  if metadata:
 
1012
  ## ❓ 延伸小問題
1013
  - (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
1014
  """
1015
+ content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
1016
  all_content.append(content + "\n")
1017
 
1018
  if len(all_content) > 1:
 
1051
  ## ❓ 延伸小問題
1052
  - ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
1053
  """
1054
+ final_content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
1055
  else:
1056
  final_content = all_content[0]
1057
 
1058
  return final_content
1059
 
1060
+ def get_questions(video_id, df_string, source="gcs", LLM_model=None):
1061
  if source == "gcs":
1062
  # 去 gcs 確認是有有 video_id_questions.json
1063
  print("===get_questions on gcs===")
 
1068
  # 检查檔案是否存在
1069
  is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1070
  if not is_questions_exists:
1071
+ questions = generate_questions(df_string, LLM_model)
1072
  questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
1073
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
1074
  print("questions已上傳到GCS")
 
1089
  # 检查檔案是否存在
1090
  exists, file_id = check_file_exists(service, folder_id, file_name)
1091
  if not exists:
1092
+ questions = generate_questions(df_string, LLM_model)
1093
  questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
1094
  upload_content_directly(service, file_name, folder_id, questions_text)
1095
  print("questions已上傳到Google Drive")
 
1109
  print("=====get_questions=====")
1110
  return q1, q2, q3
1111
 
1112
+ def generate_questions(df_string, LLM_model=None):
1113
  print("===generate_questions===")
1114
  # 使用 OpenAI 生成基于上传数据的问题
1115
  if isinstance(df_string, str):
 
1132
  [q1的敘述text, q2的敘述text, q3的敘述text]
1133
  }}
1134
  """
1135
+ response_format = { "type": "json_object" }
1136
+ questions = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
1137
+ questions_list = json.loads(questions)["questions"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1138
  print("=====json_response=====")
1139
+ print(questions_list)
1140
  print("=====json_response=====")
1141
 
1142
+ return questions_list
1143
 
1144
+ def get_questions_answers(video_id, df_string, source="gcs", LLM_model=None):
1145
  if source == "gcs":
1146
  try:
1147
  print("===get_questions_answers on gcs===")
 
1148
  bucket_name = 'video_ai_assistant'
1149
  file_name = f'{video_id}_questions_answers.json'
1150
  blob_name = f"{video_id}/{file_name}"
1151
  # 检查檔案是否存在
1152
  is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1153
  if not is_questions_answers_exists:
1154
+ questions_answers = generate_questions_answers(df_string, LLM_model)
1155
  questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
1156
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
1157
  print("questions_answers已上傳到GCS")
 
1162
  questions_answers = json.loads(questions_answers_text)
1163
  except Exception as e:
1164
  print(f"Error getting questions_answers: {str(e)}")
1165
+ questions_list = get_questions(video_id, df_string, source, LLM_model)
1166
+ questions_answers = [{"question": q, "answer": ""} for q in questions_list]
1167
 
1168
  return questions_answers
1169
 
1170
+ def generate_questions_answers(df_string, LLM_model=None):
1171
  print("===generate_questions_answers===")
1172
  segments = split_data(df_string, word_base=100000)
1173
  all_content = []
 
1193
  }}
1194
  """
1195
  response_format = { "type": "json_object" }
1196
+ content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
1197
  content_json = json.loads(content)["questions_answers"]
1198
  all_content += content_json
1199
 
 
1217
  print("=====get_questions=====")
1218
  return q1, q2, q3
1219
 
1220
+ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model=None):
1221
  if source == "gcs":
1222
  print("===get_key_moments on gcs===")
1223
  gcs_client = GCS_CLIENT
 
1227
  # 检查檔案是否存在
1228
  is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
1229
  if not is_key_moments_exists:
1230
+ key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
1231
  key_moments_json = {"key_moments": key_moments}
1232
  key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1233
  GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
 
1243
  for key_moment in key_moments_json["key_moments"]:
1244
  if "keywords" not in key_moment:
1245
  transcript = key_moment["transcript"]
1246
+ key_moment["keywords"] = generate_key_moments_keywords(transcript, LLM_model)
1247
  print("===keywords===")
1248
  print(key_moment["keywords"])
1249
  print("===keywords===")
 
1264
  # 检查檔案是否存在
1265
  exists, file_id = check_file_exists(service, folder_id, file_name)
1266
  if not exists:
1267
+ key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
1268
  key_moments_json = {"key_moments": key_moments}
1269
  key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
1270
  upload_content_directly(service, file_name, folder_id, key_moments_text)
 
1277
 
1278
  return key_moments_json
1279
 
1280
+ def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model=None):
1281
  print("===generate_key_moments===")
1282
  segments = split_data(formatted_simple_transcript, word_base=100000)
1283
  all_content = []
 
1304
  }}
1305
  """
1306
  response_format = { "type": "json_object" }
1307
+ content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
1308
  key_moments = json.loads(content)["key_moments"]
1309
 
1310
  # "transcript": get text from formatted_simple_transcript
 
1332
 
1333
  return all_content
1334
 
1335
+ def generate_key_moments_keywords(transcript, LLM_model=None):
1336
  print("===generate_key_moments_keywords===")
1337
  segments = split_data(transcript, word_base=100000)
1338
  all_content = []
 
1345
  不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
1346
  transcript:{segment}
1347
  """
1348
+ content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
1349
  keywords = content.strip().split(",")
1350
  all_content += keywords
1351
 
 
1626
 
1627
  def update_LLM_content(video_id, new_content, kind):
1628
  print(f"===upfdate kind on gcs===")
 
1629
  bucket_name = 'video_ai_assistant'
1630
  file_name = f'{video_id}_{kind}.json'
1631
  blob_name = f"{video_id}/{file_name}"
 
1699
  print(f"{kind} 已更新到GCS")
1700
  return gr.update(value=updated_content, interactive=False)
1701
 
1702
+ def create_LLM_content(video_id, df_string, kind, LLM_model=None):
1703
  print(f"===create_{kind}===")
1704
  print(f"video_id: {video_id}")
1705
 
1706
  if kind == "reading_passage_latex":
1707
+ content = generate_reading_passage(df_string, LLM_model)
1708
  update_LLM_content(video_id, content, kind)
1709
  elif kind == "summary_markdown":
1710
  meta_data = get_meta_data(video_id)
1711
+ content = generate_summarise(df_string, meta_data, LLM_model)
1712
  update_LLM_content(video_id, content, kind)
1713
  elif kind == "mind_map":
1714
  content = generate_mind_map(df_string)
 
1720
  transcript = df_string
1721
  formatted_simple_transcript = create_formatted_simple_transcript(transcript)
1722
  formatted_transcript = create_formatted_transcript(video_id, transcript)
1723
+ gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
1724
  update_LLM_content(video_id, gen_content, kind)
1725
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1726
  elif kind == "transcript":
 
1728
  update_LLM_content(video_id, gen_content, kind)
1729
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1730
  elif kind == "questions":
1731
+ gen_content = generate_questions(df_string, LLM_model)
1732
  update_LLM_content(video_id, gen_content, kind)
1733
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1734
  elif kind == "questions_answers":
 
1737
  else:
1738
  transcript = df_string
1739
  formatted_simple_transcript = create_formatted_simple_transcript(transcript)
1740
+ gen_content = generate_questions_answers(formatted_simple_transcript, LLM_model)
1741
  update_LLM_content(video_id, gen_content, kind)
1742
  content = json.dumps(gen_content, ensure_ascii=False, indent=2)
1743
 
 
2650
 
2651
  with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
2652
  with gr.Row() as admin:
2653
+ with gr.Column(scale=4):
2654
+ with gr.Row():
2655
+ password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
2656
+ youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
2657
+ video_id = gr.Textbox(label="video_id", visible=True)
2658
+ # file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
2659
+ # web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
2660
+ user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
2661
+ with gr.Row():
2662
+ is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
2663
+ LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4", "anthropic-claude-3-sonnet"], value="open-ai-gpt-4", visible=True, interactive=True)
2664
+ with gr.Column(scale=1):
2665
+ with gr.Row():
2666
+ youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
2667
  with gr.Row() as data_state:
2668
  content_subject_state = gr.State() # 使用 gr.State 存储 content_subject
2669
  content_grade_state = gr.State() # 使用 gr.State 存储 content_grade
 
3136
  )
3137
 
3138
  # 当输入 YouTube 链接时触发
3139
+ process_youtube_link_inputs = [password, youtube_link, LLM_model]
3140
  process_youtube_link_outputs = [
3141
  video_id,
3142
  questions_answers_json,
 
3217
  {
3218
  'button': transcript_create_button,
3219
  'action': create_LLM_content,
3220
+ 'inputs': [video_id, df_string_output, transcript_kind, LLM_model],
3221
  'outputs': [df_string_output]
3222
  },
3223
  {
 
3248
  {
3249
  'button': reading_passage_create_button,
3250
  'action': create_LLM_content,
3251
+ 'inputs': [video_id, df_string_output, reading_passage_kind, LLM_model],
3252
  'outputs': [reading_passage_text]
3253
  },
3254
  {
 
3279
  {
3280
  'button': summary_create_button,
3281
  'action': create_LLM_content,
3282
+ 'inputs': [video_id, df_string_output, summary_kind, LLM_model],
3283
  'outputs': [summary_text]
3284
  },
3285
  {
 
3310
  {
3311
  'button': key_moments_create_button,
3312
  'action': create_LLM_content,
3313
+ 'inputs': [video_id, df_string_output, key_moments_kind, LLM_model],
3314
  'outputs': [key_moments]
3315
  },
3316
  {
 
3341
  {
3342
  'button': questions_create_button,
3343
  'action': create_LLM_content,
3344
+ 'inputs': [video_id, df_string_output, questions_kind, LLM_model],
3345
  'outputs': [questions_json]
3346
  },
3347
  {
 
3372
  {
3373
  'button': questions_answers_create_button,
3374
  'action': create_LLM_content,
3375
+ 'inputs': [video_id, df_string_output, questions_answers_kind, LLM_model],
3376
  'outputs': [questions_answers_json]
3377
  },
3378
  {
 
3403
  {
3404
  'button': worksheet_create_button,
3405
  'action': create_LLM_content,
3406
+ 'inputs': [video_id, df_string_output, worksheet_kind, LLM_model],
3407
  'outputs': [worksheet_json]
3408
  },
3409
  {