update

app.py CHANGED
@@ -503,7 +503,7 @@ def upload_transcript_to_gcs(video_id, transcript):
     GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
     print("Transcript uploaded successfully.")
 
-def process_youtube_link(password, link):
+def process_youtube_link(password, link, LLM_model=None):
     verify_password(password)
     video_id = extract_youtube_id(link)
 
@@ -545,21 +545,21 @@ def process_youtube_link(password, link):
 
     # Generate the other required outputs based on the transcript
     source = "gcs"
-    questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source)
+    questions_answers = get_questions_answers(video_id, formatted_simple_transcript, source, LLM_model)
     questions_answers_json = json.dumps(questions_answers, ensure_ascii=False, indent=2)
-    summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source)
+    summary_json = get_video_id_summary(video_id, formatted_simple_transcript, source, LLM_model)
     summary_text = summary_json["summary"]
     summary = summary_json["summary"]
-    key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source)
+    key_moments_json = get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model)
     key_moments = key_moments_json["key_moments"]
     key_moments_text = json.dumps(key_moments, ensure_ascii=False, indent=2)
     key_moments_html = get_key_moments_html(key_moments)
     html_content = format_transcript_to_html(formatted_transcript)
     simple_html_content = format_simple_transcript_to_html(formatted_simple_transcript)
-    mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source)
+    mind_map_json = get_mind_map(video_id, formatted_simple_transcript, source, LLM_model)
     mind_map = mind_map_json["mind_map"]
     mind_map_html = get_mind_map_html(mind_map)
-    reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source)
+    reading_passage_json = get_reading_passage(video_id, formatted_simple_transcript, source, LLM_model)
     reading_passage_text = reading_passage_json["reading_passage"]
     reading_passage = reading_passage_json["reading_passage"]
     meta_data = get_meta_data(video_id)
@@ -703,70 +703,75 @@ def split_data(df_string, word_base=100000):
 
     return segments
 
-def generate_content_by_LLM(sys_content, user_content, response_format=None):
-    model = "gpt-4-turbo"
-    try:
-        # Use OPEN AI to generate the Reading Passage
-        messages = [
-            {"role": "system", "content": sys_content},
-            {"role": "user", "content": user_content}
-        ]
-
-        request_payload = {
-            "model": model,
-            "messages": messages,
-            "max_tokens": 4000,
-            "response_format": response_format
-        }
-
-        response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
-        content = response.choices[0].message.content
-    except Exception as e:
-        print(f"Error generating reading passage: {str(e)}")
-        print("using REDROCK")
-        # Use REDROCK to generate the Reading Passage
-        messages = [
-            {"role": "user", "content": user_content}
-        ]
-        model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
-        # model_id = "anthropic.claude-3-haiku-20240307-v1:0"
-        kwargs = {
-            "modelId": model_id,
-            "contentType": "application/json",
-            "accept": "application/json",
-            "body": json.dumps({
-                "anthropic_version": "bedrock-2023-05-31",
-                "max_tokens": 4000,
-                "system": sys_content,
-                "messages": messages
-            })
-        }
-        response = BEDROCK_CLIENT.invoke_model(**kwargs)
-        response_body = json.loads(response.get('body').read())
-        content = response_body.get('content')[0].get('text')
+def generate_content_by_open_ai(sys_content, user_content, response_format=None):
+    print("LLM using OPEN AI")
+    model = "gpt-4-turbo"
+    messages = [
+        {"role": "system", "content": sys_content},
+        {"role": "user", "content": user_content}
+    ]
+    request_payload = {
+        "model": model,
+        "messages": messages,
+        "max_tokens": 4000,
+    }
+
+    if response_format is not None:
+        request_payload["response_format"] = response_format
+
+    response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
+    content = response.choices[0].message.content.strip()
+    return content
+
+def generate_content_by_bedrock(sys_content, user_content):
+    print("LLM using REDROCK")
+    messages = [
+        {"role": "user", "content": user_content +"(如果是 JSON 格式,value 的引號,請用單引號,或是用反斜線+雙引號,避免 JSON Decoder error )"}
+    ]
+    model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
+    # model_id = "anthropic.claude-3-haiku-20240307-v1:0"
+    kwargs = {
+        "modelId": model_id,
+        "contentType": "application/json",
+        "accept": "application/json",
+        "body": json.dumps({
+            "anthropic_version": "bedrock-2023-05-31",
+            "max_tokens": 4000,
+            "system": sys_content,
+            "messages": messages
+        })
+    }
+    response = BEDROCK_CLIENT.invoke_model(**kwargs)
+    response_body = json.loads(response.get('body').read())
+    content = response_body.get('content')[0].get('text')
+    return content
+
+def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None):
+    # Use OpenAI to generate questions based on the uploaded data
+
+    if LLM_model == "anthropic-claude-3-sonnet":
+        print(f"LLM: {LLM_model}")
+        content = generate_content_by_bedrock(sys_content, user_content)
+    else:
+        print(f"LLM: {LLM_model}")
+        content = generate_content_by_open_ai(sys_content, user_content, response_format)
+
     print("=====content=====")
     print(content)
     print("=====content=====")
 
     return content
 
-def get_reading_passage(video_id, df_string, source):
+def get_reading_passage(video_id, df_string, source, LLM_model=None):
     if source == "gcs":
         print("===get_reading_passage on gcs===")
-        gcs_client = GCS_CLIENT
         bucket_name = 'video_ai_assistant'
         file_name = f'{video_id}_reading_passage_latex.json'
         blob_name = f"{video_id}/{file_name}"
         # Check whether the reading_passage already exists
         is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
         if not is_file_exists:
-            reading_passage = generate_reading_passage(df_string)
+            reading_passage = generate_reading_passage(df_string, LLM_model)
             reading_passage_json = {"reading_passage": str(reading_passage)}
             reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
             GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
@@ -799,7 +804,7 @@ def get_reading_passage(video_id, df_string, source):
 
     return reading_passage_json
 
-def generate_reading_passage(df_string):
+def generate_reading_passage(df_string, LLM_model=None):
     print("===generate_reading_passage===")
     segments = split_data(df_string, word_base=100000)
     all_content = []
@@ -818,7 +823,7 @@ def generate_reading_passage(df_string):
         加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
         請直接給出文章,不用介紹怎麼處理的或是文章字數等等
         """
-        content = generate_content_by_LLM(sys_content, user_content)
+        content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
        all_content.append(content + "\n")
 
     # Merge all generated reading passages into one complete article
@@ -831,7 +836,7 @@ def text_to_speech(video_id, text):
     tts.save(filename)
     return filename
 
-def get_mind_map(video_id, df_string, source):
+def get_mind_map(video_id, df_string, source, LLM_model=None):
     if source == "gcs":
         print("===get_mind_map on gcs===")
         gcs_client = GCS_CLIENT
@@ -841,7 +846,7 @@ def get_mind_map(video_id, df_string, source):
         # Check whether the file exists
         is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
         if not is_file_exists:
-            mind_map = generate_mind_map(df_string)
+            mind_map = generate_mind_map(df_string, LLM_model)
             mind_map_json = {"mind_map": str(mind_map)}
             mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
             GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
@@ -862,7 +867,7 @@ def get_mind_map(video_id, df_string, source):
         # Check whether the file exists
         exists, file_id = check_file_exists(service, folder_id, file_name)
         if not exists:
-            mind_map = generate_mind_map(df_string)
+            mind_map = generate_mind_map(df_string, LLM_model)
             mind_map_json = {"mind_map": str(mind_map)}
             mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
             upload_content_directly(service, file_name, folder_id, mind_map_text)
@@ -875,7 +880,7 @@ def get_mind_map(video_id, df_string, source):
 
     return mind_map_json
 
-def generate_mind_map(df_string):
+def generate_mind_map(df_string, LLM_model=None):
     print("===generate_mind_map===")
     segments = split_data(df_string, word_base=100000)
     all_content = []
@@ -887,7 +892,7 @@ def generate_mind_map(df_string):
         注意:不需要前後文敘述,直接給出 markdown 文本即可
         這對我很重要
         """
-        content = generate_content_by_LLM(sys_content, user_content)
+        content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
         all_content.append(content + "\n")
 
     # Merge all generated reading passages into one complete article
@@ -906,10 +911,9 @@ def get_mind_map_html(mind_map):
     """
     return mind_map_html
 
-def get_video_id_summary(video_id, df_string, source):
+def get_video_id_summary(video_id, df_string, source, LLM_model=None):
     if source == "gcs":
         print("===get_video_id_summary on gcs===")
-        gcs_client = GCS_CLIENT
         bucket_name = 'video_ai_assistant'
         file_name = f'{video_id}_summary_markdown.json'
         summary_file_blob_name = f"{video_id}/{file_name}"
@@ -917,7 +921,7 @@ def get_video_id_summary(video_id, df_string, source):
         is_summary_file_exists = GCS_SERVICE.check_file_exists(bucket_name, summary_file_blob_name)
         if not is_summary_file_exists:
             meta_data = get_meta_data(video_id)
-            summary = generate_summarise(df_string, meta_data)
+            summary = generate_summarise(df_string, meta_data, LLM_model)
             summary_json = {"summary": str(summary)}
             summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
             GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
@@ -939,7 +943,7 @@ def get_video_id_summary(video_id, df_string, source):
         exists, file_id = check_file_exists(service, folder_id, file_name)
         if not exists:
             meta_data = get_meta_data(video_id)
-            summary = generate_summarise(df_string, meta_data)
+            summary = generate_summarise(df_string, meta_data, LLM_model)
             summary_json = {"summary": str(summary)}
             summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
 
@@ -960,7 +964,7 @@ def get_video_id_summary(video_id, df_string, source):
 
     return summary_json
 
-def generate_summarise(df_string, metadata=None):
+def generate_summarise(df_string, metadata=None, LLM_model=None):
     print("===generate_summarise===")
     # Use OpenAI to generate questions based on the uploaded data
     if metadata:
@@ -1008,7 +1012,7 @@ def generate_summarise(df_string, metadata=None):
         ## ❓ 延伸小問題
         - (一個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
         """
-        content = generate_content_by_LLM(sys_content, user_content)
+        content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
         all_content.append(content + "\n")
 
     if len(all_content) > 1:
@@ -1047,13 +1051,13 @@ def generate_summarise(df_string, metadata=None):
         ## ❓ 延伸小問題
         - ( {all_content_cnt} 個 bullet point....請圍繞「課程名稱」為學習重點,進行重點整理,不要整理跟情境故事相關的問題)
         """
-        final_content = generate_content_by_LLM(sys_content, user_content)
+        final_content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
     else:
         final_content = all_content[0]
 
     return final_content
 
-def get_questions(video_id, df_string, source="gcs"):
+def get_questions(video_id, df_string, source="gcs", LLM_model=None):
     if source == "gcs":
         # Check GCS for video_id_questions.json
         print("===get_questions on gcs===")
@@ -1064,7 +1068,7 @@ def get_questions(video_id, df_string, source="gcs"):
         # Check whether the file exists
         is_questions_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
         if not is_questions_exists:
-            questions = generate_questions(df_string)
+            questions = generate_questions(df_string, LLM_model)
             questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
             GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
             print("questions已上傳到GCS")
@@ -1085,7 +1089,7 @@ def get_questions(video_id, df_string, source="gcs"):
         # Check whether the file exists
         exists, file_id = check_file_exists(service, folder_id, file_name)
         if not exists:
-            questions = generate_questions(df_string)
+            questions = generate_questions(df_string, LLM_model)
             questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
             upload_content_directly(service, file_name, folder_id, questions_text)
             print("questions已上傳到Google Drive")
@@ -1105,7 +1109,7 @@ def get_questions(video_id, df_string, source="gcs"):
     print("=====get_questions=====")
     return q1, q2, q3
 
-def generate_questions(df_string):
+def generate_questions(df_string, LLM_model=None):
     print("===generate_questions===")
     # Use OpenAI to generate questions based on the uploaded data
     if isinstance(df_string, str):
@@ -1128,69 +1132,26 @@ def generate_questions(df_string):
         [q1的敘述text, q2的敘述text, q3的敘述text]
     }}
    """
-
-    model = "gpt-4-turbo"
-    try:
-        messages = [
-            {"role": "system", "content": sys_content},
-            {"role": "user", "content": user_content}
-        ]
-        response_format = { "type": "json_object" }
-
-        print("=====messages=====")
-        print(messages)
-        print("=====messages=====")
-
-        request_payload = {
-            "model": model,
-            "messages": messages,
-            "max_tokens": 4000,
-            "response_format": response_format
-        }
-
-        response = OPEN_AI_CLIENT.chat.completions.create(**request_payload)
-        questions = json.loads(response.choices[0].message.content)["questions"]
-    except:
-        messages = [
-            {"role": "user", "content": user_content}
-        ]
-        model_id = "anthropic.claude-3-sonnet-20240229-v1:0"
-        # model_id = "anthropic.claude-3-haiku-20240307-v1:0"
-        kwargs = {
-            "modelId": model_id,
-            "contentType": "application/json",
-            "accept": "application/json",
-            "body": json.dumps({
-                "anthropic_version": "bedrock-2023-05-31",
-                "max_tokens": 4000,
-                "system": sys_content,
-                "messages": messages
-            })
-        }
-        response = BEDROCK_CLIENT.invoke_model(**kwargs)
-        response_body = json.loads(response.get('body').read())
-        response_completion = response_body.get('content')[0].get('text')
-        questions = json.loads(response_completion)["questions"]
-
+    response_format = { "type": "json_object" }
+    questions = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
+    questions_list = json.loads(questions)["questions"]
     print("=====json_response=====")
-    print(questions)
+    print(questions_list)
     print("=====json_response=====")
 
-    return questions
+    return questions_list
 
-def get_questions_answers(video_id, df_string, source="gcs"):
+def get_questions_answers(video_id, df_string, source="gcs", LLM_model=None):
     if source == "gcs":
         try:
             print("===get_questions_answers on gcs===")
-            gcs_client = GCS_CLIENT
             bucket_name = 'video_ai_assistant'
             file_name = f'{video_id}_questions_answers.json'
             blob_name = f"{video_id}/{file_name}"
             # Check whether the file exists
             is_questions_answers_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
             if not is_questions_answers_exists:
-                questions_answers = generate_questions_answers(df_string)
+                questions_answers = generate_questions_answers(df_string, LLM_model)
                 questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
                 GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
                 print("questions_answers已上傳到GCS")
@@ -1201,12 +1162,12 @@ def get_questions_answers(video_id, df_string, source="gcs"):
             questions_answers = json.loads(questions_answers_text)
         except Exception as e:
             print(f"Error getting questions_answers: {str(e)}")
-            questions = get_questions(video_id, df_string, source)
-            questions_answers = [{"question": q, "answer": ""} for q in questions]
+            questions_list = get_questions(video_id, df_string, source, LLM_model)
+            questions_answers = [{"question": q, "answer": ""} for q in questions_list]
 
     return questions_answers
 
-def generate_questions_answers(df_string):
+def generate_questions_answers(df_string, LLM_model=None):
     print("===generate_questions_answers===")
     segments = split_data(df_string, word_base=100000)
     all_content = []
@@ -1232,7 +1193,7 @@ def generate_questions_answers(df_string):
         }}
         """
         response_format = { "type": "json_object" }
-        content = generate_content_by_LLM(sys_content, user_content, response_format)
+        content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
         content_json = json.loads(content)["questions_answers"]
         all_content += content_json
 
@@ -1256,7 +1217,7 @@ def change_questions(password, df_string):
     print("=====get_questions=====")
     return q1, q2, q3
 
-def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source):
+def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript, source, LLM_model=None):
     if source == "gcs":
         print("===get_key_moments on gcs===")
         gcs_client = GCS_CLIENT
@@ -1266,7 +1227,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
         # Check whether the file exists
         is_key_moments_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
         if not is_key_moments_exists:
-            key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
+            key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
             key_moments_json = {"key_moments": key_moments}
             key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
             GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
@@ -1282,7 +1243,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
         for key_moment in key_moments_json["key_moments"]:
             if "keywords" not in key_moment:
                 transcript = key_moment["transcript"]
-                key_moment["keywords"] = generate_key_moments_keywords(transcript)
+                key_moment["keywords"] = generate_key_moments_keywords(transcript, LLM_model)
                 print("===keywords===")
                 print(key_moment["keywords"])
                 print("===keywords===")
@@ -1303,7 +1264,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
         # Check whether the file exists
         exists, file_id = check_file_exists(service, folder_id, file_name)
         if not exists:
-            key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
+            key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
             key_moments_json = {"key_moments": key_moments}
             key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
             upload_content_directly(service, file_name, folder_id, key_moments_text)
@@ -1316,7 +1277,7 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
 
     return key_moments_json
 
-def generate_key_moments(formatted_simple_transcript, formatted_transcript):
+def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model=None):
     print("===generate_key_moments===")
     segments = split_data(formatted_simple_transcript, word_base=100000)
     all_content = []
@@ -1343,7 +1304,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
         }}
         """
         response_format = { "type": "json_object" }
-        content = generate_content_by_LLM(sys_content, user_content, response_format)
+        content = generate_content_by_LLM(sys_content, user_content, response_format, LLM_model)
         key_moments = json.loads(content)["key_moments"]
 
         # "transcript": get text from formatted_simple_transcript
@@ -1371,7 +1332,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript):
 
     return all_content
 
-def generate_key_moments_keywords(transcript):
+def generate_key_moments_keywords(transcript, LLM_model=None):
     print("===generate_key_moments_keywords===")
     segments = split_data(transcript, word_base=100000)
     all_content = []
@@ -1384,7 +1345,7 @@ def generate_key_moments_keywords(transcript):
         不用給上下文,直接給出關鍵字,使用 zh-TW,用逗號分隔, example: 關鍵字1, 關鍵字2
         transcript:{segment}
         """
-        content = generate_content_by_LLM(sys_content, user_content)
+        content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model)
         keywords = content.strip().split(",")
         all_content += keywords
 
@@ -1665,7 +1626,6 @@ def delete_LLM_content(video_id, kind):
 
 def update_LLM_content(video_id, new_content, kind):
     print(f"===upfdate kind on gcs===")
-    gcs_client = GCS_CLIENT
     bucket_name = 'video_ai_assistant'
     file_name = f'{video_id}_{kind}.json'
     blob_name = f"{video_id}/{file_name}"
@@ -1739,16 +1699,16 @@ def update_LLM_content(video_id, new_content, kind):
     print(f"{kind} 已更新到GCS")
     return gr.update(value=updated_content, interactive=False)
 
-def create_LLM_content(video_id, df_string, kind):
+def create_LLM_content(video_id, df_string, kind, LLM_model=None):
     print(f"===create_{kind}===")
     print(f"video_id: {video_id}")
 
     if kind == "reading_passage_latex":
-        content = generate_reading_passage(df_string)
+        content = generate_reading_passage(df_string, LLM_model)
         update_LLM_content(video_id, content, kind)
     elif kind == "summary_markdown":
         meta_data = get_meta_data(video_id)
-        content = generate_summarise(df_string, meta_data)
+        content = generate_summarise(df_string, meta_data, LLM_model)
         update_LLM_content(video_id, content, kind)
     elif kind == "mind_map":
         content = generate_mind_map(df_string)
@@ -1760,7 +1720,7 @@ def create_LLM_content(video_id, df_string, kind):
         transcript = df_string
         formatted_simple_transcript = create_formatted_simple_transcript(transcript)
         formatted_transcript = create_formatted_transcript(video_id, transcript)
-        gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript)
+        gen_content = generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_model)
         update_LLM_content(video_id, gen_content, kind)
         content = json.dumps(gen_content, ensure_ascii=False, indent=2)
     elif kind == "transcript":
@@ -1768,7 +1728,7 @@ def create_LLM_content(video_id, df_string, kind):
         update_LLM_content(video_id, gen_content, kind)
         content = json.dumps(gen_content, ensure_ascii=False, indent=2)
     elif kind == "questions":
-        gen_content = generate_questions(df_string)
+        gen_content = generate_questions(df_string, LLM_model)
         update_LLM_content(video_id, gen_content, kind)
         content = json.dumps(gen_content, ensure_ascii=False, indent=2)
     elif kind == "questions_answers":
@@ -1777,7 +1737,7 @@ def create_LLM_content(video_id, df_string, kind):
     else:
         transcript = df_string
         formatted_simple_transcript = create_formatted_simple_transcript(transcript)
-        gen_content = generate_questions_answers(formatted_simple_transcript)
+        gen_content = generate_questions_answers(formatted_simple_transcript, LLM_model)
         update_LLM_content(video_id, gen_content, kind)
         content = json.dumps(gen_content, ensure_ascii=False, indent=2)
 
@@ -2690,14 +2650,20 @@ HEAD = """
 
 with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, text_size = gr.themes.sizes.text_lg), head=HEAD) as demo:
     with gr.Row() as admin:
-        password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
-        youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
-        video_id = gr.Textbox(label="video_id", visible=True)
-        # file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
-        # web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
-        user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
-        is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
-        youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
+        with gr.Column(scale=4):
+            with gr.Row():
+                password = gr.Textbox(label="Password", type="password", elem_id="password_input", visible=True)
+                youtube_link = gr.Textbox(label="Enter YouTube Link", elem_id="youtube_link_input", visible=True)
+                video_id = gr.Textbox(label="video_id", visible=True)
+                # file_upload = gr.File(label="Upload your CSV or Word file", visible=False)
+                # web_link = gr.Textbox(label="Enter Web Page Link", visible=False)
+                user_data = gr.Textbox(label="User Data", elem_id="user_data_input", visible=True)
+            with gr.Row():
+                is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
+                LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4", "anthropic-claude-3-sonnet"], value="open-ai-gpt-4", visible=True, interactive=True)
+        with gr.Column(scale=1):
+            with gr.Row():
+                youtube_link_btn = gr.Button("Submit_YouTube_Link", elem_id="youtube_link_btn", visible=True)
     with gr.Row() as data_state:
        content_subject_state = gr.State() # use gr.State to store content_subject
        content_grade_state = gr.State() # use gr.State to store content_grade
@@ -3170,7 +3136,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
     )
 
     # Triggered when a YouTube link is entered
-    process_youtube_link_inputs = [password, youtube_link]
+    process_youtube_link_inputs = [password, youtube_link, LLM_model]
     process_youtube_link_outputs = [
         video_id,
         questions_answers_json,
@@ -3251,7 +3217,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
         {
            'button': transcript_create_button,
            'action': create_LLM_content,
-           'inputs': [video_id, df_string_output, transcript_kind],
+           'inputs': [video_id, df_string_output, transcript_kind, LLM_model],
            'outputs': [df_string_output]
         },
         {
@@ -3282,7 +3248,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
         {
            'button': reading_passage_create_button,
            'action': create_LLM_content,
-           'inputs': [video_id, df_string_output, reading_passage_kind],
+           'inputs': [video_id, df_string_output, reading_passage_kind, LLM_model],
            'outputs': [reading_passage_text]
        },
        {
@@ -3313,7 +3279,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
        {
            'button': summary_create_button,
            'action': create_LLM_content,
-           'inputs': [video_id, df_string_output, summary_kind],
+           'inputs': [video_id, df_string_output, summary_kind, LLM_model],
            'outputs': [summary_text]
        },
        {
@@ -3344,7 +3310,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
        {
            'button': key_moments_create_button,
            'action': create_LLM_content,
-           'inputs': [video_id, df_string_output, key_moments_kind],
+           'inputs': [video_id, df_string_output, key_moments_kind, LLM_model],
            'outputs': [key_moments]
        },
        {
@@ -3375,7 +3341,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
        {
            'button': questions_create_button,
            'action': create_LLM_content,
-           'inputs': [video_id, df_string_output, questions_kind],
+           'inputs': [video_id, df_string_output, questions_kind, LLM_model],
            'outputs': [questions_json]
        },
        {
@@ -3406,7 +3372,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
        {
            'button': questions_answers_create_button,
            'action': create_LLM_content,
-           'inputs': [video_id, df_string_output, questions_answers_kind],
+           'inputs': [video_id, df_string_output, questions_answers_kind, LLM_model],
            'outputs': [questions_answers_json]
        },
        {
@@ -3437,7 +3403,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
        {
            'button': worksheet_create_button,
            'action': create_LLM_content,
-           'inputs': [video_id, df_string_output, worksheet_kind],
+           'inputs': [video_id, df_string_output, worksheet_kind, LLM_model],
            'outputs': [worksheet_json]
        },
        {