Spaces:
Running
Running
GCS_SERVICE refactor
Browse files
app.py
CHANGED
@@ -93,87 +93,6 @@ def verify_password(password):
|
|
93 |
raise gr.Error("密碼錯誤")
|
94 |
|
95 |
# ====gcs====
|
96 |
-
def gcs_check_file_exists(gcs_client, bucket_name, file_name):
|
97 |
-
"""
|
98 |
-
检查 GCS 存储桶中是否存在指定的文件
|
99 |
-
file_name 格式:{folder_name}/{file_name}
|
100 |
-
"""
|
101 |
-
bucket = gcs_client.bucket(bucket_name)
|
102 |
-
blob = bucket.blob(file_name)
|
103 |
-
return blob.exists()
|
104 |
-
|
105 |
-
def upload_file_to_gcs(gcs_client, bucket_name, destination_blob_name, file_path):
|
106 |
-
"""上传文件到指定的 GCS 存储桶"""
|
107 |
-
bucket = gcs_client.bucket(bucket_name)
|
108 |
-
blob = bucket.blob(destination_blob_name)
|
109 |
-
blob.upload_from_filename(file_path)
|
110 |
-
print(f"File {file_path} uploaded to {destination_blob_name} in GCS.")
|
111 |
-
|
112 |
-
def upload_file_to_gcs_with_json_string(gcs_client, bucket_name, destination_blob_name, json_string):
|
113 |
-
"""上传字符串到指定的 GCS 存储桶"""
|
114 |
-
bucket = gcs_client.bucket(bucket_name)
|
115 |
-
blob = bucket.blob(destination_blob_name)
|
116 |
-
blob.upload_from_string(json_string)
|
117 |
-
print(f"JSON string uploaded to {destination_blob_name} in GCS.")
|
118 |
-
|
119 |
-
def download_blob_to_string(gcs_client, bucket_name, source_blob_name):
|
120 |
-
"""从 GCS 下载文件内容到字符串"""
|
121 |
-
bucket = gcs_client.bucket(bucket_name)
|
122 |
-
blob = bucket.blob(source_blob_name)
|
123 |
-
return blob.download_as_text()
|
124 |
-
|
125 |
-
def make_blob_public(gcs_client, bucket_name, blob_name):
|
126 |
-
"""将指定的 GCS 对象设置为公共可读"""
|
127 |
-
bucket = gcs_client.bucket(bucket_name)
|
128 |
-
blob = bucket.blob(blob_name)
|
129 |
-
blob.make_public()
|
130 |
-
print(f"Blob {blob_name} is now publicly accessible at {blob.public_url}")
|
131 |
-
|
132 |
-
def get_blob_public_url(gcs_client, bucket_name, blob_name):
|
133 |
-
"""获取指定 GCS 对象的公开 URL"""
|
134 |
-
bucket = gcs_client.bucket(bucket_name)
|
135 |
-
blob = bucket.blob(blob_name)
|
136 |
-
return blob.public_url
|
137 |
-
|
138 |
-
def upload_img_and_get_public_url(gcs_client, bucket_name, file_name, file_path):
|
139 |
-
"""上传图片到 GCS 并获取其公开 URL"""
|
140 |
-
# 上传图片
|
141 |
-
upload_file_to_gcs(gcs_client, bucket_name, file_name, file_path)
|
142 |
-
# 将上传的图片设置为公开
|
143 |
-
make_blob_public(gcs_client, bucket_name, file_name)
|
144 |
-
# 获取图片的公开 URL
|
145 |
-
public_url = get_blob_public_url(gcs_client, bucket_name, file_name)
|
146 |
-
print(f"Public URL for the uploaded image: {public_url}")
|
147 |
-
return public_url
|
148 |
-
|
149 |
-
def copy_all_files_from_drive_to_gcs(drive_service, gcs_client, drive_folder_id, bucket_name, gcs_folder_name):
|
150 |
-
# Get all files from the folder
|
151 |
-
query = f"'{drive_folder_id}' in parents and trashed = false"
|
152 |
-
response = drive_service.files().list(q=query).execute()
|
153 |
-
files = response.get('files', [])
|
154 |
-
for file in files:
|
155 |
-
# Copy each file to GCS
|
156 |
-
file_id = file['id']
|
157 |
-
file_name = file['name']
|
158 |
-
gcs_destination_path = f"{gcs_folder_name}/{file_name}"
|
159 |
-
copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name, gcs_destination_path)
|
160 |
-
|
161 |
-
def copy_file_from_drive_to_gcs(drive_service, gcs_client, file_id, bucket_name, gcs_destination_path):
|
162 |
-
# Download file content from Drive
|
163 |
-
request = drive_service.files().get_media(fileId=file_id)
|
164 |
-
fh = io.BytesIO()
|
165 |
-
downloader = MediaIoBaseDownload(fh, request)
|
166 |
-
done = False
|
167 |
-
while not done:
|
168 |
-
status, done = downloader.next_chunk()
|
169 |
-
fh.seek(0)
|
170 |
-
file_content = fh.getvalue()
|
171 |
-
|
172 |
-
# Upload file content to GCS
|
173 |
-
bucket = gcs_client.bucket(bucket_name)
|
174 |
-
blob = bucket.blob(gcs_destination_path)
|
175 |
-
blob.upload_from_string(file_content)
|
176 |
-
print(f"File {file_id} copied to GCS at {gcs_destination_path}.")
|
177 |
|
178 |
def delete_blob(gcs_client, bucket_name, blob_name):
|
179 |
"""删除指定的 GCS 对象"""
|
@@ -483,12 +402,13 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
483 |
transcript = generate_transcription_by_whisper(video_id)
|
484 |
|
485 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
486 |
-
|
|
|
487 |
is_new_transcript = True
|
488 |
else:
|
489 |
# 逐字稿已存在,下载逐字稿内容
|
490 |
print("逐字稿已存在于GCS中")
|
491 |
-
transcript_text =
|
492 |
transcript = json.loads(transcript_text)
|
493 |
|
494 |
# print("===確認其他衍生文件===")
|
@@ -517,7 +437,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
517 |
# 截图
|
518 |
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
519 |
screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
|
520 |
-
img_file_id =
|
521 |
entry['img_file_id'] = img_file_id
|
522 |
print(f"截图已上传到GCS: {img_file_id}")
|
523 |
is_new_transcript = True
|
@@ -529,7 +449,7 @@ def process_transcript_and_screenshots_on_gcs(video_id):
|
|
529 |
print(transcript)
|
530 |
print("===更新逐字稿文件===")
|
531 |
updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
532 |
-
|
533 |
print("逐字稿已更新,包括截图链接")
|
534 |
updated_transcript_json = json.loads(updated_transcript_text)
|
535 |
else:
|
@@ -723,12 +643,12 @@ def get_reading_passage(video_id, df_string, source):
|
|
723 |
reading_passage = generate_reading_passage(df_string)
|
724 |
reading_passage_json = {"reading_passage": str(reading_passage)}
|
725 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
726 |
-
|
727 |
print("reading_passage已上传到GCS")
|
728 |
else:
|
729 |
# reading_passage已存在,下载内容
|
730 |
print("reading_passage已存在于GCS中")
|
731 |
-
reading_passage_text =
|
732 |
reading_passage_json = json.loads(reading_passage_text)
|
733 |
|
734 |
elif source == "drive":
|
@@ -805,12 +725,12 @@ def get_mind_map(video_id, df_string, source):
|
|
805 |
mind_map = generate_mind_map(df_string)
|
806 |
mind_map_json = {"mind_map": str(mind_map)}
|
807 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
808 |
-
|
809 |
print("mind_map已上傳到GCS")
|
810 |
else:
|
811 |
# mindmap已存在,下载内容
|
812 |
print("mind_map已存在于GCS中")
|
813 |
-
mind_map_text =
|
814 |
mind_map_json = json.loads(mind_map_text)
|
815 |
|
816 |
elif source == "drive":
|
@@ -889,12 +809,12 @@ def get_video_id_summary(video_id, df_string, source):
|
|
889 |
summary = generate_summarise(df_string, meta_data)
|
890 |
summary_json = {"summary": str(summary)}
|
891 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
892 |
-
|
893 |
print("summary已上传到GCS")
|
894 |
else:
|
895 |
# summary已存在,下载内容
|
896 |
print("summary已存在于GCS中")
|
897 |
-
summary_text =
|
898 |
summary_json = json.loads(summary_text)
|
899 |
|
900 |
elif source == "drive":
|
@@ -1012,12 +932,12 @@ def get_questions(video_id, df_string, source="gcs"):
|
|
1012 |
if not is_questions_exists:
|
1013 |
questions = generate_questions(df_string)
|
1014 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
1015 |
-
|
1016 |
print("questions已上傳到GCS")
|
1017 |
else:
|
1018 |
# 逐字稿已存在,下载逐字稿内容
|
1019 |
print("questions已存在于GCS中")
|
1020 |
-
questions_text =
|
1021 |
questions = json.loads(questions_text)
|
1022 |
|
1023 |
elif source == "drive":
|
@@ -1103,12 +1023,12 @@ def get_questions_answers(video_id, df_string, source="gcs"):
|
|
1103 |
if not is_questions_answers_exists:
|
1104 |
questions_answers = generate_questions_answers(df_string)
|
1105 |
questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
1106 |
-
|
1107 |
print("questions_answers已上傳到GCS")
|
1108 |
else:
|
1109 |
# questions_answers已存在,下载内容
|
1110 |
print("questions_answers已存在于GCS中")
|
1111 |
-
questions_answers_text =
|
1112 |
questions_answers = json.loads(questions_answers_text)
|
1113 |
except:
|
1114 |
questions = get_questions(video_id, df_string, source)
|
@@ -1202,12 +1122,12 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
1202 |
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
1203 |
key_moments_json = {"key_moments": key_moments}
|
1204 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1205 |
-
|
1206 |
print("key_moments已上傳到GCS")
|
1207 |
else:
|
1208 |
# key_moments已存在,下载内容
|
1209 |
print("key_moments已存在于GCS中")
|
1210 |
-
key_moments_text =
|
1211 |
key_moments_json = json.loads(key_moments_text)
|
1212 |
# 檢查 key_moments 是否有 keywords
|
1213 |
print("===檢查 key_moments 是否有 keywords===")
|
@@ -1222,8 +1142,8 @@ def get_key_moments(video_id, formatted_simple_transcript, formatted_transcript,
|
|
1222 |
has_keywords_added = True
|
1223 |
if has_keywords_added:
|
1224 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1225 |
-
|
1226 |
-
key_moments_text =
|
1227 |
key_moments_json = json.loads(key_moments_text)
|
1228 |
|
1229 |
elif source == "drive":
|
@@ -1545,7 +1465,7 @@ def get_LLM_content(video_id, kind):
|
|
1545 |
# 检查 file 是否存在
|
1546 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1547 |
if is_file_exists:
|
1548 |
-
content =
|
1549 |
content_json = json.loads(content)
|
1550 |
if kind == "reading_passage_latex":
|
1551 |
content_text = content_json["reading_passage"]
|
@@ -1569,7 +1489,7 @@ def delete_LLM_content(video_id, kind):
|
|
1569 |
# 检查 file 是否存在
|
1570 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1571 |
if is_file_exists:
|
1572 |
-
delete_blob(
|
1573 |
print(f"{file_name}已从GCS中删除")
|
1574 |
return gr.update(value="", interactive=False)
|
1575 |
|
@@ -1585,17 +1505,17 @@ def update_LLM_content(video_id, new_content, kind):
|
|
1585 |
print(new_content)
|
1586 |
reading_passage_json = {"reading_passage": str(new_content)}
|
1587 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
1588 |
-
|
1589 |
updated_content = new_content
|
1590 |
elif kind == "summary_markdown":
|
1591 |
summary_json = {"summary": str(new_content)}
|
1592 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
1593 |
-
|
1594 |
updated_content = new_content
|
1595 |
elif kind == "mind_map":
|
1596 |
mind_map_json = {"mind_map": str(new_content)}
|
1597 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
1598 |
-
|
1599 |
updated_content = mind_map_text
|
1600 |
elif kind == "key_moments":
|
1601 |
# from update_LLM_btn -> new_content is a string
|
@@ -1606,7 +1526,7 @@ def update_LLM_content(video_id, new_content, kind):
|
|
1606 |
key_moments_list = new_content
|
1607 |
key_moments_json = {"key_moments": key_moments_list}
|
1608 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1609 |
-
|
1610 |
updated_content = key_moments_text
|
1611 |
elif kind == "transcript":
|
1612 |
if isinstance(new_content, str):
|
@@ -1614,7 +1534,7 @@ def update_LLM_content(video_id, new_content, kind):
|
|
1614 |
else:
|
1615 |
transcript_json = new_content
|
1616 |
transcript_text = json.dumps(transcript_json, ensure_ascii=False, indent=2)
|
1617 |
-
|
1618 |
updated_content = transcript_text
|
1619 |
elif kind == "questions":
|
1620 |
# from update_LLM_btn -> new_content is a string
|
@@ -1624,7 +1544,7 @@ def update_LLM_content(video_id, new_content, kind):
|
|
1624 |
else:
|
1625 |
questions_json = new_content
|
1626 |
questions_text = json.dumps(questions_json, ensure_ascii=False, indent=2)
|
1627 |
-
|
1628 |
updated_content = questions_text
|
1629 |
elif kind == "questions_answers":
|
1630 |
# from update_LLM_btn -> new_content is a string
|
@@ -1634,7 +1554,7 @@ def update_LLM_content(video_id, new_content, kind):
|
|
1634 |
else:
|
1635 |
questions_answers_json = new_content
|
1636 |
questions_answers_text = json.dumps(questions_answers_json, ensure_ascii=False, indent=2)
|
1637 |
-
|
1638 |
updated_content = questions_answers_text
|
1639 |
|
1640 |
print(f"{kind} 已更新到GCS")
|
@@ -1701,7 +1621,7 @@ def reading_passage_add_latex_version(video_id):
|
|
1701 |
|
1702 |
# 逐字稿已存在,下载逐字稿内容
|
1703 |
print("reading_passage 已存在于GCS中,轉換 Latex 模式")
|
1704 |
-
reading_passage_text =
|
1705 |
reading_passage_json = json.loads(reading_passage_text)
|
1706 |
original_reading_passage = reading_passage_json["reading_passage"]
|
1707 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
@@ -1734,7 +1654,7 @@ def reading_passage_add_latex_version(video_id):
|
|
1734 |
# 另存為 reading_passage_latex.json
|
1735 |
new_file_name = f'{video_id}_reading_passage_latex.json'
|
1736 |
new_blob_name = f"{video_id}/{new_file_name}"
|
1737 |
-
|
1738 |
|
1739 |
return new_reading_passage
|
1740 |
|
@@ -1754,7 +1674,7 @@ def summary_add_markdown_version(video_id):
|
|
1754 |
|
1755 |
# 逐字稿已存在,下载逐字稿内容
|
1756 |
print("summary 已存在于GCS中,轉換 Markdown 模式")
|
1757 |
-
summary_text =
|
1758 |
summary_json = json.loads(summary_text)
|
1759 |
original_summary = summary_json["summary"]
|
1760 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
@@ -1803,7 +1723,7 @@ def summary_add_markdown_version(video_id):
|
|
1803 |
# 另存為 summary_markdown.json
|
1804 |
new_file_name = f'{video_id}_summary_markdown.json'
|
1805 |
new_blob_name = f"{video_id}/{new_file_name}"
|
1806 |
-
|
1807 |
|
1808 |
return new_summary
|
1809 |
|
@@ -1827,7 +1747,7 @@ def get_meta_data(video_id, source="gcs"):
|
|
1827 |
else:
|
1828 |
# meta_data已存在,下载内容
|
1829 |
print("meta_data已存在于GCS中")
|
1830 |
-
meta_data_text =
|
1831 |
meta_data_json = json.loads(meta_data_text)
|
1832 |
|
1833 |
# meta_data_json grade 數字轉換成文字
|
@@ -1865,11 +1785,11 @@ def get_ai_content(password, video_id, df_string, topic, grade, level, specific_
|
|
1865 |
# 先建立一個 ai_content_list.json
|
1866 |
ai_content_list = []
|
1867 |
ai_content_text = json.dumps(ai_content_list, ensure_ascii=False, indent=2)
|
1868 |
-
|
1869 |
print("ai_content_list [] 已上傳到GCS")
|
1870 |
|
1871 |
# 此時 ai_content_list 已存在
|
1872 |
-
ai_content_list_string =
|
1873 |
ai_content_list = json.loads(ai_content_list_string)
|
1874 |
# by key 找到 ai_content (topic, grade, level, specific_feature, content_type)
|
1875 |
target_kvs = {
|
@@ -1896,7 +1816,7 @@ def get_ai_content(password, video_id, df_string, topic, grade, level, specific_
|
|
1896 |
|
1897 |
ai_content_list.append(ai_content_json)
|
1898 |
ai_content_text = json.dumps(ai_content_list, ensure_ascii=False, indent=2)
|
1899 |
-
|
1900 |
print("ai_content已上傳到GCS")
|
1901 |
else:
|
1902 |
ai_content_json = ai_content_json[-1]
|
|
|
93 |
raise gr.Error("密碼錯誤")
|
94 |
|
95 |
# ====gcs====
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
def delete_blob(gcs_client, bucket_name, blob_name):
|
98 |
"""删除指定的 GCS 对象"""
|
|
|
402 |
transcript = generate_transcription_by_whisper(video_id)
|
403 |
|
404 |
transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
405 |
+
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, transcript_text)
|
406 |
+
|
407 |
is_new_transcript = True
|
408 |
else:
|
409 |
# 逐字稿已存在,下载逐字稿内容
|
410 |
print("逐字稿已存在于GCS中")
|
411 |
+
transcript_text = GCS_SERVICE.download_as_string(bucket_name, transcript_blob_name)
|
412 |
transcript = json.loads(transcript_text)
|
413 |
|
414 |
# print("===確認其他衍生文件===")
|
|
|
437 |
# 截图
|
438 |
screenshot_path = screenshot_youtube_video(video_id, entry['start'])
|
439 |
screenshot_blob_name = f"{video_id}/{video_id}_{entry['start']}.jpg"
|
440 |
+
img_file_id = GCS_SERVICE.upload_image_and_get_public_url(bucket_name, screenshot_blob_name, screenshot_path)
|
441 |
entry['img_file_id'] = img_file_id
|
442 |
print(f"截图已上传到GCS: {img_file_id}")
|
443 |
is_new_transcript = True
|
|
|
449 |
print(transcript)
|
450 |
print("===更新逐字稿文件===")
|
451 |
updated_transcript_text = json.dumps(transcript, ensure_ascii=False, indent=2)
|
452 |
+
GCS_SERVICE.upload_json_string(bucket_name, transcript_blob_name, updated_transcript_text)
|
453 |
print("逐字稿已更新,包括截图链接")
|
454 |
updated_transcript_json = json.loads(updated_transcript_text)
|
455 |
else:
|
|
|
643 |
reading_passage = generate_reading_passage(df_string)
|
644 |
reading_passage_json = {"reading_passage": str(reading_passage)}
|
645 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
646 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
|
647 |
print("reading_passage已上传到GCS")
|
648 |
else:
|
649 |
# reading_passage已存在,下载内容
|
650 |
print("reading_passage已存在于GCS中")
|
651 |
+
reading_passage_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
652 |
reading_passage_json = json.loads(reading_passage_text)
|
653 |
|
654 |
elif source == "drive":
|
|
|
725 |
mind_map = generate_mind_map(df_string)
|
726 |
mind_map_json = {"mind_map": str(mind_map)}
|
727 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
728 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
|
729 |
print("mind_map已上傳到GCS")
|
730 |
else:
|
731 |
# mindmap已存在,下载内容
|
732 |
print("mind_map已存在于GCS中")
|
733 |
+
mind_map_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
734 |
mind_map_json = json.loads(mind_map_text)
|
735 |
|
736 |
elif source == "drive":
|
|
|
809 |
summary = generate_summarise(df_string, meta_data)
|
810 |
summary_json = {"summary": str(summary)}
|
811 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
812 |
+
GCS_SERVICE.upload_json_string(bucket_name, summary_file_blob_name, summary_text)
|
813 |
print("summary已上传到GCS")
|
814 |
else:
|
815 |
# summary已存在,下载内容
|
816 |
print("summary已存在于GCS中")
|
817 |
+
summary_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
818 |
summary_json = json.loads(summary_text)
|
819 |
|
820 |
elif source == "drive":
|
|
|
932 |
if not is_questions_exists:
|
933 |
questions = generate_questions(df_string)
|
934 |
questions_text = json.dumps(questions, ensure_ascii=False, indent=2)
|
935 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
|
936 |
print("questions已上傳到GCS")
|
937 |
else:
|
938 |
# 逐字稿已存在,下载逐字稿内容
|
939 |
print("questions已存在于GCS中")
|
940 |
+
questions_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
941 |
questions = json.loads(questions_text)
|
942 |
|
943 |
elif source == "drive":
|
|
|
1023 |
if not is_questions_answers_exists:
|
1024 |
questions_answers = generate_questions_answers(df_string)
|
1025 |
questions_answers_text = json.dumps(questions_answers, ensure_ascii=False, indent=2)
|
1026 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
|
1027 |
print("questions_answers已上傳到GCS")
|
1028 |
else:
|
1029 |
# questions_answers已存在,下载内容
|
1030 |
print("questions_answers已存在于GCS中")
|
1031 |
+
questions_answers_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
1032 |
questions_answers = json.loads(questions_answers_text)
|
1033 |
except:
|
1034 |
questions = get_questions(video_id, df_string, source)
|
|
|
1122 |
key_moments = generate_key_moments(formatted_simple_transcript, formatted_transcript)
|
1123 |
key_moments_json = {"key_moments": key_moments}
|
1124 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1125 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
1126 |
print("key_moments已上傳到GCS")
|
1127 |
else:
|
1128 |
# key_moments已存在,下载内容
|
1129 |
print("key_moments已存在于GCS中")
|
1130 |
+
key_moments_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
1131 |
key_moments_json = json.loads(key_moments_text)
|
1132 |
# 檢查 key_moments 是否有 keywords
|
1133 |
print("===檢查 key_moments 是否有 keywords===")
|
|
|
1142 |
has_keywords_added = True
|
1143 |
if has_keywords_added:
|
1144 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1145 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
1146 |
+
key_moments_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
1147 |
key_moments_json = json.loads(key_moments_text)
|
1148 |
|
1149 |
elif source == "drive":
|
|
|
1465 |
# 检查 file 是否存在
|
1466 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1467 |
if is_file_exists:
|
1468 |
+
content = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
1469 |
content_json = json.loads(content)
|
1470 |
if kind == "reading_passage_latex":
|
1471 |
content_text = content_json["reading_passage"]
|
|
|
1489 |
# 检查 file 是否存在
|
1490 |
is_file_exists = GCS_SERVICE.check_file_exists(bucket_name, blob_name)
|
1491 |
if is_file_exists:
|
1492 |
+
GCS_SERVICE.delete_blob(bucket_name, blob_name)
|
1493 |
print(f"{file_name}已从GCS中删除")
|
1494 |
return gr.update(value="", interactive=False)
|
1495 |
|
|
|
1505 |
print(new_content)
|
1506 |
reading_passage_json = {"reading_passage": str(new_content)}
|
1507 |
reading_passage_text = json.dumps(reading_passage_json, ensure_ascii=False, indent=2)
|
1508 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, reading_passage_text)
|
1509 |
updated_content = new_content
|
1510 |
elif kind == "summary_markdown":
|
1511 |
summary_json = {"summary": str(new_content)}
|
1512 |
summary_text = json.dumps(summary_json, ensure_ascii=False, indent=2)
|
1513 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, summary_text)
|
1514 |
updated_content = new_content
|
1515 |
elif kind == "mind_map":
|
1516 |
mind_map_json = {"mind_map": str(new_content)}
|
1517 |
mind_map_text = json.dumps(mind_map_json, ensure_ascii=False, indent=2)
|
1518 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, mind_map_text)
|
1519 |
updated_content = mind_map_text
|
1520 |
elif kind == "key_moments":
|
1521 |
# from update_LLM_btn -> new_content is a string
|
|
|
1526 |
key_moments_list = new_content
|
1527 |
key_moments_json = {"key_moments": key_moments_list}
|
1528 |
key_moments_text = json.dumps(key_moments_json, ensure_ascii=False, indent=2)
|
1529 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, key_moments_text)
|
1530 |
updated_content = key_moments_text
|
1531 |
elif kind == "transcript":
|
1532 |
if isinstance(new_content, str):
|
|
|
1534 |
else:
|
1535 |
transcript_json = new_content
|
1536 |
transcript_text = json.dumps(transcript_json, ensure_ascii=False, indent=2)
|
1537 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, transcript_text)
|
1538 |
updated_content = transcript_text
|
1539 |
elif kind == "questions":
|
1540 |
# from update_LLM_btn -> new_content is a string
|
|
|
1544 |
else:
|
1545 |
questions_json = new_content
|
1546 |
questions_text = json.dumps(questions_json, ensure_ascii=False, indent=2)
|
1547 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_text)
|
1548 |
updated_content = questions_text
|
1549 |
elif kind == "questions_answers":
|
1550 |
# from update_LLM_btn -> new_content is a string
|
|
|
1554 |
else:
|
1555 |
questions_answers_json = new_content
|
1556 |
questions_answers_text = json.dumps(questions_answers_json, ensure_ascii=False, indent=2)
|
1557 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, questions_answers_text)
|
1558 |
updated_content = questions_answers_text
|
1559 |
|
1560 |
print(f"{kind} 已更新到GCS")
|
|
|
1621 |
|
1622 |
# 逐字稿已存在,下载逐字稿内容
|
1623 |
print("reading_passage 已存在于GCS中,轉換 Latex 模式")
|
1624 |
+
reading_passage_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
1625 |
reading_passage_json = json.loads(reading_passage_text)
|
1626 |
original_reading_passage = reading_passage_json["reading_passage"]
|
1627 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
|
|
1654 |
# 另存為 reading_passage_latex.json
|
1655 |
new_file_name = f'{video_id}_reading_passage_latex.json'
|
1656 |
new_blob_name = f"{video_id}/{new_file_name}"
|
1657 |
+
GCS_SERVICE.upload_json_string(bucket_name, new_blob_name, reading_passage_text)
|
1658 |
|
1659 |
return new_reading_passage
|
1660 |
|
|
|
1674 |
|
1675 |
# 逐字稿已存在,下载逐字稿内容
|
1676 |
print("summary 已存在于GCS中,轉換 Markdown 模式")
|
1677 |
+
summary_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
1678 |
summary_json = json.loads(summary_text)
|
1679 |
original_summary = summary_json["summary"]
|
1680 |
sys_content = "你是一個擅長資料分析跟影片教學的老師,user 為學生,請精讀資料文本,自行判斷資料的種類,使用 zh-TW"
|
|
|
1723 |
# 另存為 summary_markdown.json
|
1724 |
new_file_name = f'{video_id}_summary_markdown.json'
|
1725 |
new_blob_name = f"{video_id}/{new_file_name}"
|
1726 |
+
GCS_SERVICE.upload_json_string(bucket_name, new_blob_name, summary_text)
|
1727 |
|
1728 |
return new_summary
|
1729 |
|
|
|
1747 |
else:
|
1748 |
# meta_data已存在,下载内容
|
1749 |
print("meta_data已存在于GCS中")
|
1750 |
+
meta_data_text = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
1751 |
meta_data_json = json.loads(meta_data_text)
|
1752 |
|
1753 |
# meta_data_json grade 數字轉換成文字
|
|
|
1785 |
# 先建立一個 ai_content_list.json
|
1786 |
ai_content_list = []
|
1787 |
ai_content_text = json.dumps(ai_content_list, ensure_ascii=False, indent=2)
|
1788 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, ai_content_text)
|
1789 |
print("ai_content_list [] 已上傳到GCS")
|
1790 |
|
1791 |
# 此時 ai_content_list 已存在
|
1792 |
+
ai_content_list_string = GCS_SERVICE.download_as_string(bucket_name, blob_name)
|
1793 |
ai_content_list = json.loads(ai_content_list_string)
|
1794 |
# by key 找到 ai_content (topic, grade, level, specific_feature, content_type)
|
1795 |
target_kvs = {
|
|
|
1816 |
|
1817 |
ai_content_list.append(ai_content_json)
|
1818 |
ai_content_text = json.dumps(ai_content_list, ensure_ascii=False, indent=2)
|
1819 |
+
GCS_SERVICE.upload_json_string(bucket_name, blob_name, ai_content_text)
|
1820 |
print("ai_content已上傳到GCS")
|
1821 |
else:
|
1822 |
ai_content_json = ai_content_json[-1]
|