Spaces:

JunyiAcademy
/

vaitor2

Running

App Files Community

youngtsai committed on Jun 17, 2024

Commit

b96d724

1 Parent(s): d4bec48

update

Browse files

Files changed (3) hide show

README.md +1 -1
app.py +64 -28
requirements.txt +4 -2

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 📚
 colorFrom: red
 colorTo: blue
 sdk: gradio
-sdk_version: 4.8.0
 app_file: app.py
 pinned: false
 ---

 colorFrom: red
 colorTo: blue
 sdk: gradio
+sdk_version: 4.36.0
 app_file: app.py
 pinned: false
 ---

app.py CHANGED Viewed

@@ -34,6 +34,10 @@ from googleapiclient.http import MediaIoBaseUpload
 from educational_material import EducationalMaterial
 from storage_service import GoogleCloudStorage
 import boto3
@@ -92,6 +96,19 @@ TRANSCRIPTS = []
 CURRENT_INDEX = 0
 CHAT_LIMIT = 5
 # CLIENTS CONFIG
 GBQ_CLIENT = bigquery.Client.from_service_account_info(json.loads(GBQ_KEY))
 GROQ_CLIENT = Groq(api_key=GROQ_API_KEY)
@@ -716,18 +733,17 @@ def split_data(df_string, word_base=100000):
         start_idx = i * part_size
         end_idx = min((i + 1) * part_size, len(data))
         # Serialize the segment back to a JSON string
-        segment = json.dumps(data[start_idx:end_idx])
         segments.append(segment)
     return segments
 def generate_content_by_open_ai(sys_content, user_content, response_format=None, model_name=None):
-    print("LLM using OPEN AI")
     if model_name == "gpt-4-turbo":
         model = "gpt-4-turbo"
     else:
         model = "gpt-4o"
-    print(f"model: {model}")
     messages = [
         {"role": "system", "content": sys_content},
@@ -770,16 +786,29 @@ def generate_content_by_open_ai(sys_content, user_content, response_format=None,
 #     content = response_body.get('content')[0].get('text')
 #     return content
 def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None, model_name=None):
     # 使用 OpenAI 生成基于上传数据的问题
-    # if LLM_model == "anthropic-claude-3-sonnet":
     #     print(f"LLM: {LLM_model}")
     #     content = generate_content_by_bedrock(sys_content, user_content)
-    # else:
-    print(f"LLM: {LLM_model}")
-    print(f"model_name: {model_name}")
-    content = generate_content_by_open_ai(sys_content, user_content, response_format, model_name=model_name)
     print("=====content=====")
     print(content)
@@ -830,30 +859,36 @@ def get_reading_passage(video_id, df_string, source, LLM_model=None):
     return reading_passage_json
 def generate_reading_passage(df_string, LLM_model=None):
-    print("===generate_reading_passage===")
     segments = split_data(df_string, word_base=100000)
     all_content = []
     model_name = "gpt-4-turbo"
     # model_name = "gpt-4o"
     for segment in segments:
-        sys_content = "你是一個擅長資料分析跟影片教學的老師，user 為學生，請精讀資料文本，自行判斷資料的種類，使用 zh-TW"
         user_content = f"""
-          # 文本 {segment}
-          # rules:
-          - 根據文本，抓取重點
-          - 去除人類講課時口語的問答句，重新拆解成文章，建立適合閱讀語句通順的 Reading Passage
-          - 只需要專注提供 Reading Passage，字數在 500 字以內
-          - 敘述中，請把數學或是專業術語，用 Latex 包覆（$...$）
-          - 加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
-          # restrictions:
-          - 請一定要使用繁體中文 zh-TW，這很重要
-          - 產生的結果不要前後文解釋，也不要敘述這篇文章怎麼產生的
-          - 請直接給出文章，不用介紹怎麼處理的或是文章字數等等
-          - 字數在 500 字以內
         """
         content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model, model_name=model_name)
         all_content.append(content + "\n")
@@ -1330,7 +1365,7 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_
             # 文本：{segment}
             # Rule
-            1. 請根據文本，提取出 5 段重點摘要，並給出對應的時間軸，每一段重點的時間軸範圍大於1分鐘，但小於 1/3 總逐字稿長度
             2. 內容當中，如果有列舉方法、模式或是工具，就用 bulletpoint 或是 編號方式 列出，並在列舉部分的頭尾用[]匡列（example: FAANG 是以下五間公司： [1. A公司 2.B公司 3.C公司 4.D公司 5.E公司 ]，...）
             3. 注意不要遺漏任何一段時間軸的內容 從零秒開始，以這種方式分析整個文本，從零秒開始分析，直到結束。這很重要
             4. 結尾的時間如果有總結性的話，也要擷取
@@ -1342,11 +1377,12 @@ def generate_key_moments(formatted_simple_transcript, formatted_transcript, LLM_
             1. 請一定要用 zh-TW，這非常重要！
             2. 如果是疑似主播、主持人的圖片場景，且沒有任何有用的資訊，請不要選取
             3. 如果頭尾的情節不是重點，特別是打招呼或是介紹自己是誰、或是finally say goodbye 就是不重要的情節，就不用擷取
             Example: retrun JSON
             {{key_moments:[{{
                 "start": "00:00",
-                "end": "01:00",
                 "text": "逐字稿的重點摘要",
                 "keywords": ["關鍵字", "關鍵字"]
                 }}]
@@ -3191,7 +3227,7 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue=gr.themes.colors.orange, seconda
     with gr.Accordion("See Details", open=False) as see_details:
         with gr.Row():
             is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
-            LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4o", "anthropic-claude-3-sonnet"], value="open-ai-gpt-4o", visible=True, interactive=True)
         with gr.Tab("逐字稿本文"):
             with gr.Row() as transcript_admmin:
                 transcript_kind = gr.Textbox(value="transcript", show_label=False)

 from educational_material import EducationalMaterial
 from storage_service import GoogleCloudStorage
+from google.cloud import aiplatform
+from vertexai.preview.generative_models import GenerativeModel
+from google.oauth2.service_account import Credentials
 import boto3
 CURRENT_INDEX = 0
 CHAT_LIMIT = 5
+# Google aiplatform
+google_service_account_info_dict = json.loads(GBQ_KEY)
+GOOGPE_SCOPES = ["https://www.googleapis.com/auth/cloud-platform"]
+google_creds = Credentials.from_service_account_info(
+    google_service_account_info_dict, scopes=GOOGPE_SCOPES
+)
+aiplatform.init(
+    project="junyiacademy",
+    service_account=google_service_account_info_dict,
+    credentials=google_creds,
+)
+GEMINI_MODEL = GenerativeModel("gemini-pro")
 # CLIENTS CONFIG
 GBQ_CLIENT = bigquery.Client.from_service_account_info(json.loads(GBQ_KEY))
 GROQ_CLIENT = Groq(api_key=GROQ_API_KEY)
         start_idx = i * part_size
         end_idx = min((i + 1) * part_size, len(data))
         # Serialize the segment back to a JSON string
+        segment = json.dumps(data[start_idx:end_idx]).encode('utf-8').decode('unicode_escape')
         segments.append(segment)
     return segments
 def generate_content_by_open_ai(sys_content, user_content, response_format=None, model_name=None):
+    print("generate_content_by_open_ai")
     if model_name == "gpt-4-turbo":
         model = "gpt-4-turbo"
     else:
         model = "gpt-4o"
+    print(f"LLM model: {model}")
     messages = [
         {"role": "system", "content": sys_content},
 #     content = response_body.get('content')[0].get('text')
 #     return content
+def generate_content_by_gemini(sys_content, user_content, response_format=None, model_name=None):
+    print("generate_content_by_gemini")
+    print(f"LLM using: {model_name}")
+    model_response = GEMINI_MODEL.generate_content(
+        f"{sys_content}, {user_content}"
+    )
+    content = model_response.candidates[0].content.parts[0].text
+    return content
 def generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=None, model_name=None):
     # 使用 OpenAI 生成基于上传数据的问题
+    if LLM_model == "gemini-pro":
+        print(f"LLM: {LLM_model}")
+        content = generate_content_by_gemini(sys_content, user_content, response_format, model_name=model_name)
+    # elif LLM_model == "anthropic-claude-3-sonnet":
     #     print(f"LLM: {LLM_model}")
     #     content = generate_content_by_bedrock(sys_content, user_content)
+    else:
+        print(f"LLM: {LLM_model}")
+        print(f"model_name: {model_name}")
+        content = generate_content_by_open_ai(sys_content, user_content, response_format, model_name=model_name)
     print("=====content=====")
     print(content)
     return reading_passage_json
 def generate_reading_passage(df_string, LLM_model=None):
+    print("===generate_reading_passage 0===")
+    print(df_string)
     segments = split_data(df_string, word_base=100000)
     all_content = []
     model_name = "gpt-4-turbo"
     # model_name = "gpt-4o"
     for segment in segments:
+        sys_content = "你是一個擅長資料分析跟影片教學的老師，user 為學生，請精讀資料文本，自行判斷資料的種類，使用 zh-TW"
         user_content = f"""
+            # 文本 {segment}
+            # rules:
+            - 根據文本，抓取重點
+            - 去除人類講課時口語的問答句，重新拆解成文章，建立適合閱讀語句通順的 Reading Passage
+            - 只需要專注提供 Reading Passage，字數在 500 字以內
+            - 敘述中，請把數學或是專業術語，用 Latex 包覆（$...$）
+            - 加減乘除、根號、次方等等的運算式口語也換成 LATEX 數學符號
+            # restrictions:
+            - 請一定要使用繁體中文 zh-TW，這很重要
+            - 產生的結果不要前後文解釋，也不要敘述這篇文章怎麼產生的
+            - 請直接給出文章，不用介紹怎麼處理的或是文章字數等等
+            - 字數在 500 字以內
         """
+        print("======user_content 0 ===")
+        print(user_content)
         content = generate_content_by_LLM(sys_content, user_content, response_format=None, LLM_model=LLM_model, model_name=model_name)
         all_content.append(content + "\n")
             # 文本：{segment}
             # Rule
+            1. 請根據文本，提取出 5~8 段重點摘要，並給出對應的時間軸，每一段重點的時間軸範圍大於1分鐘，但小於 1/3 總逐字稿長度
             2. 內容當中，如果有列舉方法、模式或是工具，就用 bulletpoint 或是 編號方式 列出，並在列舉部分的頭尾用[]匡列（example: FAANG 是以下五間公司： [1. A公司 2.B公司 3.C公司 4.D公司 5.E公司 ]，...）
             3. 注意不要遺漏任何一段時間軸的內容 從零秒開始，以這種方式分析整個文本，從零秒開始分析，直到結束。這很重要
             4. 結尾的時間如果有總結性的話，也要擷取
             1. 請一定要用 zh-TW，這非常重要！
             2. 如果是疑似主播、主持人的圖片場景，且沒有任何有用的資訊，請不要選取
             3. 如果頭尾的情節不是重點，特別是打招呼或是介紹自己是誰、或是finally say goodbye 就是不重要的情節，就不用擷取
+            4. 時間軸請取到秒數，不要只取到分鐘數，這很重要
             Example: retrun JSON
             {{key_moments:[{{
                 "start": "00:00",
+                "end": "01:35",
                 "text": "逐字稿的重點摘要",
                 "keywords": ["關鍵字", "關鍵字"]
                 }}]
     with gr.Accordion("See Details", open=False) as see_details:
         with gr.Row():
             is_env_prod = gr.Checkbox(value=False, label="is_env_prod")
+            LLM_model = gr.Dropdown(label="LLM Model", choices=["open-ai-gpt-4o", "anthropic-claude-3-sonnet", "gemini-pro"], value="open-ai-gpt-4o", visible=True, interactive=True)
         with gr.Tab("逐字稿本文"):
             with gr.Row() as transcript_admmin:
                 transcript_kind = gr.Textbox(value="transcript", show_label=False)

requirements.txt CHANGED Viewed

@@ -1,4 +1,4 @@
-gradio==4.8.0
 pandas
 openai>=1.16.2
 requests
@@ -12,9 +12,11 @@ google-auth-httplib2
 google-auth-oauthlib
 google-cloud-storage
 google-cloud-bigquery
 groq
 yt_dlp
 uuid
 gtts
 boto3
-pydub

+gradio==4.36.0
 pandas
 openai>=1.16.2
 requests
 google-auth-oauthlib
 google-cloud-storage
 google-cloud-bigquery
+google-cloud-aiplatform
 groq
 yt_dlp
 uuid
 gtts
 boto3
+pydub
+vertexai