wony617 committed · Commit 159b6fa · 1 Parent: 9e33f2c
Add PR duplication check

Browse files:
- README.md +1 -1
- agent/handler.py +35 -13
- agent/workflow.py +21 -6
- app.py +25 -7
- translation_result/docs/source/en/accelerator_selection.md +13 -13
- translator/content.py +14 -3
- translator/retriever.py +39 -0
README.md CHANGED
@@ -54,7 +54,7 @@ This project was specifically created to solve [Hugging Face Transformers Issue
 
 ## 🎥 Demo Video
 
-[
+[Hugging Face i18n Agent Demo](https://youtu.be/J2MBMNk7la8?si=7867ztaU2nPN0UEo)
 
 *Watch the complete walkthrough: from setup to PR creation in under 5 minutes*
 
agent/handler.py CHANGED
@@ -8,6 +8,7 @@ import gradio as gr
 
 from agent.workflow import (
     report_translation_target_files,
+    report_in_translation_status_files,
     translate_docs_interactive,
     generate_github_pr,
 )
@@ -70,22 +71,29 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
     state.step = "find_files"
 
     status_report, files_list = report_translation_target_files(lang, k)
-
+    in_progress_status_report, in_progress_docs = report_in_translation_status_files(
+        lang
+    )
+    state.files_to_translate = (
+        [file[0] for file in files_list if file[0] not in in_progress_docs]
+        if files_list
+        else []
+    )
 
     response = f"""**✅ File search completed!**
 
 **Status Report:**
 {status_report}
-
+{in_progress_status_report}
 **📁 Found first {len(state.files_to_translate)} files to translate:**
 """
 
     if state.files_to_translate:
-        for i, file in enumerate(state.files_to_translate):
+        for i, file in enumerate(state.files_to_translate, 1):
             response += f"\n{i}. `{file}`"
 
-        if len(state.files_to_translate) > 5:
-            response += f"\n... and {len(state.files_to_translate) - 5} more files"
+        # if len(state.files_to_translate) > 5:
+        #     response += f"\n... and {len(state.files_to_translate) - 5} more files"
 
         response += "\n\n**🚀 Ready to start translation?**\nI can begin translating these files one by one. Would you like to proceed?"
     else:
@@ -96,7 +104,18 @@ def process_file_search_handler(lang: str, k: int, history: list) -> tuple:
     cleared_input = ""
     selected_tab = 1 if state.files_to_translate else 0
 
-    return history, cleared_input, update_status(), gr.Tabs(selected=selected_tab)
+    # Also return the file list to use as the dropdown choices
+    return (
+        history,
+        cleared_input,
+        update_status(),
+        gr.Tabs(selected=selected_tab),
+        update_dropdown_choices(state.files_to_translate),
+    )
+
+
+def update_dropdown_choices(file_list):
+    return gr.update(choices=file_list, value=None)
 
 
 def start_translation_process():
@@ -124,18 +143,19 @@ def start_translation_process():
         original_file_link = (
             "https://github.com/huggingface/transformers/blob/main/" + current_file
         )
+        print("Completed translation:\n")
+        print(translated)
+        print("----------------------------")
         response = (
-            f"""🔄 Translation for: `{current_file}
+            f"""🔄 Translation for: `{current_file}`\n"""
             "**📄 Original Content Link:**\n"
             ""
             f"{original_file_link}\n"
             "**🌐 Translated Content:**\n"
-            f"\n```\n\n{_extract_content_for_display(translated)}
-            f"{status}\n"
+            f"\n```\n\n{_extract_content_for_display(translated)}\n```"
+            # f"{status}\n"
+            # "✅ Translation completed. The code block will be added when generating PR."
         )
-        print("translated:")
-        print(translated)
-        print("extracted")
 
     except Exception as e:
         response = f"❌ Translation failed: {str(e)}"
@@ -294,8 +314,10 @@ def send_message(message, history):
 
 
 # Button handlers with tab switching
-def start_translate_handler(history, anthropic_key):
+def start_translate_handler(history, anthropic_key, file_to_translate):
     os.environ["ANTHROPIC_API_KEY"] = anthropic_key
+
+    state.files_to_translate = [file_to_translate]
     new_hist, cleared_input = handle_user_message("start translation", history)
     selected_tabs = 2 if state.current_file_content["translated"] else 0
     return new_hist, cleared_input, update_status(), gr.Tabs(selected=selected_tabs)
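Note: the net effect of the handler change above is a set difference between the candidate files and the docs that already have an open PR. A minimal standalone sketch of that filtering step, using made-up paths rather than real repository data:

```python
# files_list mirrors report_translation_target_files' return shape: [[path], ...].
# in_progress_docs is the flat list returned by report_in_translation_status_files.
files_list = [
    ["docs/source/en/accelerator_selection.md"],
    ["docs/source/en/trainer.md"],
]
in_progress_docs = ["docs/source/en/trainer.md"]

# Same expression as in process_file_search_handler: skip docs already in flight.
files_to_translate = (
    [file[0] for file in files_list if file[0] not in in_progress_docs]
    if files_list
    else []
)
print(files_to_translate)  # ['docs/source/en/accelerator_selection.md']
```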
agent/workflow.py CHANGED
@@ -11,7 +11,7 @@ from translator.content import (
     llm_translate,
     preprocess_content,
 )
-from translator.retriever import report
+from translator.retriever import report, get_github_issue_open_pr
 
 # GitHub PR Agent import
 try:
@@ -38,6 +38,19 @@ def report_translation_target_files(
     return status_report, [[file] for file in filepath_list]
 
 
+def report_in_translation_status_files(translate_lang: str) -> tuple[str, list[str]]:
+    docs, pr_info_list = get_github_issue_open_pr(translate_lang)
+
+    status_report = ""
+    if docs:
+        status_report = f"""\n🤖 Found {len(docs)} docs already in progress for translation.
+"""
+        for i, file in enumerate(docs):
+            status_report += f"\n{i+1}. `{file}`: {pr_info_list[i]}"
+        status_report += "\n"
+    return status_report, docs
+
+
 def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
     """Translate documentation."""
     # step 1. Get content from file path
@@ -49,13 +62,17 @@ def translate_docs(lang: str, file_path: str) -> tuple[str, str]:
     translation_lang = "Korean"
     to_translate_with_prompt = get_full_prompt(translation_lang, to_translate)
 
+    print("to_translate_with_prompt:\n", to_translate_with_prompt)
+
     # step 3. Translate with LLM
     # TODO: hand this part off to the MCP client
     callback_result, translated_content = llm_translate(to_translate_with_prompt)
-
+    print("translated_content:\n")
+    print(translated_content)
     # step 4. Add scaffold to translation result
     translated_doc = fill_scaffold(content, to_translate, translated_content)
-
+    print("translated_doc:\n")
+    print(translated_doc)
     return callback_result, translated_doc
 
 
@@ -149,9 +166,7 @@ def generate_github_pr(
     print(f" 📁 File: {filepath}")
     print(f" 🌍 Language: {target_language}")
     print(f" 📊 Reference PR: {github_config['reference_pr_url']}")
-    print(
-        f" 🏠 Repository: {github_config['owner']}/{github_config['repo_name']}"
-    )
+    print(f" 🏠 Repository: {github_config['owner']}/{github_config['repo_name']}")
 
     agent = GitHubPRAgent()
     result = agent.run_translation_pr_workflow(
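Note: a short usage sketch of the new status helper (it makes a live GitHub API call, and only "ko" is wired up; the printed values are illustrative):

```python
from agent.workflow import report_in_translation_status_files

# Returns a human-readable summary plus the list of docs that already have
# an open "🌐 [i18n-KO]" PR, which the handler uses to skip duplicate work.
status_report, in_progress_docs = report_in_translation_status_files("ko")
print(status_report)
print(in_progress_docs)  # e.g. ['docs/source/en/trainer.md', ...]
```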
app.py CHANGED
@@ -44,6 +44,8 @@ css = """
     backdrop-filter: blur(8px);
     border: 1px solid rgba(255,255,180,0.25);
     width: 100%;
+    overflow: visible !important;
+
 }
 .status-card {
     width: 100%
@@ -91,7 +93,6 @@ css = """
 with gr.Blocks(
     css=css, title=" 🌐 Hugging Face Transformers Docs i18n made easy"
 ) as demo:
-
     # Title
     with open("images/hfkr_logo.png", "rb") as img_file:
         base64_img = base64.b64encode(img_file.read()).decode()
@@ -122,16 +123,15 @@ with gr.Blocks(
     with gr.Tabs(elem_classes="simple-tabs") as control_tabs:
         with gr.TabItem("1. Find Files", id=0):
             with gr.Group():
-                lang_dropdown = gr.Dropdown(
+                lang_dropdown = gr.Radio(
                     choices=[language.value for language in Languages],
                     label="🌍 Translate To",
                     value="ko",
                 )
                 k_input = gr.Number(
                     label="📊 First k missing translated docs",
-                    value=
+                    value=10,
                     minimum=1,
-                    maximum=100,
                 )
                 find_btn = gr.Button(
                     "🔍 Find Files to Translate",
@@ -140,6 +140,17 @@ with gr.Blocks(
 
         with gr.TabItem("2. Translate", id=1):
             with gr.Group():
+                files_to_translate = gr.Radio(
+                    choices=[],
+                    label="📄 Select a file to translate",
+                    interactive=True,
+                    value=[],
+                )
+                file_to_translate_input = gr.Textbox(
+                    label="🌍 Select in the dropdown or write the file path to translate",
+                    value="",
+                )
+
                 translate_lang_display = gr.Dropdown(
                     choices=[language.value for language in Languages],
                     label="🌍 Translation Language",
@@ -186,7 +197,7 @@ with gr.Blocks(
 
     # Chat Controller
     with gr.Column(elem_classes=["control-panel"]):
-        gr.Markdown("### 💬 Chat with agent")
+        gr.Markdown("### 💬 Chat with agent (Only simple chat is available)")
         msg_input = gr.Textbox(
             placeholder="Type your message here... (e.g. 'what', 'how', or 'help')",
             container=False,
@@ -199,7 +210,7 @@ with gr.Blocks(
     find_btn.click(
         fn=process_file_search_handler,
         inputs=[lang_dropdown, k_input, chatbot],
-        outputs=[chatbot, msg_input, status_display, control_tabs],
+        outputs=[chatbot, msg_input, status_display, control_tabs, files_to_translate],
     )
 
     # Sync language across tabs
@@ -209,10 +220,17 @@ with gr.Blocks(
         outputs=[translate_lang_display],
    )
 
+    # Mirror the radio selection into the file path textbox
+    files_to_translate.change(
+        fn=lambda x: x,
+        inputs=[files_to_translate],
+        outputs=[file_to_translate_input],
+    )
+
     # Button event handlers
     start_translate_btn.click(
         fn=start_translate_handler,
-        inputs=[chatbot, anthropic_key],
+        inputs=[chatbot, anthropic_key, file_to_translate_input],
         outputs=[chatbot, msg_input, status_display, control_tabs],
     )
 
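Note: the "2. Translate" radio starts with empty choices and is populated at runtime through gr.update. A minimal sketch of that pattern outside the app (assuming a recent Gradio where event handlers may return gr.update; the populate button is a hypothetical stand-in for the real find_btn):

```python
import gradio as gr

def update_dropdown_choices(file_list):
    # Swap the component's choices in place and clear any prior selection.
    return gr.update(choices=file_list, value=None)

with gr.Blocks() as demo:
    files_to_translate = gr.Radio(choices=[], label="📄 Select a file to translate")
    file_to_translate_input = gr.Textbox(label="File path to translate")
    populate_btn = gr.Button("Populate")

    # Hypothetical search result; the app feeds state.files_to_translate here.
    populate_btn.click(
        fn=lambda: update_dropdown_choices(["docs/source/en/trainer.md"]),
        outputs=[files_to_translate],
    )
    # Mirror the radio selection into the textbox, as app.py wires it.
    files_to_translate.change(
        fn=lambda x: x,
        inputs=[files_to_translate],
        outputs=[file_to_translate_input],
    )

# demo.launch()
```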
translation_result/docs/source/en/accelerator_selection.md CHANGED
@@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
 
 # 가속기 선택 [[accelerator-selection]]
 
-분산
+분산 학습 중에는 사용할 가속기(CUDA, XPU, MPS, HPU 등)의 수와 순서를 지정할 수 있습니다. 이는 서로 다른 컴퓨팅 성능을 가진 가속기가 있을 때 더 빠른 가속기를 먼저 사용하고 싶은 경우에 유용할 수 있습니다. 또는 사용 가능한 가속기의 일부만 사용할 수도 있습니다. 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) 모두에서 작동합니다. Accelerate나 [DeepSpeed integration](./main_classes/deepspeed)는 필요하지 않습니다.
 
 이 가이드는 사용할 가속기의 수와 사용 순서를 선택하는 방법을 보여줍니다.
 
@@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.
 <hfoptions id="select-accelerator">
 <hfoption id="torchrun">
 
-`--nproc_per_node`를 사용하여 사용할 가속기 수를
+`--nproc_per_node`를 사용하여 사용할 가속기 수를 선택합니다.
 
 ```bash
 torchrun --nproc_per_node=2 trainer-program.py ...
@@ -36,7 +36,7 @@ torchrun --nproc_per_node=2 trainer-program.py ...
 </hfoption>
 <hfoption id="Accelerate">
 
-`--num_processes`를 사용하여 사용할 가속기 수를
+`--num_processes`를 사용하여 사용할 가속기 수를 선택합니다.
 
 ```bash
 accelerate launch --num_processes 2 trainer-program.py ...
@@ -45,7 +45,7 @@ accelerate launch --num_processes 2 trainer-program.py ...
 </hfoption>
 <hfoption id="DeepSpeed">
 
-`--num_gpus`를 사용하여 사용할 GPU 수를
+`--num_gpus`를 사용하여 사용할 GPU 수를 선택합니다.
 
 ```bash
 deepspeed --num_gpus 2 trainer-program.py ...
@@ -55,7 +55,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
 </hfoptions>
 
 ## 가속기 순서 [[order-of-accelerators]]
-사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 각
+사용할 특정 가속기와 그 순서를 선택하려면 하드웨어에 적합한 환경 변수를 사용하세요. 이는 종종 각 실행에 대해 명령줄에서 설정되지만, `~/.bashrc`나 다른 시작 구성 파일에 추가할 수도 있습니다.
 
 예를 들어, 4개의 가속기(0, 1, 2, 3)가 있고 가속기 0과 2만 실행하고 싶다면:
 
@@ -66,7 +66,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
 CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
 ```
 
-GPU 0과 2만 PyTorch
+GPU 0과 2만 PyTorch에서 "보이며" 각각 `cuda:0`과 `cuda:1`로 매핑됩니다.
 순서를 바꾸려면 (GPU 2를 `cuda:0`으로, GPU 0을 `cuda:1`로 사용):
 
 
@@ -80,15 +80,15 @@ GPU 없이 실행하려면:
 CUDA_VISIBLE_DEVICES= python trainer-program.py ...
 ```
 
-`CUDA_DEVICE_ORDER`를 사용하여 CUDA
+`CUDA_DEVICE_ORDER`를 사용하여 CUDA 장치의 순서를 제어할 수도 있습니다:
 
-- PCIe 버스 ID
+- PCIe 버스 ID 순서 (`nvidia-smi`와 일치):
 
 ```bash
 $hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
 ```
 
--
+- 컴퓨팅 성능 순서 (가장 빠른 것부터):
 
 ```bash
 export CUDA_DEVICE_ORDER=FASTEST_FIRST
@@ -101,7 +101,7 @@ $hf_i18n_placeholder21export CUDA_DEVICE_ORDER=PCI_BUS_ID
 ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
 ```
 
-XPU 0과 2만 PyTorch
+XPU 0과 2만 PyTorch에서 "보이며" 각각 `xpu:0`과 `xpu:1`로 매핑됩니다.
 순서를 바꾸려면 (XPU 2를 `xpu:0`으로, XPU 0을 `xpu:1`로 사용):
 
 ```bash
@@ -109,13 +109,13 @@ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
 ```
 
 
-
+다음을 사용하여 Intel XPU의 순서를 제어할 수도 있습니다:
 
 ```bash
 export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
 ```
 
-Intel XPU
+Intel XPU에서의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) 문서를 참조하세요.
 
 </hfoption>
 </hfoptions>
@@ -123,5 +123,5 @@ Intel XPU의 장치 열거 및 정렬에 대한 자세한 정보는 [Level Zero]
 
 
 > [!WARNING]
-> 환경 변수는 명령줄에 추가하는 대신
+> 환경 변수는 명령줄에 추가하는 대신 내보낼 수 있습니다. 환경 변수가 어떻게 설정되었는지 잊어버리고 잘못된 가속기를 사용하게 될 수 있어 혼란을 야기할 수 있으므로 권장하지 않습니다. 대신, 같은 명령줄에서 특정 훈련 실행을 위해 환경 변수를 설정하는 것이 일반적인 관례입니다.
 ```
translator/content.py CHANGED
@@ -5,6 +5,8 @@ import requests
 from langchain.callbacks import get_openai_callback
 from langchain_anthropic import ChatAnthropic
 
+from translator.prompt_glossary import PROMPT_WITH_GLOSSARY
+
 
 def get_content(filepath: str) -> str:
     url = string.Template(
@@ -38,10 +40,11 @@ def get_full_prompt(language: str, to_translate: str) -> str:
         "What do these sentences about Hugging Face Transformers "
         "(a machine learning library) mean in $language? "
         "Please do not translate the word after a 🤗 emoji "
-        "as it is a product name. Output
-        "
+        "as it is a product name. Output the complete markdown file, with prose translated and all other content intact. "
+        "No explanations or extras; only the translated markdown."
+        "\n\n```md"
     ).safe_substitute(language=language)
-    return "\n".join([prompt, to_translate.strip(), "```"])
+    return "\n".join([prompt, to_translate.strip(), "```", PROMPT_WITH_GLOSSARY])
 
 
 def split_markdown_sections(markdown: str) -> list:
@@ -64,15 +67,23 @@ def make_scaffold(content: str, to_translate: str) -> string.Template:
     scaffold = content
     for i, text in enumerate(to_translate.split("\n\n")):
         scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
+    print("inner scaffold:")
+    print(scaffold)
     return string.Template(scaffold)
 
 
 def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
     scaffold = make_scaffold(content, to_translate)
+    print("scaffold:")
+    print(scaffold.template)
     divided = split_markdown_sections(to_translate)
+    print("divided:")
+    print(divided)
     anchors = get_anchors(divided)
 
     translated = split_markdown_sections(translated)
+    print("translated:")
+    print(translated)
 
     translated[1::3] = [
         f"{korean_title} {anchors[i]}"
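Note: make_scaffold and fill_scaffold lean on string.Template placeholders to protect non-prose content during translation. A toy round-trip under that scheme (toy strings; the substitution step is a plausible reconstruction, since fill_scaffold's tail is not shown in this diff):

```python
import string

content = "# Title\n\nHello world.\n\nAnother paragraph."
to_translate = "Hello world.\n\nAnother paragraph."

# make_scaffold's core: replace each translatable chunk with an indexed placeholder.
scaffold = content
for i, text in enumerate(to_translate.split("\n\n")):
    scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
template = string.Template(scaffold)
print(template.template)
# -> "# Title\n\n$hf_i18n_placeholder0\n\n$hf_i18n_placeholder1"

# Plausible fill step: substitute translated chunks back into the scaffold.
translated_chunks = ["안녕하세요 세계.", "또 다른 단락."]
filled = template.safe_substitute(
    {f"hf_i18n_placeholder{i}": t for i, t in enumerate(translated_chunks)}
)
print(filled)
# -> "# Title\n\n안녕하세요 세계.\n\n또 다른 단락."
```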
translator/retriever.py CHANGED
@@ -1,3 +1,4 @@
+import re
 import os
 from pathlib import Path
 
@@ -25,6 +26,44 @@ def get_github_repo_files():
     return file_paths
 
 
+def get_github_issue_open_pr(lang: str = "ko"):
+    """
+    Get open PRs for the tracking GitHub issue, filtered by titles starting with '🌐 [i18n-KO]'.
+    """
+    if lang == "ko":
+        issue_id = "20179"
+    else:
+        raise ValueError(
+            "No GitHub issue has been registered to the server. (Only 'ko' is supported - please contact us to support this.)"
+        )
+
+    url = "https://api.github.com/repos/huggingface/transformers/pulls?state=open"
+
+    headers = {
+        "Accept": "application/vnd.github+json",
+    }
+    response = requests.get(url, headers=headers)
+
+    if response.status_code != 200:
+        raise Exception(f"GitHub API error: {response.status_code} {response.text}")
+
+    open_prs = response.json()
+    filtered_prs = [pr for pr in open_prs if pr["title"].startswith("🌐 [i18n-KO]")]
+
+    pattern = re.compile(r"`([^`]+\.md)`")
+
+    filenames = [
+        "docs/source/en/" + match.group(1)
+        for pr in filtered_prs
+        if (match := pattern.search(pr["title"]))
+    ]
+    pr_info_list = [
+        f"https://github.com/huggingface/transformers/pull/{pr['url'].rstrip('/').split('/')[-1]}"
+        for pr in filtered_prs
+    ]
+    return filenames, pr_info_list
+
+
 def retrieve(summary: Summary, table_size: int = 10) -> tuple[str, list[str]]:
     """
     Retrieve missing docs
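Note: a usage sketch of the new retriever helper (live, unauthenticated GitHub API call, so only the first page of open PRs is returned; the example output is illustrative):

```python
from translator.retriever import get_github_issue_open_pr

# Filters open transformers PRs titled like "🌐 [i18n-KO] Translated `some_doc.md` ..."
# and maps each back to its English source path plus the PR URL.
filenames, pr_info_list = get_github_issue_open_pr("ko")
for path, pr_url in zip(filenames, pr_info_list):
    print(f"{path} -> {pr_url}")
# e.g. docs/source/en/trainer.md -> https://github.com/huggingface/transformers/pull/12345
```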