import re
import string

import requests
from langchain.callbacks import get_openai_callback
from langchain_anthropic import ChatAnthropic
					
						
def get_content(filepath: str) -> str:
    # Fetch the raw English source file from the main branch of huggingface/transformers.
    url = string.Template(
        "https://raw.githubusercontent.com/huggingface/transformers/main/$filepath"
    ).safe_substitute(filepath=filepath)
    response = requests.get(url)
    if response.status_code == 200:
        content = response.text
        return content
    else:
        raise ValueError("Failed to retrieve content from the URL.", url)
					
						
def preprocess_content(content: str) -> str:
    # Keep only the text that should be translated.
    # Cut everything before the first "#", which skips the license comment at the top of the doc.
    to_translate = content[content.find("#") :]
    # Drop fenced code blocks.
    to_translate = re.sub(r"```.*?```", "", to_translate, flags=re.DOTALL)
    # Drop markdown table rows.
    to_translate = re.sub(r"^\|.*\|$\n?", "", to_translate, flags=re.MULTILINE)
    # Collapse runs of blank lines into a single blank line.
    to_translate = re.sub(r"\n\n+", "\n\n", to_translate)

    return to_translate
					
						
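# Illustrative check of the preprocessing above (assumed example, not from the repository):
#   preprocess_content("<!--Copyright-->\n# Intro\n\n```py\nprint(1)\n```\n\n| a | b |\n\nText.")
# returns
#   "# Intro\n\nText."
# i.e. the license header, the fenced code block, and the table row are dropped,
# and the leftover blank lines are collapsed.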
					
						
def get_full_prompt(language: str, to_translate: str) -> str:
    # The prompt opens a ```md fence; the matching closing fence is appended below.
    prompt = string.Template(
        "What do these sentences about Hugging Face Transformers "
        "(a machine learning library) mean in $language? "
        "Please do not translate the word after a 🤗 emoji "
        "as it is a product name. Output only the translated markdown result "
        "without any explanations or introductions.\n\n```md"
    ).safe_substitute(language=language)
    return "\n".join([prompt, to_translate.strip(), "```"])
					
						
def split_markdown_sections(markdown: str) -> list:
    # Split on ATX headings, capturing the "#" level marker and the title;
    # [1:] drops the chunk before the first heading (empty when the text starts with a heading).
    return re.split(r"^(#+\s+)(.*)$", markdown, flags=re.MULTILINE)[1:]
					
						
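# Illustrative shape of the split result (assumed example, not from the repository):
#   split_markdown_sections("# Intro\nText.\n## Setup\nMore text.\n")
# returns
#   ["# ", "Intro", "\nText.\n", "## ", "Setup", "\nMore text.\n"]
# so [0::3] holds the heading markers, [1::3] the titles, and [2::3] the section
# bodies; get_anchors below walks the [1::3] title slice.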
					
						
def get_anchors(divided: list) -> list:
    anchors = []
    # Build [[...]] anchor tags from the original titles, e.g.
    # "Load pretrained models!" -> "[[load-pretrained-models]]".
    for title in divided[1::3]:
        anchor = re.sub(r"[^a-z0-9\s]+", "", title.lower())
        anchor = re.sub(r"\s{2,}", " ", anchor.strip()).replace(" ", "-")
        anchors.append(f"[[{anchor}]]")
    return anchors
					
						
def make_scaffold(content: str, to_translate: str) -> string.Template:
    # Replace each translatable paragraph in the original document with a numbered
    # $hf_i18n_placeholder{i}, so the translation can be substituted back while code
    # blocks, tables, and other untranslated parts stay untouched.
    scaffold = content
    for i, text in enumerate(to_translate.split("\n\n")):
        scaffold = scaffold.replace(text, f"$hf_i18n_placeholder{i}", 1)
    return string.Template(scaffold)
					
						
def fill_scaffold(content: str, to_translate: str, translated: str) -> str:
    scaffold = make_scaffold(content, to_translate)
    divided = split_markdown_sections(to_translate)
    anchors = get_anchors(divided)

    translated = split_markdown_sections(translated)

    # Re-attach the anchors derived from the original titles to the translated
    # titles so existing links keep resolving.
    translated[1::3] = [
        f"{korean_title} {anchors[i]}"
        for i, korean_title in enumerate(translated[1::3])
    ]
    # Rebuild the translated text and split it into the same paragraph units
    # that the scaffold placeholders were created from.
    translated = "".join(
        ["".join(translated[i * 3 : i * 3 + 3]) for i in range(len(translated) // 3)]
    ).split("\n\n")
    if newlines := scaffold.template.count("$hf_i18n_placeholder") - len(translated):
        return str(
            [
                f"Please {'recover' if newlines > 0 else 'remove'} "
                f"{abs(newlines)} incorrectly inserted double newlines."
            ]
        )

    translated_doc = scaffold.safe_substitute(
        {f"hf_i18n_placeholder{i}": text for i, text in enumerate(translated)}
    )
    return translated_doc
					
						
def llm_translate(to_translate: str) -> tuple[str, str]:
    # Track token usage for the call with LangChain's callback context manager.
    with get_openai_callback() as cb:
        model = ChatAnthropic(
            model="claude-sonnet-4-20250514", max_tokens=64000, streaming=True
        )
        ai_message = model.invoke(to_translate)
    print("cb:", cb)
    return cb, ai_message.content
					
						
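# Minimal usage sketch (not part of the original script): wires the helpers above
# into a single end-to-end run. The file path and target language are assumed
# examples; depending on the model output, the returned markdown may still need
# its ```md fences stripped before being passed to fill_scaffold.
if __name__ == "__main__":
    filepath = "docs/source/en/quicktour.md"  # assumed example path
    language = "Korean"  # assumed target language

    content = get_content(filepath)
    to_translate = preprocess_content(content)
    prompt = get_full_prompt(language, to_translate)
    callback, translated = llm_translate(prompt)
    print(fill_scaffold(content, to_translate, translated))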