tchung1970 Claude committed on
Commit
c97cd22
·
1 Parent(s): 3121e9b

Localize Gradio interface to Korean

Browse files

- Translated all UI text strings from English to Korean
- Updated header title and description
- Localized input labels: video upload, audio description, CFG scale, steps, sample numbers
- Translated button text and status messages
- Updated quick start guide and examples section
- Localized all logging messages and error messages
- Maintained English text prompts as they are expected by the model

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <[email protected]>

Files changed (1) hide show
  1. app.py +56 -56
app.py CHANGED
@@ -26,10 +26,10 @@ MODEL_PATH = os.environ.get("HIFI_FOLEY_MODEL_PATH", "./pretrained_models/")
26
  CONFIG_PATH = "configs/hunyuanvideo-foley-xxl.yaml"
27
 
28
  def download_model_from_hf(repo_id: str = "tencent/HunyuanVideo-Foley", local_dir: str = "./pretrained_models") -> str:
29
- """HuggingFace自动下载模型到本地目录"""
30
  try:
31
- logger.info(f"开始从HuggingFace下载模型:{repo_id}")
32
- logger.info(f"下载目标目录:{local_dir}")
33
 
34
  # 确保本地目录存在
35
  os.makedirs(local_dir, exist_ok=True)
@@ -42,11 +42,11 @@ def download_model_from_hf(repo_id: str = "tencent/HunyuanVideo-Foley", local_di
42
  local_files_only=False, # 允许从网络下载
43
  )
44
 
45
- logger.info(f"✅ 模型下载成功!保存在:{local_dir}")
46
- return f"✅ 模型从 {repo_id} 下载成功!"
47
 
48
  except Exception as e:
49
- error_msg = f"❌ 模型下载失败:{str(e)}"
50
  logger.error(error_msg)
51
  return error_msg
52
 
@@ -72,48 +72,48 @@ def setup_device(device_str: str = "auto", gpu_id: int = 0) -> torch.device:
72
  return device
73
 
74
  def auto_load_models() -> str:
75
- """Automatically load preset models"""
76
  global model_dict, cfg, device
77
 
78
  try:
79
- # 如果模型路径不存在,尝试从HuggingFace下载
80
  if not os.path.exists(MODEL_PATH):
81
- logger.info(f"模型路径 {MODEL_PATH} 不存在,开始从HuggingFace下载...")
82
  download_result = download_model_from_hf(local_dir=MODEL_PATH.rstrip('/'))
83
- if "失败" in download_result:
84
  return download_result
85
 
86
- # 如果配置文件不存在,也尝试从HuggingFace下载
87
  if not os.path.exists(CONFIG_PATH):
88
- logger.info(f"配置文件 {CONFIG_PATH} 不存在,尝试从HuggingFace下载...")
89
- # 如果是从pretrained_models/配置路径,也尝试下载
90
  if CONFIG_PATH.startswith("configs/"):
91
  config_dir = os.path.dirname(CONFIG_PATH)
92
  if not os.path.exists(config_dir):
93
  download_result = download_model_from_hf(local_dir="./")
94
- if "失败" in download_result:
95
  return download_result
96
 
97
- # 最后检查配置文件是否存在
98
  if not os.path.exists(CONFIG_PATH):
99
- return f"❌ 配置文件未找到: {CONFIG_PATH}"
100
 
101
  # Use GPU by default
102
  device = setup_device("auto", 0)
103
 
104
- # Load model
105
- logger.info("正在加载模型...")
106
- logger.info(f"模型路径: {MODEL_PATH}")
107
- logger.info(f"配置路径: {CONFIG_PATH}")
108
 
109
  model_dict, cfg = load_model(MODEL_PATH, CONFIG_PATH, device)
110
 
111
- logger.info("✅ 模型加载成功!")
112
- return "✅ 模型加载成功!"
113
 
114
  except Exception as e:
115
- logger.error(f"模型加载失败: {str(e)}")
116
- return f"❌ 模型加载失败: {str(e)}"
117
 
118
  @spaces.GPU(duration=120)
119
  @torch.inference_mode()
@@ -128,10 +128,10 @@ def infer_single_video(
128
  global model_dict, cfg, device
129
 
130
  if model_dict is None or cfg is None:
131
- return [], "❌ Please load the model first!"
132
 
133
  if video_file is None:
134
- return [], "❌ Please upload a video file!"
135
 
136
  # Allow empty text prompt, use empty string if no prompt provided
137
  if text_prompt is None:
@@ -153,7 +153,7 @@ def infer_single_video(
153
  # Denoising process to generate multiple audio samples
154
  # Note: The model now generates sample_nums audio samples per inference
155
  # The denoise_process function returns audio with shape [batch_size, channels, samples]
156
- logger.info(f"Generating {sample_nums} audio samples...")
157
  audio, sample_rate = denoise_process(
158
  visual_feats,
159
  text_feats,
@@ -180,12 +180,12 @@ def infer_single_video(
180
  merge_audio_video(audio_output, video_file, video_output)
181
  video_outputs.append(video_output)
182
 
183
- logger.info(f"Inference completed! Generated {sample_nums} samples.")
184
- return video_outputs, f"✅ Generated {sample_nums} audio sample(s) successfully!"
185
 
186
  except Exception as e:
187
- logger.error(f"Inference failed: {str(e)}")
188
- return [], f"❌ Inference failed: {str(e)}"
189
 
190
  def update_video_outputs(video_list, status_msg):
191
  """Update video outputs based on the number of generated samples"""
@@ -538,31 +538,31 @@ def create_gradio_interface():
538
  with gr.Column(elem_classes=["main-header"]):
539
  gr.HTML("""
540
  <h1>🎵 HunyuanVideo-Foley</h1>
541
- <p>Text-Video-to-Audio Synthesis: Generate realistic audio from video and text descriptions</p>
542
  """)
543
 
544
  # Usage Guide
545
  with gr.Column(elem_classes=["status-card"]):
546
  gr.Markdown("""
547
- ### 📋 Quick Start Guide
548
- **1.** Upload your video file\t**2.** Add optional text description\t**3.** Adjust sample numbers (1-6)\t**4.** Click Generate Audio
549
 
550
- 💡 For quick start, you can load the prepared examples by clicking the button.
551
  """, elem_classes=["usage-guide"])
552
 
553
  # Main inference interface - Input and Results side by side
554
  with gr.Row(elem_classes=["main-interface"]):
555
  # Input section
556
  with gr.Column(scale=1, elem_classes=["input-section"]):
557
- gr.Markdown("### 📹 Video Input")
558
 
559
  video_input = gr.Video(
560
- label="Upload Video",
561
  height=300
562
  )
563
 
564
  text_input = gr.Textbox(
565
- label="🎯 Audio Description (English)",
566
  placeholder="A person walks on frozen ice",
567
  lines=3,
568
  )
@@ -573,7 +573,7 @@ def create_gradio_interface():
573
  maximum=10.0,
574
  value=4.5,
575
  step=0.1,
576
- label="🎚️ CFG Scale",
577
  )
578
 
579
  inference_steps = gr.Slider(
@@ -581,7 +581,7 @@ def create_gradio_interface():
581
  maximum=100,
582
  value=50,
583
  step=5,
584
- label="⚡ Steps",
585
  )
586
 
587
  sample_nums = gr.Slider(
@@ -589,24 +589,24 @@ def create_gradio_interface():
589
  maximum=6,
590
  value=1,
591
  step=1,
592
- label="🎲 Sample Nums",
593
  )
594
 
595
  generate_btn = gr.Button(
596
- "🎵 Generate Audio",
597
  variant="primary",
598
  elem_classes=["generate-btn"]
599
  )
600
 
601
  # Results section
602
  with gr.Column(scale=1, elem_classes=["output-section"]):
603
- gr.Markdown("### 🎥 Generated Results")
604
 
605
  # Multi-video gallery for displaying multiple generated samples
606
  with gr.Column():
607
  # Primary video (Sample 1)
608
  video_output_1 = gr.Video(
609
- label="Sample 1",
610
  height=250,
611
  visible=True
612
  )
@@ -615,44 +615,44 @@ def create_gradio_interface():
615
  with gr.Row(elem_classes=["additional-samples"]):
616
  with gr.Column(scale=1):
617
  video_output_2 = gr.Video(
618
- label="Sample 2",
619
  height=150,
620
  visible=False
621
  )
622
  video_output_3 = gr.Video(
623
- label="Sample 3",
624
  height=150,
625
  visible=False
626
  )
627
  with gr.Column(scale=1):
628
  video_output_4 = gr.Video(
629
- label="Sample 4",
630
  height=150,
631
  visible=False
632
  )
633
  video_output_5 = gr.Video(
634
- label="Sample 5",
635
  height=150,
636
  visible=False
637
  )
638
 
639
  # Sample 6 - full width
640
  video_output_6 = gr.Video(
641
- label="Sample 6",
642
  height=150,
643
  visible=False
644
  )
645
 
646
  result_text = gr.Textbox(
647
- label="Status",
648
  interactive=False,
649
  lines=2
650
  )
651
 
652
  # Examples section at the bottom
653
  with gr.Column(elem_classes=["examples-section"]):
654
- gr.Markdown("### 🌟 Examples")
655
- gr.Markdown("Click on any example to load it into the interface above")
656
 
657
  # Define your custom examples here - 8 examples total
658
  examples_data = [
@@ -741,7 +741,7 @@ def create_gradio_interface():
741
 
742
  # Load button
743
  example_btn = gr.Button(
744
- f"Load Example {idx+1}",
745
  variant="secondary",
746
  size="sm"
747
  )
@@ -821,7 +821,7 @@ def create_gradio_interface():
821
  # Footer
822
  gr.HTML("""
823
  <div class="footer-text">
824
- <p>🚀 Powered by HunyuanVideo-Foley | Generate high-quality audio from video and text descriptions</p>
825
  </div>
826
  """)
827
 
@@ -839,7 +839,7 @@ if __name__ == "__main__":
839
  logger.add(lambda msg: print(msg, end=''), level="INFO")
840
 
841
  # Auto-load model
842
- logger.info("Starting application and loading model...")
843
  model_load_result = auto_load_models()
844
  logger.info(model_load_result)
845
 
@@ -848,7 +848,7 @@ if __name__ == "__main__":
848
 
849
  # Log completion status
850
  if "successfully" in model_load_result:
851
- logger.info("Application ready, model loaded")
852
 
853
  # Test
854
  app.launch(
 
26
  CONFIG_PATH = "configs/hunyuanvideo-foley-xxl.yaml"
27
 
28
  def download_model_from_hf(repo_id: str = "tencent/HunyuanVideo-Foley", local_dir: str = "./pretrained_models") -> str:
29
+ """HuggingFace에서 모델을 로컬 디렉토리로 자동 다운로드"""
30
  try:
31
+ logger.info(f"HuggingFace에서 모델 다운로드 시작: {repo_id}")
32
+ logger.info(f"다운로드 대상 디렉토리: {local_dir}")
33
 
34
  # 确保本地目录存在
35
  os.makedirs(local_dir, exist_ok=True)
 
42
  local_files_only=False, # 允许从网络下载
43
  )
44
 
45
+ logger.info(f"✅ 모델 다운로드 성공! 저장 위치: {local_dir}")
46
+ return f"✅ {repo_id}에서 모델 다운로드 성공!"
47
 
48
  except Exception as e:
49
+ error_msg = f"❌ 모델 다운로드 실패: {str(e)}"
50
  logger.error(error_msg)
51
  return error_msg
52
 
 
72
  return device
73
 
74
  def auto_load_models() -> str:
75
+ """사전 설정된 모델을 자동으로 로드"""
76
  global model_dict, cfg, device
77
 
78
  try:
79
+ # 모델 경로가 존재하지 않으면 HuggingFace에서 다운로드 시도
80
  if not os.path.exists(MODEL_PATH):
81
+ logger.info(f"모델 경로 {MODEL_PATH}가 존재하지 않아 HuggingFace에서 다운로드 시작...")
82
  download_result = download_model_from_hf(local_dir=MODEL_PATH.rstrip('/'))
83
+ if "실패" in download_result:
84
  return download_result
85
 
86
+ # 구성 파일이 존재하지 않으면 HuggingFace에서 다운로드 시도
87
  if not os.path.exists(CONFIG_PATH):
88
+ logger.info(f"구성 파일 {CONFIG_PATH}가 존재하지 않아 HuggingFace에서 다운로드 시도...")
89
+ # pretrained_models/ 구성 경로인 경우 다운로드 시도
90
  if CONFIG_PATH.startswith("configs/"):
91
  config_dir = os.path.dirname(CONFIG_PATH)
92
  if not os.path.exists(config_dir):
93
  download_result = download_model_from_hf(local_dir="./")
94
+ if "실패" in download_result:
95
  return download_result
96
 
97
+ # 구성 파일 존재 여부 최종 확인
98
  if not os.path.exists(CONFIG_PATH):
99
+ return f"❌ 구성 파일을 찾을 수 없음: {CONFIG_PATH}"
100
 
101
  # Use GPU by default
102
  device = setup_device("auto", 0)
103
 
104
+ # 모델 로드
105
+ logger.info("모델 로딩 중...")
106
+ logger.info(f"모델 경로: {MODEL_PATH}")
107
+ logger.info(f"구성 경로: {CONFIG_PATH}")
108
 
109
  model_dict, cfg = load_model(MODEL_PATH, CONFIG_PATH, device)
110
 
111
+ logger.info("✅ 모델 로딩 성공!")
112
+ return "✅ 모델 로딩 성공!"
113
 
114
  except Exception as e:
115
+ logger.error(f"모델 로딩 실패: {str(e)}")
116
+ return f"❌ 모델 로딩 실패: {str(e)}"
117
 
118
  @spaces.GPU(duration=120)
119
  @torch.inference_mode()
 
128
  global model_dict, cfg, device
129
 
130
  if model_dict is None or cfg is None:
131
+ return [], "❌ 먼저 모델을 로드해주세요!"
132
 
133
  if video_file is None:
134
+ return [], "❌ 비디오 파일을 업로드해주세요!"
135
 
136
  # Allow empty text prompt, use empty string if no prompt provided
137
  if text_prompt is None:
 
153
  # Denoising process to generate multiple audio samples
154
  # Note: The model now generates sample_nums audio samples per inference
155
  # The denoise_process function returns audio with shape [batch_size, channels, samples]
156
+ logger.info(f"{sample_nums}개 오디오 샘플 생성 중...")
157
  audio, sample_rate = denoise_process(
158
  visual_feats,
159
  text_feats,
 
180
  merge_audio_video(audio_output, video_file, video_output)
181
  video_outputs.append(video_output)
182
 
183
+ logger.info(f"추론 완료! {sample_nums}개 샘플 생성됨.")
184
+ return video_outputs, f"✅ {sample_nums}개 오디오 샘플이 성공적으로 생성되었습니다!"
185
 
186
  except Exception as e:
187
+ logger.error(f"추론 실패: {str(e)}")
188
+ return [], f"❌ 추론 실패: {str(e)}"
189
 
190
  def update_video_outputs(video_list, status_msg):
191
  """Update video outputs based on the number of generated samples"""
 
538
  with gr.Column(elem_classes=["main-header"]):
539
  gr.HTML("""
540
  <h1>🎵 HunyuanVideo-Foley</h1>
541
+ <p>텍스트-비디오-오디오 합성: 비디오와 텍스트 설명에서 사실적인 오디오 생성</p>
542
  """)
543
 
544
  # Usage Guide
545
  with gr.Column(elem_classes=["status-card"]):
546
  gr.Markdown("""
547
+ ### 📋 빠른 시작 가이드
548
+ **1.** 비디오 파일 업로드\t**2.** 선택적 텍스트 설명 추가\t**3.** 샘플 수 조정 (1-6)\t**4.** 오디오 생성 클릭
549
 
550
+ 💡 빠른 시작을 위해 버튼을 클릭하여 준비된 예제를 로드할 수 있습니다.
551
  """, elem_classes=["usage-guide"])
552
 
553
  # Main inference interface - Input and Results side by side
554
  with gr.Row(elem_classes=["main-interface"]):
555
  # Input section
556
  with gr.Column(scale=1, elem_classes=["input-section"]):
557
+ gr.Markdown("### 📹 비디오 입력")
558
 
559
  video_input = gr.Video(
560
+ label="비디오 업로드",
561
  height=300
562
  )
563
 
564
  text_input = gr.Textbox(
565
+ label="🎯 오디오 설명 (영어)",
566
  placeholder="A person walks on frozen ice",
567
  lines=3,
568
  )
 
573
  maximum=10.0,
574
  value=4.5,
575
  step=0.1,
576
+ label="🎚️ CFG 스케일",
577
  )
578
 
579
  inference_steps = gr.Slider(
 
581
  maximum=100,
582
  value=50,
583
  step=5,
584
+ label="⚡ 단계",
585
  )
586
 
587
  sample_nums = gr.Slider(
 
589
  maximum=6,
590
  value=1,
591
  step=1,
592
+ label="🎲 샘플 수",
593
  )
594
 
595
  generate_btn = gr.Button(
596
+ "🎵 오디오 생성",
597
  variant="primary",
598
  elem_classes=["generate-btn"]
599
  )
600
 
601
  # Results section
602
  with gr.Column(scale=1, elem_classes=["output-section"]):
603
+ gr.Markdown("### 🎥 생성 결과")
604
 
605
  # Multi-video gallery for displaying multiple generated samples
606
  with gr.Column():
607
  # Primary video (Sample 1)
608
  video_output_1 = gr.Video(
609
+ label="샘플 1",
610
  height=250,
611
  visible=True
612
  )
 
615
  with gr.Row(elem_classes=["additional-samples"]):
616
  with gr.Column(scale=1):
617
  video_output_2 = gr.Video(
618
+ label="샘플 2",
619
  height=150,
620
  visible=False
621
  )
622
  video_output_3 = gr.Video(
623
+ label="샘플 3",
624
  height=150,
625
  visible=False
626
  )
627
  with gr.Column(scale=1):
628
  video_output_4 = gr.Video(
629
+ label="샘플 4",
630
  height=150,
631
  visible=False
632
  )
633
  video_output_5 = gr.Video(
634
+ label="샘플 5",
635
  height=150,
636
  visible=False
637
  )
638
 
639
  # Sample 6 - full width
640
  video_output_6 = gr.Video(
641
+ label="샘플 6",
642
  height=150,
643
  visible=False
644
  )
645
 
646
  result_text = gr.Textbox(
647
+ label="상태",
648
  interactive=False,
649
  lines=2
650
  )
651
 
652
  # Examples section at the bottom
653
  with gr.Column(elem_classes=["examples-section"]):
654
+ gr.Markdown("### 🌟 예제")
655
+ gr.Markdown("위 인터페이스로 로드하려면 예제를 클릭하세요")
656
 
657
  # Define your custom examples here - 8 examples total
658
  examples_data = [
 
741
 
742
  # Load button
743
  example_btn = gr.Button(
744
+ f"예제 {idx+1} 로드",
745
  variant="secondary",
746
  size="sm"
747
  )
 
821
  # Footer
822
  gr.HTML("""
823
  <div class="footer-text">
824
+ <p>🚀 HunyuanVideo-Foley로 구동 | 비디오와 텍스트 설명에서 고품질 오디오 생성</p>
825
  </div>
826
  """)
827
 
 
839
  logger.add(lambda msg: print(msg, end=''), level="INFO")
840
 
841
  # Auto-load model
842
+ logger.info("애플리케이션 시작 및 모델 로딩...")
843
  model_load_result = auto_load_models()
844
  logger.info(model_load_result)
845
 
 
848
 
849
  # Log completion status
850
  if "successfully" in model_load_result:
851
+ logger.info("애플리케이션 준비 완료, 모델 로드 완료")
852
 
853
  # Test
854
  app.launch(