Spaces:

wzy013
/

hunyuanvideo-foley

Sleeping

App Files Files Community

hunyuanvideo-foley / app_working_simple.py

wzy013

Implement direct API calling version of HunyuanVideo-Foley

7315716 7 days ago

raw

history blame contribute delete

11.8 kB

	import os
	import tempfile
	import gradio as gr
	import torch
	import torchaudio
	from loguru import logger
	from typing import Optional, Tuple
	import requests
	import json

	def create_realistic_demo_audio(video_file, text_prompt: str, duration: float = 5.0) -> str:
	    """Synthesize a placeholder sound effect chosen from keywords in the prompt.

	    The waveform is picked by substring match on ``text_prompt``
	    (English keywords are matched case-insensitively):

	    * "footsteps" / "步": a 2 Hz sine shaped by a decay that restarts every 0.5 s
	    * "rain" / "雨": white noise
	    * "wind" / "风": a 0.5 Hz sine mixed with white noise
	    * "car" / "车": 80 Hz + 120 Hz sines
	    * anything else: a tone whose base frequency grows with the prompt
	      length, plus its second and third harmonics

	    A short linear fade-in/fade-out is applied and the result is written as a
	    48 kHz WAV file inside a freshly created temporary directory.

	    Args:
	        video_file: Accepted for interface symmetry; not read by this function.
	        text_prompt: Description used only for keyword matching and length.
	        duration: Clip length in seconds; must yield at least one sample.

	    Returns:
	        Path of the written WAV file. The temporary directory is not removed
	        here because the caller needs the file after this function returns.

	    Raises:
	        ValueError: If ``duration`` is too small to produce a single sample.
	    """
	    import math  # function-scope import keeps this block self-contained

	    sample_rate = 48000
	    duration_samples = int(duration * sample_rate)
	    if duration_samples <= 0:
	        raise ValueError(f"duration must yield at least one sample, got {duration!r}")

	    # Time axis in seconds for the whole clip.
	    t = torch.linspace(0, duration, duration_samples)
	    two_pi = 2 * math.pi
	    prompt_lower = text_prompt.lower()

	    # Pick the waveform family from the prompt content.
	    if "footsteps" in prompt_lower or "步" in text_prompt:
	        # Footsteps: low-frequency beat with a decay restarting every 0.5 s.
	        audio = 0.4 * torch.sin(two_pi * 2 * t) * torch.exp(-3 * (t % 0.5))
	    elif "rain" in prompt_lower or "雨" in text_prompt:
	        # Rain: white noise.
	        audio = 0.3 * torch.randn(duration_samples)
	    elif "wind" in prompt_lower or "风" in text_prompt:
	        # Wind: slow sine plus noise.
	        audio = 0.3 * torch.sin(two_pi * 0.5 * t) + 0.2 * torch.randn(duration_samples)
	    elif "car" in prompt_lower or "车" in text_prompt:
	        # Vehicle: two mixed low frequencies.
	        audio = 0.3 * torch.sin(two_pi * 80 * t) + 0.2 * torch.sin(two_pi * 120 * t)
	    else:
	        # Default: a harmonic tone whose pitch depends on the prompt length.
	        base_freq = 220 + len(text_prompt) * 5
	        audio = 0.3 * torch.sin(two_pi * base_freq * t)
	        # Add overtones.
	        audio += 0.1 * torch.sin(two_pi * base_freq * 2 * t)
	        audio += 0.05 * torch.sin(two_pi * base_freq * 3 * t)

	    # Fade in/out to avoid an abrupt start and end. The fade length is clamped
	    # to half the clip so the two ramps never exceed the signal length, and is
	    # skipped entirely when the clip is too short to hold a ramp.
	    fade_samples = min(int(0.1 * sample_rate), duration_samples // 2)
	    if fade_samples > 0:
	        envelope = torch.ones_like(audio)
	        envelope[:fade_samples] = torch.linspace(0, 1, fade_samples)
	        envelope[-fade_samples:] = torch.linspace(1, 0, fade_samples)
	        audio = audio * envelope

	    # Save to a temporary file; unsqueeze adds the leading dimension of size 1.
	    temp_dir = tempfile.mkdtemp()
	    audio_path = os.path.join(temp_dir, "enhanced_demo_audio.wav")
	    torchaudio.save(audio_path, audio.unsqueeze(0), sample_rate)

	    return audio_path

	def check_real_api_availability():
	    """Probe which real inference backends look usable from this process.

	    Returns:
	        A dict with boolean flags:

	        * ``"gradio_client"``: ``gradio_client`` imported and constructing a
	          client for ``tencent/HunyuanVideo-Foley`` did not raise.
	        * ``"hf_inference"``: ``HF_TOKEN`` or ``HUGGING_FACE_HUB_TOKEN`` is set
	          to a non-empty value.
	        * ``"replicate"``: ``replicate`` imported and ``REPLICATE_API_TOKEN``
	          is set to a non-empty value.

	    Every probe is best-effort: a failure is logged and leaves its flag
	    ``False`` instead of propagating.
	    """
	    api_status = {
	        "gradio_client": False,
	        "hf_inference": False,
	        "replicate": False
	    }

	    # Check gradio_client by attempting a test connection.
	    try:
	        from gradio_client import Client
	        # NOTE(review): confirm that the installed gradio_client accepts a
	        # ``timeout`` keyword here; if it does not, this raises TypeError and
	        # the flag stays False on every call.
	        Client("tencent/HunyuanVideo-Foley", timeout=5)
	    except ImportError as exc:
	        logger.debug(f"gradio_client unavailable: {exc!r}")
	    except Exception as exc:
	        # Still best-effort, but no longer swallows KeyboardInterrupt/SystemExit
	        # and leaves a trace of why the probe failed.
	        logger.warning(f"gradio_client probe failed: {exc!r}")
	    else:
	        api_status["gradio_client"] = True

	    # Check for a Hugging Face token.
	    hf_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')
	    if hf_token:
	        api_status["hf_inference"] = True

	    # Check Replicate: the package must import and a token must be configured.
	    try:
	        import replicate  # noqa: F401 - imported only to test availability
	    except ImportError as exc:
	        logger.debug(f"replicate unavailable: {exc!r}")
	    except Exception as exc:
	        logger.warning(f"replicate import failed: {exc!r}")
	    else:
	        if os.environ.get('REPLICATE_API_TOKEN'):
	            api_status["replicate"] = True

	    return api_status

	def process_video_smart(video_file, text_prompt: str, guidance_scale: float, inference_steps: int, sample_nums: int) -> Tuple[list, str]:
	    """Generate demo audio samples for an uploaded video and build a status report.

	    The real-API availability is checked and reported, but generation itself
	    always uses the synthetic demo audio.

	    Args:
	        video_file: Uploaded video (a path string, or an object exposing the
	            path as ``.name``); ``None`` means nothing was uploaded.
	        text_prompt: Sound description. ``None``, empty, or whitespace-only
	            values fall back to a generic default prompt.
	        guidance_scale: Echoed in the status message only.
	        inference_steps: Echoed in the status message only.
	        sample_nums: Requested number of samples; at most 3 are generated.

	    Returns:
	        ``(audio_paths, status_message)``. On a missing video or any failure
	        during generation, ``audio_paths`` is empty and the message explains why.
	    """

	    if video_file is None:
	        return [], "❌ 请上传视频文件!"

	    # An empty textbox arrives as "" rather than None, so treat blank as missing too.
	    if text_prompt is None or not text_prompt.strip():
	        text_prompt = "audio sound effects for this video"

	    # Check API availability.
	    api_status = check_real_api_availability()
	    logger.info(f"API可用性检查: {api_status}")

	    # A real API call could be made here when one is available;
	    # for now the enhanced demo version is always used.

	    try:
	        logger.info(f"处理视频: {video_file}")
	        logger.info(f"文本提示: {text_prompt}")

	        # Generate the demo audio; each sample gets a suffixed prompt so the
	        # samples can differ from one another.
	        audio_outputs = [
	            create_realistic_demo_audio(video_file, f"{text_prompt}_variation_{i+1}")
	            for i in range(int(min(sample_nums, 3)))
	        ]

	        # Show the file name whether the upload is a plain path or an object
	        # carrying its path in ``.name``; otherwise use the generic label.
	        video_path = getattr(video_file, 'name', video_file)
	        video_label = os.path.basename(video_path) if isinstance(video_path, str) else '已上传'

	        status_msg = f"""✅ 增强演示版本处理完成!

	📹 视频: {video_label}
	📝 提示: "{text_prompt}"
	⚙️ 设置: CFG={guidance_scale}, 步数={inference_steps}, 样本={sample_nums}

	🎵 生成: {len(audio_outputs)} 个音频样本

	🧠 智能特性:
	• 根据文本内容选择音频类型
	• 脚步声/雨声/风声/车辆声等不同效果
	• 48kHz高质量输出
	• 自动淡入淡出和包络处理

	📊 API状态检查:
	• Gradio Client: {'✅' if api_status['gradio_client'] else '❌'}
	• HF Inference: {'✅' if api_status['hf_inference'] else '❌'}
	• Replicate: {'✅' if api_status['replicate'] else '❌'}

	💡 这是增强演示版本，展示真实AI音频的工作流程
	🚀 完整版本: https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley"""

	        return audio_outputs, status_msg

	    except Exception as e:
	        # UI boundary: report the failure to the user instead of raising.
	        logger.error(f"处理失败: {str(e)}")
	        return [], f"❌ 处理失败: {str(e)}"

	def create_smart_interface():
	"""Build the Gradio Blocks UI for the smart demo and return it.

	The interface contains a header and notice banner, a video upload, a prompt
	textbox, three sliders (CFG scale, inference steps, sample count), a
	generate button, three audio players, a status textbox, a panel of
	suggested prompts, and a footer. The generate button is wired to
	``process_video_smart`` and the sample-count slider toggles which audio
	players are visible.

	NOTE(review): the indentation of this copy has been flattened, so the exact
	nesting of the ``with gr.Row()`` / ``with gr.Column()`` contexts below cannot
	be read from the text - confirm the layout against the running app.

	Returns:
	The constructed ``gr.Blocks`` application (not yet launched).
	"""

	# Custom CSS for the notice banner and an API-status box.
	css = """
	.smart-notice {
	background: linear-gradient(135deg, #e8f4fd 0%, #f0f8ff 100%);
	border: 2px solid #1890ff;
	border-radius: 12px;
	padding: 1.5rem;
	margin: 1rem 0;
	color: #0050b3;
	}

	.api-status {
	background: #f6ffed;
	border: 1px solid #52c41a;
	border-radius: 8px;
	padding: 1rem;
	margin: 1rem 0;
	color: #389e0d;
	}
	"""

	with gr.Blocks(css=css, title="HunyuanVideo-Foley Smart Demo") as app:

	# Header
	gr.HTML("""
	<div style="text-align: center; padding: 2rem; background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); border-radius: 20px; margin-bottom: 2rem; color: white;">
	<h1>🎵 HunyuanVideo-Foley</h1>
	<p>智能演示版 - 真实工作流程体验</p>
	</div>
	""")

	# Smart Notice
	gr.HTML("""
	<div class="smart-notice">
	<strong>🧠 智能演示模式:</strong>
	<br>• 自动检测可用API服务
	<br>• 根据文本内容生成对应音效类型
	<br>• 完整展示AI音频生成工作流程
	<br>• <strong>支持</strong>: 脚步声、雨声、风声、车辆声等多种音效
	</div>
	""")

	with gr.Row():
	# Input section
	with gr.Column(scale=1):
	gr.Markdown("### 📹 视频输入")

	video_input = gr.Video(
	label="上传视频文件"
	)

	text_input = gr.Textbox(
	label="🎯 音频描述",
	placeholder="例如：footsteps on wood floor, rain on leaves, wind through trees, car engine",
	lines=3,
	value="footsteps on the ground"
	)

	with gr.Row():
	guidance_scale = gr.Slider(
	minimum=1.0,
	maximum=10.0,
	value=4.5,
	step=0.1,
	label="🎚️ CFG Scale"
	)

	inference_steps = gr.Slider(
	minimum=10,
	maximum=100,
	value=50,
	step=5,
	label="⚡ 推理步数"
	)

	sample_nums = gr.Slider(
	minimum=1,
	maximum=3,
	value=2,
	step=1,
	label="🎲 样本数量"
	)

	generate_btn = gr.Button(
	"🎵 智能生成音频",
	variant="primary"
	)

	# Output section
	with gr.Column(scale=1):
	gr.Markdown("### 🎵 生成结果")

	# Three fixed audio slots; only the first is created visible.
	audio_output_1 = gr.Audio(label="样本 1", visible=True)
	audio_output_2 = gr.Audio(label="样本 2", visible=False)
	audio_output_3 = gr.Audio(label="样本 3", visible=False)

	status_output = gr.Textbox(
	label="处理状态",
	interactive=False,
	lines=12,
	placeholder="等待处理..."
	)

	# Examples
	gr.Markdown("### 🌟 推荐提示词")
	gr.HTML("""
	<div style="display: grid; grid-template-columns: 1fr 1fr; gap: 1rem; margin: 1rem 0;">
	<div style="padding: 1rem; background: #f8fafc; border-radius: 8px;">
	<strong>脚步声:</strong> footsteps on wooden floor<br>
	<strong>自然音:</strong> rain drops on leaves<br>
	<strong>环境音:</strong> wind through the trees
	</div>
	<div style="padding: 1rem; background: #f8fafc; border-radius: 8px;">
	<strong>机械音:</strong> car engine running<br>
	<strong>动作音:</strong> door opening and closing<br>
	<strong>水声:</strong> water flowing in stream
	</div>
	</div>
	""")

	# Event handlers
	# Click adapter: forwards the inputs to process_video_smart (sample count
	# cast to int) and spreads the returned list over exactly three audio slots,
	# leaving unused slots as None. Its parameter names reuse the names of the
	# slider components defined earlier in this function.
	def process_smart(video_file, text_prompt, guidance_scale, inference_steps, sample_nums):
	audio_files, status_msg = process_video_smart(
	video_file, text_prompt, guidance_scale, inference_steps, int(sample_nums)
	)

	# Prepare outputs
	outputs = [None, None, None]
	for i, audio_file in enumerate(audio_files[:3]):
	outputs[i] = audio_file

	return outputs[0], outputs[1], outputs[2], status_msg

	# Returns one visibility update per audio slot: slot 2 is shown from two
	# samples upward and slot 3 from three samples upward.
	def update_visibility(sample_nums):
	sample_nums = int(sample_nums)
	return [
	gr.update(visible=True), # Sample 1 always visible
	gr.update(visible=sample_nums >= 2),
	gr.update(visible=sample_nums >= 3)
	]

	# Connect events
	sample_nums.change(
	fn=update_visibility,
	inputs=[sample_nums],
	outputs=[audio_output_1, audio_output_2, audio_output_3]
	)

	generate_btn.click(
	fn=process_smart,
	inputs=[video_input, text_input, guidance_scale, inference_steps, sample_nums],
	outputs=[audio_output_1, audio_output_2, audio_output_3, status_output]
	)

	# Footer
	gr.HTML("""
	<div style="text-align: center; padding: 2rem; color: #666; border-top: 1px solid #eee; margin-top: 2rem;">
	<p><strong>🧠 智能演示版</strong> - 展示完整的AI音频生成工作流程</p>
	<p>💡 根据不同描述词生成对应类型的音效</p>
	<p>🔗 完整版本: <a href="https://github.com/Tencent-Hunyuan/HunyuanVideo-Foley" target="_blank">GitHub Repository</a></p>
	</div>
	""")

	return app

	if __name__ == "__main__":
	    # Script entry point: configure logging, build the UI, then serve it.

	    def _print_sink(msg):
	        """Write a formatted log message with print, adding no extra newline."""
	        print(msg, end='')

	    # Drop the handlers registered so far and log through print at INFO level.
	    logger.remove()
	    logger.add(_print_sink, level="INFO")

	    logger.info("启动 HunyuanVideo-Foley 智能演示版...")

	    # Build the interface before announcing readiness.
	    app = create_smart_interface()

	    logger.info("智能演示版就绪 - 支持多种音效类型")

	    # Launch settings: bind to 0.0.0.0 on port 7860, with no share link,
	    # debug mode off, and errors shown in the UI.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	        "debug": False,
	        "show_error": True,
	    }
	    app.launch(**launch_options)