Spaces:
Runtime error
Runtime error
| import spaces | |
| import gradio as gr | |
| import os | |
| import numpy as np | |
| from pydub import AudioSegment | |
| import hashlib | |
| from sonic import Sonic | |
| from PIL import Image | |
| import torch | |
# Model initialization.
# Install the Hugging Face CLI and download the checkpoints into ./checkpoints
# before the pipeline object is built (presumably Sonic() loads them from
# there -- verify against the sonic module).
# Each command is run on its own so a failure can be attributed to one step.
# A non-zero exit status is reported but not fatal, because the checkpoints
# may already be present from an earlier run.
setup_commands = (
    'python3 -m pip install "huggingface_hub[cli]"',
    'huggingface-cli download LeonJoe13/Sonic --local-dir checkpoints',
    'huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt --local-dir checkpoints/stable-video-diffusion-img2vid-xt',
    'huggingface-cli download openai/whisper-tiny --local-dir checkpoints/whisper-tiny',
)
# Kept for backward compatibility: the combined shell string, same value as before.
cmd = '; '.join(setup_commands) + ';'
for setup_command in setup_commands:
    exit_status = os.system(setup_command)
    if exit_status != 0:
        print(f"Warning: setup command exited with status {exit_status}: {setup_command}")

pipe = Sonic()
def get_md5(content):
    """Return the hexadecimal MD5 digest of a bytes-like object.

    Args:
        content: Any object accepted by ``hashlib.md5`` (bytes, bytearray,
            or another object exposing a contiguous buffer).

    Returns:
        The 32-character lowercase hex digest.
    """
    return hashlib.md5(content).hexdigest()
# Duration set to 300 seconds so that long videos can be processed.
# NOTE(review): the original comment announced this duration but no decorator
# was present and `spaces` was otherwise unused; restored here as
# spaces.GPU(duration=300) -- confirm this matches the intended setup.
@spaces.GPU(duration=300)
def get_video_res(img_path, audio_path, res_video_path, dynamic_scale=1.0):
    """Generate an animated video from a portrait image and an audio file.

    The image is run through ``pipe.preprocess``; when at least one face is
    reported, the image is cropped to the returned bounding box and handed,
    together with the audio, to ``pipe.process``, which writes the video.

    Args:
        img_path: Path to the input image.
        audio_path: Path to the input audio file.
        res_video_path: Path the resulting video is written to.
        dynamic_scale: Forwarded unchanged to ``pipe.process``.

    Returns:
        ``res_video_path`` on success, or ``-1`` when no face was detected.
    """
    expand_ratio = 0.5
    min_resolution = 512
    # Original note: fixed to a 2-second clip (25 frames).
    # NOTE(review): the value is passed as `inference_steps`; confirm that it
    # really controls clip length rather than the number of sampling steps.
    inference_steps = 25

    # The audio duration is printed for reference only; it does not influence
    # any parameter below.
    audio = AudioSegment.from_file(audio_path)
    duration = len(audio) / 1000.0  # in seconds
    print(f"Audio duration: {duration} seconds, using inference_steps: {inference_steps}")

    face_info = pipe.preprocess(img_path, expand_ratio=expand_ratio)
    print(f"Face detection info: {face_info}")

    # Guard clause: without a detected face there is nothing to animate.
    if face_info['face_num'] <= 0:
        return -1

    # Work on a cropped copy stored next to the original image.
    crop_image_path = img_path + '.crop.png'
    pipe.crop_image(img_path, crop_image_path, face_info['crop_bbox'])
    img_path = crop_image_path

    # os.makedirs('') raises, so only create the directory when the output
    # path actually has a directory component.
    output_dir = os.path.dirname(res_video_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Generate the video with the fixed inference_steps (25).
    pipe.process(
        img_path,
        audio_path,
        res_video_path,
        min_resolution=min_resolution,
        inference_steps=inference_steps,
        dynamic_scale=dynamic_scale,
    )
    return res_video_path
# Working directories: cached inputs go to tmp_path, rendered videos to res_path.
tmp_path = './tmp_path/'
res_path = './res_path/'
for _workdir in (tmp_path, res_path):
    os.makedirs(_workdir, exist_ok=True)
def process_sonic(image, audio, dynamic_scale):
    """Gradio handler: animate a portrait with the given audio.

    Inputs are content-hashed so that the saved image, the saved audio and
    the rendered video can be reused across identical requests.

    Args:
        image: PIL image from the image input, or ``None`` if nothing was
            uploaded.
        audio: ``(sampling_rate, samples)`` tuple from the audio input, where
            ``samples`` is a NumPy array, or ``None`` if nothing was uploaded.
        dynamic_scale: Animation intensity; forwarded to ``get_video_res``
            and included in the cached video's file name.

    Returns:
        Absolute path of the generated (or cached) video file.

    Raises:
        gr.Error: If the image or audio is missing, or if no face is
            detected in the image.
    """
    # Input validation
    if image is None:
        raise gr.Error("Please upload an image")
    if audio is None:
        raise gr.Error("Please upload an audio file")

    sampling_rate, arr = audio[:2]

    img_md5 = get_md5(np.array(image))
    audio_md5 = get_md5(arr)
    print(f"Processing with image hash: {img_md5}, audio hash: {audio_md5}")

    # Mono audio arrives as a 1-D array; give it an explicit channel axis so
    # the channel count can be read from shape[1].
    if len(arr.shape) == 1:
        arr = arr[:, None]

    # Build an AudioSegment straight from the raw sample bytes.
    # sample_width comes from the array's dtype; assumes integer PCM samples
    # -- TODO confirm behaviour for float input.
    audio_segment = AudioSegment(
        arr.tobytes(),
        frame_rate=sampling_rate,
        sample_width=arr.dtype.itemsize,
        channels=arr.shape[1],
    )

    # Build the file paths
    image_path = os.path.abspath(os.path.join(tmp_path, f'{img_md5}.png'))
    audio_path = os.path.abspath(os.path.join(tmp_path, f'{audio_md5}.wav'))
    res_video_path = os.path.abspath(
        os.path.join(res_path, f'{img_md5}_{audio_md5}_{dynamic_scale}.mp4')
    )

    # Save the inputs only if they are not already on disk
    if not os.path.exists(image_path):
        image.save(image_path)
    if not os.path.exists(audio_path):
        audio_segment.export(audio_path, format="wav")

    # Return the cached result when there is one, otherwise generate it
    if os.path.exists(res_video_path):
        print(f"Using cached result: {res_video_path}")
        return res_video_path

    print(f"Generating new video with dynamic scale: {dynamic_scale}")
    result = get_video_res(image_path, audio_path, res_video_path, dynamic_scale)

    # get_video_res signals "no face detected" with -1; surface that to the
    # user instead of passing the sentinel on as if it were a video path.
    if result == -1:
        raise gr.Error("No face detected in the image. Please upload a clear portrait photo.")
    return result
# Placeholder provider for example data (add real example rows here if needed).
def get_example():
    """Return the example rows for the examples widget; currently none."""
    return list()
# Custom stylesheet passed to gr.Blocks via its css argument.
css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.main-header {
    text-align: center;
    color: #2a2a2a;
    margin-bottom: 2em;
}
.parameter-section {
    background-color: #f5f5f5;
    padding: 1em;
    border-radius: 8px;
    margin: 1em 0;
}
.example-section {
    margin-top: 2em;
}
"""
# Build the Gradio interface.
# NOTE(review): indentation was lost in the source this was recovered from;
# the nesting of the Row/Column containers below is reconstructed -- confirm
# the intended layout.
with gr.Blocks(css=css, theme="apriel") as demo:
    header_markup = """
    <div class="main-header">
        <h1>🎭 Sonic: Advanced Portrait Animation</h1>
        <p>Transform still images into dynamic videos synchronized with audio</p>
    </div>
    """
    gr.HTML(header_markup)

    with gr.Row():
        # Left side: inputs, intensity control and the trigger button.
        with gr.Column():
            image_input = gr.Image(
                type='pil',
                label="Portrait Image",
                elem_id="image_input",
            )
            audio_input = gr.Audio(
                label="Voice/Audio Input",
                elem_id="audio_input",
                type="numpy",
            )
            with gr.Column():
                dynamic_scale = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Animation Intensity",
                    info="Adjust to control movement intensity (0.5: subtle, 2.0: dramatic)",
                )
            process_btn = gr.Button(
                "Generate Animation",
                variant="primary",
                elem_id="process_btn",
            )
        # Right side: the rendered result.
        with gr.Column():
            video_output = gr.Video(
                label="Generated Animation",
                elem_id="video_output",
            )

    # The button and the examples widget feed the same components to the
    # same handler.
    sonic_inputs = [image_input, audio_input, dynamic_scale]

    process_btn.click(
        fn=process_sonic,
        inputs=sonic_inputs,
        outputs=video_output,
        api_name="animate",
    )

    gr.Examples(
        examples=get_example(),
        fn=process_sonic,
        inputs=sonic_inputs,
        outputs=video_output,
        cache_examples=False,
    )

# Create a public link: share=True
demo.launch(share=True)