Spaces: Running on L40S
Commit · 37a6639
Parent(s): dd2d897
big refactoring
app.py CHANGED
@@ -1,21 +1,53 @@
  1 |   import gradio as gr
  2 | - import subprocess
  3 |   import os
  4 |   import tempfile
  5 |   import shutil
  6 |   from pathlib import Path
  7 |   import torch
  8 |   import logging
  9 |   from huggingface_hub import snapshot_download
 10 |
 11 |   # Configure logging
 12 |   logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 13 |   logger = logging.getLogger(__name__)
 14 |
 15 |   # Constants
 16 |   DEFAULT_CONFIG_PATH = "configs/inference_1.3B.yaml"
 17 | -
 18 | -
 19 |
 20 |   def download_models():
 21 |       """Download required models if they don't exist"""
@@ -61,17 +93,344 @@ def download_models():
 61 |               logger.error(f"Failed to download {model['name']}: {str(e)}")
 62 |               raise gr.Error(f"Failed to download {model['name']}: {str(e)}")
 63 |
 64 | - #
 65 | -
 66 | -
 67 | -
 68 | -
 69 |
 70 |   def generate_avatar_video(
 71 |       reference_image,
 72 |       audio_file,
 73 |       text_prompt,
 74 | -     seed=
 75 |       num_steps=15,
 76 |       guidance_scale=4.5,
 77 |       audio_scale=None,
@@ -81,144 +440,116 @@ def generate_avatar_video(
 81 |       resolution="720p",
 82 |       progress=gr.Progress()
 83 |   ):
 84 | -     """Generate an avatar video using OmniAvatar
 85 | -
 86 | -     Args:
 87 | -         reference_image: Path to reference avatar image
 88 | -         audio_file: Path to audio file for lip sync
 89 | -         text_prompt: Text description of the video to generate
 90 | -         seed: Random seed for generation
 91 | -         num_steps: Number of inference steps
 92 | -         guidance_scale: Classifier-free guidance scale
 93 | -         audio_scale: Audio guidance scale (uses guidance_scale if None)
 94 | -         overlap_frames: Number of overlapping frames between chunks
 95 | -         fps: Frames per second
 96 | -         silence_duration: Duration of silence to add before/after audio
 97 | -         resolution: Output resolution ("480p" or "720p")
 98 | -         progress: Gradio progress callback
 99 | -
100 | -     Returns:
101 | -         str: Path to generated video file
102 | -     """
103 |
104 |       try:
105 | -         progress(0.1, desc="
106 |
107 | -         # Create temporary directory for this generation
108 |           with tempfile.TemporaryDirectory() as temp_dir:
109 |               temp_path = Path(temp_dir)
110 |
111 |               # Copy input files to temp directory
112 |               temp_image = temp_path / "input_image.jpeg"
113 |               temp_audio = temp_path / "input_audio.mp3"
114 |               shutil.copy(reference_image, temp_image)
115 |               shutil.copy(audio_file, temp_audio)
116 |
117 | -             #
118 | -
119 | -
120 | -
121 | -
122 | -
123 | -
124 |
125 | -
126 | -             max_hw = 720 if resolution == "480p" else 1280
127 |
128 | -             #
129 | -
130 | -
131 | -
132 | -
133 | -
134 | -
135 | -
136 | -
137 | -                 f"max_hw={max_hw},use_audio=True,i2v=True"
138 | -             ]
139 |
140 | -
141 | -             if audio_scale is not None:
142 | -                 cmd[-1] += f",audio_scale={audio_scale}"
143 |
144 | -
145 | -
146 | -
147 | -
148 | -
149 | -
150 | -
151 | -             process = subprocess.Popen(
152 | -                 cmd,
153 | -                 stdout=subprocess.PIPE,
154 | -                 stderr=subprocess.PIPE,
155 | -                 text=True,
156 | -                 env=env
157 |               )
158 |
159 | -
160 | -             stdout_lines = []
161 | -             stderr_lines = []
162 |
163 | -
164 | -
165 | -
166 | -                 stdout_lines.append(output.strip())
167 | -                 logger.info(output.strip())
168 | -
169 | -                 # Update progress based on output
170 | -                 if "Starting video generation" in output:
171 | -                     progress(0.5, desc="Generating video frames")
172 | -                 elif "[1/" in output: # First chunk
173 | -                     progress(0.6, desc="Processing video chunks")
174 | -                 elif "Saving video" in output:
175 | -                     progress(0.9, desc="Finalizing video")
176 | -
177 | -                 if process.poll() is not None:
178 | -                     break
179 |
180 | -             #
181 | -
182 | -
183 | -                 stdout_lines.extend(remaining_stdout.strip().split('\n'))
184 | -             if remaining_stderr:
185 | -                 stderr_lines.extend(remaining_stderr.strip().split('\n'))
186 |
187 | -
188 | -
189 | -
190 | -
191 | -
192 |
193 | -             progress(0.
194 |
195 |               # Find the generated video file
196 | -
197 | -
198 | -
199 |               if not generated_videos:
200 |                   raise gr.Error("No video file was generated")
201 |
202 | -             # Get the
203 | -             latest_video =
204 |
205 | -             # Create a temporary file for
206 | -             # This file will persist beyond the context manager since we're using delete=False
207 |               with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_output:
208 |                   output_path = tmp_output.name
209 |
210 | -             # Copy the generated video to the
211 |               shutil.copy(latest_video, output_path)
212 |
213 |               progress(1.0, desc="Generation complete")
214 | -             logger.info(f"Video saved to
215 |
216 | -             return output_path
217 |
218 |       except Exception as e:
219 | -         logger.error(f"Error generating video: {str(e)}")
220 |           raise gr.Error(f"Error generating video: {str(e)}")
221 |
222 |   # Create the Gradio interface
223 |   with gr.Blocks(title="OmniAvatar - Lipsynced Avatar Video Generation") as app:
224 |       gr.Markdown("""
@@ -252,12 +583,17 @@ with gr.Blocks(title="OmniAvatar - Lipsynced Avatar Video Generation") as app:
252 |
253 |           with gr.Accordion("Advanced Settings", open=False):
254 |               with gr.Row():
255 |                   seed = gr.Slider(
256 | -                     label="Seed",
257 | -                     minimum
258 |                       maximum=2147483647,
259 |                       step=1,
260 | -                     value
261 |                   )
262 |
263 |               resolution = gr.Radio(
@@ -322,6 +658,12 @@ with gr.Blocks(title="OmniAvatar - Lipsynced Avatar Video Generation") as app:
322 |                   "🎬 Generate Avatar Video",
323 |                   variant="primary"
324 |               )
325 |
326 |           with gr.Column(scale=1):
327 |               # Output component
@@ -351,6 +693,7 @@ with gr.Blocks(title="OmniAvatar - Lipsynced Avatar Video Generation") as app:
351 |               audio_file,
352 |               text_prompt,
353 |               seed,
354 |               num_steps,
355 |               guidance_scale,
356 |               audio_scale,
@@ -359,7 +702,7 @@ with gr.Blocks(title="OmniAvatar - Lipsynced Avatar Video Generation") as app:
359 |               silence_duration,
360 |               resolution
361 |           ],
362 | -         outputs=output_video
363 |       )
364 |
365 |       gr.Markdown("""
@@ -372,9 +715,4 @@ with gr.Blocks(title="OmniAvatar - Lipsynced Avatar Video Generation") as app:
372 |
373 |   # Launch the app
374 |   if __name__ == "__main__":
375 | -     # Download models on startup
376 | -     logger.info("Checking and downloading required models...")
377 | -     download_models()
378 | -     logger.info("Model download complete, launching app...")
379 | -
380 |       app.launch(share=True)
app.py after this commit (added lines marked with +):

  1 |   import gradio as gr
  2 |   import os
  3 | + import sys
  4 |   import tempfile
  5 |   import shutil
  6 |   from pathlib import Path
  7 |   import torch
  8 |   import logging
  9 |   from huggingface_hub import snapshot_download
 10 | + import math
 11 | + import random
 12 | + import librosa
 13 | + import numpy as np
 14 | + import torch.nn as nn
 15 | + from tqdm import tqdm
 16 | + from functools import partial
 17 | + from datetime import datetime
 18 | + import torchvision.transforms as TT
 19 | + from transformers import Wav2Vec2FeatureExtractor
 20 | + import torchvision.transforms as transforms
 21 | + import torch.nn.functional as F
 22 | + from glob import glob
 23 | +
 24 | + # Add parent directory to path for imports
 25 | + sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
 26 | +
 27 | + from OmniAvatar.utils.args_config import parse_args
 28 | + from OmniAvatar.utils.io_utils import load_state_dict
 29 | + from peft import LoraConfig, inject_adapter_in_model
 30 | + from OmniAvatar.models.model_manager import ModelManager
 31 | + from OmniAvatar.wan_video import WanVideoPipeline
 32 | + from OmniAvatar.utils.io_utils import save_video_as_grid_and_mp4
 33 | + import torch.distributed as dist
 34 | + from OmniAvatar.utils.audio_preprocess import add_silence_to_audio_ffmpeg
 35 | + from OmniAvatar.distributed.fsdp import shard_model
 36 |
 37 |   # Configure logging
 38 |   logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
 39 |   logger = logging.getLogger(__name__)
 40 |
 41 |   # Constants
 42 | + MODELS_DIR = Path(os.environ.get('MODELS_DIR', 'pretrained_models'))
 43 |   DEFAULT_CONFIG_PATH = "configs/inference_1.3B.yaml"
 44 | +
 45 | + def set_seed(seed: int = 42):
 46 | +     random.seed(seed)
 47 | +     np.random.seed(seed)
 48 | +     torch.manual_seed(seed)
 49 | +     torch.cuda.manual_seed(seed)
 50 | +     torch.cuda.manual_seed_all(seed)
 51 |
 52 |   def download_models():
 53 |       """Download required models if they don't exist"""
 93 |               logger.error(f"Failed to download {model['name']}: {str(e)}")
 94 |               raise gr.Error(f"Failed to download {model['name']}: {str(e)}")
 95 |
 96 | + # Utility functions from inference.py
 97 | + def match_size(image_size, h, w):
 98 | +     ratio_ = 9999
 99 | +     size_ = 9999
100 | +     select_size = None
101 | +     for image_s in image_size:
102 | +         ratio_tmp = abs(image_s[0] / image_s[1] - h / w)
103 | +         size_tmp = abs(max(image_s) - max(w, h))
104 | +         if ratio_tmp < ratio_:
105 | +             ratio_ = ratio_tmp
106 | +             size_ = size_tmp
107 | +             select_size = image_s
108 | +         if ratio_ == ratio_tmp:
109 | +             if size_ == size_tmp:
110 | +                 select_size = image_s
111 | +     return select_size
112 | +
113 | + def resize_pad(image, ori_size, tgt_size):
114 | +     h, w = ori_size
115 | +     scale_ratio = max(tgt_size[0] / h, tgt_size[1] / w)
116 | +     scale_h = int(h * scale_ratio)
117 | +     scale_w = int(w * scale_ratio)
118 | +
119 | +     image = transforms.Resize(size=[scale_h, scale_w])(image)
120 | +
121 | +     padding_h = tgt_size[0] - scale_h
122 | +     padding_w = tgt_size[1] - scale_w
123 | +     pad_top = padding_h // 2
124 | +     pad_bottom = padding_h - pad_top
125 | +     pad_left = padding_w // 2
126 | +     pad_right = padding_w - pad_left
127 | +
128 | +     image = F.pad(image, (pad_left, pad_right, pad_top, pad_bottom), mode='constant', value=0)
129 | +     return image
130 | +
131 | + class WanInferencePipeline(nn.Module):
132 | +     def __init__(self, args):
133 | +         super().__init__()
134 | +         self.args = args
135 | +         self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
136 | +         if args.dtype=='bf16':
137 | +             self.dtype = torch.bfloat16
138 | +         elif args.dtype=='fp16':
139 | +             self.dtype = torch.float16
140 | +         else:
141 | +             self.dtype = torch.float32
142 | +         self.pipe = self.load_model()
143 | +         if args.i2v:
144 | +             chained_trainsforms = []
145 | +             chained_trainsforms.append(TT.ToTensor())
146 | +             self.transform = TT.Compose(chained_trainsforms)
147 | +         if args.use_audio:
148 | +             from OmniAvatar.models.wav2vec import Wav2VecModel
149 | +             self.wav_feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
150 | +                 str(MODELS_DIR / "wav2vec2-base-960h")
151 | +             )
152 | +             self.audio_encoder = Wav2VecModel.from_pretrained(str(MODELS_DIR / "wav2vec2-base-960h"), local_files_only=True).to(device=self.device)
153 | +             self.audio_encoder.feature_extractor._freeze_parameters()
154 | +
155 | +     def load_model(self):
156 | +         # Initialize for single GPU
157 | +         os.environ['MASTER_ADDR'] = 'localhost'
158 | +         os.environ['MASTER_PORT'] = '12355'
159 | +         os.environ['RANK'] = '0'
160 | +         os.environ['WORLD_SIZE'] = '1'
161 | +
162 | +         dist.init_process_group(backend="nccl", init_method="env://")
163 | +
164 | +         from xfuser.core.distributed import (initialize_model_parallel,
165 | +                                              init_distributed_environment)
166 | +         init_distributed_environment(rank=0, world_size=1)
167 | +         initialize_model_parallel(
168 | +             sequence_parallel_degree=self.args.sp_size,
169 | +             ring_degree=1,
170 | +             ulysses_degree=self.args.sp_size,
171 | +         )
172 | +         torch.cuda.set_device(0)
173 | +
174 | +         ckpt_path = f'{self.args.exp_path}/pytorch_model.pt'
175 | +         assert os.path.exists(ckpt_path), f"pytorch_model.pt not found in {self.args.exp_path}"
176 | +         if self.args.train_architecture == 'lora':
177 | +             self.args.pretrained_lora_path = pretrained_lora_path = ckpt_path
178 | +         else:
179 | +             resume_path = ckpt_path
180 | +
181 | +         self.step = 0
182 | +
183 | +         # Load models
184 | +         model_manager = ModelManager(device="cpu", infer=True)
185 | +         model_manager.load_models(
186 | +             [
187 | +                 self.args.dit_path.split(","),
188 | +                 self.args.text_encoder_path,
189 | +                 self.args.vae_path
190 | +             ],
191 | +             torch_dtype=self.dtype,
192 | +             device='cpu',
193 | +         )
194 | +
195 | +         pipe = WanVideoPipeline.from_model_manager(model_manager,
196 | +                                                    torch_dtype=self.dtype,
197 | +                                                    device=str(self.device),
198 | +                                                    use_usp=True if self.args.sp_size > 1 else False,
199 | +                                                    infer=True)
200 | +         if self.args.train_architecture == "lora":
201 | +             logger.info(f'Use LoRA: lora rank: {self.args.lora_rank}, lora alpha: {self.args.lora_alpha}')
202 | +             self.add_lora_to_model(
203 | +                 pipe.denoising_model(),
204 | +                 lora_rank=self.args.lora_rank,
205 | +                 lora_alpha=self.args.lora_alpha,
206 | +                 lora_target_modules=self.args.lora_target_modules,
207 | +                 init_lora_weights=self.args.init_lora_weights,
208 | +                 pretrained_lora_path=pretrained_lora_path,
209 | +             )
210 | +         else:
211 | +             missing_keys, unexpected_keys = pipe.denoising_model().load_state_dict(load_state_dict(resume_path), strict=True)
212 | +             logger.info(f"load from {resume_path}, {len(missing_keys)} missing keys, {len(unexpected_keys)} unexpected keys")
213 | +         pipe.requires_grad_(False)
214 | +         pipe.eval()
215 | +         pipe.enable_vram_management(num_persistent_param_in_dit=self.args.num_persistent_param_in_dit)
216 | +         if self.args.use_fsdp:
217 | +             shard_fn = partial(shard_model, device_id=self.device)
218 | +             pipe.dit = shard_fn(pipe.dit)
219 | +         return pipe
220 | +
221 | +     def add_lora_to_model(self, model, lora_rank=4, lora_alpha=4, lora_target_modules="q,k,v,o,ffn.0,ffn.2", init_lora_weights="kaiming", pretrained_lora_path=None, state_dict_converter=None):
222 | +         self.lora_alpha = lora_alpha
223 | +         if init_lora_weights == "kaiming":
224 | +             init_lora_weights = True
225 | +
226 | +         lora_config = LoraConfig(
227 | +             r=lora_rank,
228 | +             lora_alpha=lora_alpha,
229 | +             init_lora_weights=init_lora_weights,
230 | +             target_modules=lora_target_modules.split(","),
231 | +         )
232 | +         model = inject_adapter_in_model(lora_config, model)
233 | +
234 | +         if pretrained_lora_path is not None:
235 | +             state_dict = load_state_dict(pretrained_lora_path)
236 | +             if state_dict_converter is not None:
237 | +                 state_dict = state_dict_converter(state_dict)
238 | +             missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
239 | +             all_keys = [i for i, _ in model.named_parameters()]
240 | +             num_updated_keys = len(all_keys) - len(missing_keys)
241 | +             num_unexpected_keys = len(unexpected_keys)
242 | +             logger.info(f"{num_updated_keys} parameters are loaded from {pretrained_lora_path}. {num_unexpected_keys} parameters are unexpected.")
243 | +
244 | +     def forward(self, prompt,
245 | +                 image_path=None,
246 | +                 audio_path=None,
247 | +                 seq_len=101,
248 | +                 height=720,
249 | +                 width=720,
250 | +                 overlap_frame=None,
251 | +                 num_steps=None,
252 | +                 negative_prompt=None,
253 | +                 guidance_scale=None,
254 | +                 audio_scale=None):
255 | +         overlap_frame = overlap_frame if overlap_frame is not None else self.args.overlap_frame
256 | +         num_steps = num_steps if num_steps is not None else self.args.num_steps
257 | +         negative_prompt = negative_prompt if negative_prompt is not None else self.args.negative_prompt
258 | +         guidance_scale = guidance_scale if guidance_scale is not None else self.args.guidance_scale
259 | +         audio_scale = audio_scale if audio_scale is not None else self.args.audio_scale
260 | +
261 | +         if image_path is not None:
262 | +             from PIL import Image
263 | +             image = Image.open(image_path).convert("RGB")
264 | +             image = self.transform(image).unsqueeze(0).to(self.device)
265 | +             _, _, h, w = image.shape
266 | +             select_size = match_size(getattr(self.args, f'image_sizes_{self.args.max_hw}'), h, w)
267 | +             image = resize_pad(image, (h, w), select_size)
268 | +             image = image * 2.0 - 1.0
269 | +             image = image[:, :, None]
270 | +         else:
271 | +             image = None
272 | +             select_size = [height, width]
273 | +         L = int(self.args.max_tokens * 16 * 16 * 4 / select_size[0] / select_size[1])
274 | +         L = L // 4 * 4 + 1 if L % 4 != 0 else L - 3 # video frames
275 | +         T = (L + 3) // 4 # latent frames
276 | +
277 | +         if self.args.i2v:
278 | +             if self.args.random_prefix_frames:
279 | +                 fixed_frame = overlap_frame
280 | +                 assert fixed_frame % 4 == 1
281 | +             else:
282 | +                 fixed_frame = 1
283 | +             prefix_lat_frame = (3 + fixed_frame) // 4
284 | +             first_fixed_frame = 1
285 | +         else:
286 | +             fixed_frame = 0
287 | +             prefix_lat_frame = 0
288 | +             first_fixed_frame = 0
289 | +
290 | +         if audio_path is not None and self.args.use_audio:
291 | +             audio, sr = librosa.load(audio_path, sr=self.args.sample_rate)
292 | +             input_values = np.squeeze(
293 | +                 self.wav_feature_extractor(audio, sampling_rate=16000).input_values
294 | +             )
295 | +             input_values = torch.from_numpy(input_values).float().to(device=self.device)
296 | +             ori_audio_len = audio_len = math.ceil(len(input_values) / self.args.sample_rate * self.args.fps)
297 | +             input_values = input_values.unsqueeze(0)
298 | +             # padding audio
299 | +             if audio_len < L - first_fixed_frame:
300 | +                 audio_len = audio_len + ((L - first_fixed_frame) - audio_len % (L - first_fixed_frame))
301 | +             elif (audio_len - (L - first_fixed_frame)) % (L - fixed_frame) != 0:
302 | +                 audio_len = audio_len + ((L - fixed_frame) - (audio_len - (L - first_fixed_frame)) % (L - fixed_frame))
303 | +             input_values = F.pad(input_values, (0, audio_len * int(self.args.sample_rate / self.args.fps) - input_values.shape[1]), mode='constant', value=0)
304 | +             with torch.no_grad():
305 | +                 hidden_states = self.audio_encoder(input_values, seq_len=audio_len, output_hidden_states=True)
306 | +                 audio_embeddings = hidden_states.last_hidden_state
307 | +                 for mid_hidden_states in hidden_states.hidden_states:
308 | +                     audio_embeddings = torch.cat((audio_embeddings, mid_hidden_states), -1)
309 | +             seq_len = audio_len
310 | +             audio_embeddings = audio_embeddings.squeeze(0)
311 | +             audio_prefix = torch.zeros_like(audio_embeddings[:first_fixed_frame])
312 | +         else:
313 | +             audio_embeddings = None
314 | +
315 | +         # loop
316 | +         times = (seq_len - L + first_fixed_frame) // (L-fixed_frame) + 1
317 | +         if times * (L-fixed_frame) + fixed_frame < seq_len:
318 | +             times += 1
319 | +         video = []
320 | +         image_emb = {}
321 | +         img_lat = None
322 | +         if self.args.i2v:
323 | +             self.pipe.load_models_to_device(['vae'])
324 | +             img_lat = self.pipe.encode_video(image.to(dtype=self.dtype)).to(self.device)
325 | +
326 | +             msk = torch.zeros_like(img_lat.repeat(1, 1, T, 1, 1)[:,:1])
327 | +             image_cat = img_lat.repeat(1, 1, T, 1, 1)
328 | +             msk[:, :, 1:] = 1
329 | +             image_emb["y"] = torch.cat([image_cat, msk], dim=1)
330 | +         for t in range(times):
331 | +             logger.info(f"[{t+1}/{times}]")
332 | +             audio_emb = {}
333 | +             if t == 0:
334 | +                 overlap = first_fixed_frame
335 | +             else:
336 | +                 overlap = fixed_frame
337 | +                 image_emb["y"][:, -1:, :prefix_lat_frame] = 0
338 | +             prefix_overlap = (3 + overlap) // 4
339 | +             if audio_embeddings is not None:
340 | +                 if t == 0:
341 | +                     audio_tensor = audio_embeddings[
342 | +                         :min(L - overlap, audio_embeddings.shape[0])
343 | +                     ]
344 | +                 else:
345 | +                     audio_start = L - first_fixed_frame + (t - 1) * (L - overlap)
346 | +                     audio_tensor = audio_embeddings[
347 | +                         audio_start: min(audio_start + L - overlap, audio_embeddings.shape[0])
348 | +                     ]
349 | +
350 | +                 audio_tensor = torch.cat([audio_prefix, audio_tensor], dim=0)
351 | +                 audio_prefix = audio_tensor[-fixed_frame:]
352 | +                 audio_tensor = audio_tensor.unsqueeze(0).to(device=self.device, dtype=self.dtype)
353 | +                 audio_emb["audio_emb"] = audio_tensor
354 | +             else:
355 | +                 audio_prefix = None
356 | +             if image is not None and img_lat is None:
357 | +                 self.pipe.load_models_to_device(['vae'])
358 | +                 img_lat = self.pipe.encode_video(image.to(dtype=self.dtype)).to(self.device)
359 | +                 assert img_lat.shape[2] == prefix_overlap
360 | +             img_lat = torch.cat([img_lat, torch.zeros_like(img_lat[:, :, :1].repeat(1, 1, T - prefix_overlap, 1, 1))], dim=2)
361 | +             frames, _, latents = self.pipe.log_video(img_lat, prompt, prefix_overlap, image_emb, audio_emb,
362 | +                                                      negative_prompt, num_inference_steps=num_steps,
363 | +                                                      cfg_scale=guidance_scale, audio_cfg_scale=audio_scale if audio_scale is not None else guidance_scale,
364 | +                                                      return_latent=True,
365 | +                                                      tea_cache_l1_thresh=self.args.tea_cache_l1_thresh,tea_cache_model_id="Wan2.1-T2V-14B")
366 | +             img_lat = None
367 | +             image = (frames[:, -fixed_frame:].clip(0, 1) * 2 - 1).permute(0, 2, 1, 3, 4).contiguous()
368 | +             if t == 0:
369 | +                 video.append(frames)
370 | +             else:
371 | +                 video.append(frames[:, overlap:])
372 | +         video = torch.cat(video, dim=1)
373 | +         video = video[:, :ori_audio_len + 1]
374 | +         return video
375 | +
376 | + # Initialize the pipeline globally
377 | + inference_pipeline = None
378 | + args_global = None
379 | +
380 | + def initialize_inference_pipeline():
381 | +     """Initialize the inference pipeline with arguments"""
382 | +     global inference_pipeline, args_global
383 | +
384 | +     if inference_pipeline is not None:
385 | +         return inference_pipeline
386 | +
387 | +     # Create a minimal args object
388 | +     class Args:
389 | +         def __init__(self):
390 | +             self.rank = 0
391 | +             self.dtype = 'bf16'
392 | +             self.exp_path = str(MODELS_DIR / "OmniAvatar-1.3B")
393 | +             self.dit_path = str(MODELS_DIR / "Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors")
394 | +             self.text_encoder_path = str(MODELS_DIR / "Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth")
395 | +             self.vae_path = str(MODELS_DIR / "Wan2.1-T2V-1.3B/Wan2.1_VAE.pth")
396 | +             self.wav2vec_path = str(MODELS_DIR / "wav2vec2-base-960h")
397 | +             self.train_architecture = 'lora'
398 | +             self.lora_rank = 128
399 | +             self.lora_alpha = 64.0
400 | +             self.lora_target_modules = 'q,k,v,o,ffn.0,ffn.2'
401 | +             self.init_lora_weights = 'kaiming'
402 | +             self.sp_size = 1
403 | +             self.num_persistent_param_in_dit = None
404 | +             self.use_fsdp = False
405 | +             self.i2v = True
406 | +             self.use_audio = True
407 | +             self.random_prefix_frames = True
408 | +             self.overlap_frame = 13
409 | +             self.num_steps = 15
410 | +             self.negative_prompt = 'Vivid color tones, background/camera moving quickly, screen switching, subtitles and special effects, mutation, overexposed, static, blurred details, subtitles, style, work, painting, image, still, overall grayish, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn face, deformed, disfigured, malformed limbs, fingers merging, motionless image, chaotic background, three legs, crowded background with many people, walking backward'
411 | +             self.guidance_scale = 4.5
412 | +             self.audio_scale = 0
413 | +             self.max_tokens = 30000
414 | +             self.sample_rate = 16000
415 | +             self.fps = 25
416 | +             self.max_hw = 720
417 | +             self.tea_cache_l1_thresh = 0
418 | +             self.image_sizes_720 = [[400, 720], [720, 720], [720, 400]]
419 | +             self.image_sizes_1280 = [[720, 720], [528, 960], [960, 528], [720, 1280], [1280, 720]]
420 | +             self.seq_len = 200
421 | +
422 | +     args_global = Args()
423 | +     logger.info("Initializing inference pipeline...")
424 | +     inference_pipeline = WanInferencePipeline(args_global)
425 | +     logger.info("Inference pipeline initialized successfully")
426 | +     return inference_pipeline
427 |
428 |   def generate_avatar_video(
429 |       reference_image,
430 |       audio_file,
431 |       text_prompt,
432 | +     seed=None,
433 | +     use_random_seed=True,
434 |       num_steps=15,
435 |       guidance_scale=4.5,
436 |       audio_scale=None,

440 |       resolution="720p",
441 |       progress=gr.Progress()
442 |   ):
443 | +     """Generate an avatar video using OmniAvatar"""
444 |
445 |       try:
446 | +         progress(0.1, desc="Initializing")
447 | +
448 | +         if use_random_seed or seed is None or seed == -1:
449 | +             seed = random.randint(0, 2147483647)
450 | +
451 | +         set_seed(seed)
452 | +
453 | +         # Initialize pipeline if needed
454 | +         pipeline = initialize_inference_pipeline()
455 |
456 |           with tempfile.TemporaryDirectory() as temp_dir:
457 |               temp_path = Path(temp_dir)
458 |
459 | +             progress(0.2, desc="Preparing inputs")
460 | +
461 |               # Copy input files to temp directory
462 |               temp_image = temp_path / "input_image.jpeg"
463 |               temp_audio = temp_path / "input_audio.mp3"
464 |               shutil.copy(reference_image, temp_image)
465 |               shutil.copy(audio_file, temp_audio)
466 |
467 | +             # Add silence to audio
468 | +             if silence_duration > 0:
469 | +                 audio_with_silence = temp_path / "audio_with_silence.wav"
470 | +                 add_silence_to_audio_ffmpeg(str(temp_audio), str(audio_with_silence), silence_duration)
471 | +                 input_audio_path = str(audio_with_silence)
472 | +             else:
473 | +                 input_audio_path = str(temp_audio)
474 |
475 | +             progress(0.3, desc="Configuring generation parameters")
476 |
477 | +             # Update args for this generation
478 | +             args_global.seed = seed
479 | +             args_global.num_steps = num_steps
480 | +             args_global.guidance_scale = guidance_scale
481 | +             args_global.audio_scale = audio_scale if audio_scale is not None and audio_scale > 0 else 0
482 | +             args_global.overlap_frame = overlap_frames
483 | +             args_global.fps = fps
484 | +             args_global.silence_duration_s = silence_duration
485 | +             args_global.max_hw = 720 if resolution == "480p" else 1280
486 |
487 | +             progress(0.4, desc="Running OmniAvatar generation")
488 |
489 | +             # Generate video
490 | +             video = pipeline(
491 | +                 prompt=text_prompt,
492 | +                 image_path=str(temp_image),
493 | +                 audio_path=input_audio_path,
494 | +                 seq_len=args_global.seq_len
495 |               )
496 |
497 | +             progress(0.8, desc="Saving video")
498 |
499 | +             # Create output directory in temp folder
500 | +             output_dir = temp_path / "output"
501 | +             output_dir.mkdir(exist_ok=True)
502 |
503 | +             # Add audio offset for final output
504 | +             audio_with_offset = temp_path / "audio_with_offset.wav"
505 | +             add_silence_to_audio_ffmpeg(str(temp_audio), str(audio_with_offset), 1.0 / fps + silence_duration)
506 |
507 | +             # Save video
508 | +             save_video_as_grid_and_mp4(
509 | +                 video,
510 | +                 str(output_dir),
511 | +                 fps,
512 | +                 prompt=text_prompt,
513 | +                 audio_path=str(audio_with_offset) if args_global.use_audio else None,
514 | +                 prefix=f'result_000'
515 | +             )
516 |
517 | +             progress(0.9, desc="Finalizing")
518 |
519 |               # Find the generated video file
520 | +             generated_videos = list(output_dir.glob("result_000_*.mp4"))
521 | +             if not generated_videos:
522 | +                 # Also check for result_000.mp4 (without suffix)
523 | +                 generated_videos = list(output_dir.glob("result_000.mp4"))
524 | +
525 |               if not generated_videos:
526 |                   raise gr.Error("No video file was generated")
527 |
528 | +             # Get the first (and should be only) video
529 | +             latest_video = generated_videos[0]
530 |
531 | +             # Create a persistent temporary file for Gradio
532 |               with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_output:
533 |                   output_path = tmp_output.name
534 |
535 | +             # Copy the generated video to the persistent temp file
536 |               shutil.copy(latest_video, output_path)
537 |
538 |               progress(1.0, desc="Generation complete")
539 | +             logger.info(f"Video saved to: {output_path}")
540 |
541 | +             return output_path, seed
542 |
543 |       except Exception as e:
544 | +         logger.error(f"Error generating video: {str(e)}", exc_info=True)
545 |           raise gr.Error(f"Error generating video: {str(e)}")
546 |
547 | + # Initialize models on module import (for Hugging Face Spaces)
548 | + logger.info("Initializing OmniAvatar...")
549 | + logger.info("Checking and downloading required models...")
550 | + download_models()
551 | + logger.info("Model initialization complete")
552 | +
553 |   # Create the Gradio interface
554 |   with gr.Blocks(title="OmniAvatar - Lipsynced Avatar Video Generation") as app:
555 |       gr.Markdown("""
583 |
584 |           with gr.Accordion("Advanced Settings", open=False):
585 |               with gr.Row():
586 | +                 use_random_seed = gr.Checkbox(
587 | +                     label="Use random seed",
588 | +                     value=True
589 | +                 )
590 | +
591 |                   seed = gr.Slider(
592 | +                     label="Seed (ignored if random seed is checked)",
593 | +                     minimum=0,
594 |                       maximum=2147483647,
595 |                       step=1,
596 | +                     value=42
597 |                   )
598 |
599 |               resolution = gr.Radio(

658 |                   "🎬 Generate Avatar Video",
659 |                   variant="primary"
660 |               )
661 | +
662 | +             # Add seed output display
663 | +             seed_output = gr.Number(
664 | +                 label="Seed used",
665 | +                 interactive=False
666 | +             )
667 |
668 |           with gr.Column(scale=1):
669 |               # Output component

693 |               audio_file,
694 |               text_prompt,
695 |               seed,
696 | +             use_random_seed,
697 |               num_steps,
698 |               guidance_scale,
699 |               audio_scale,

702 |               silence_duration,
703 |               resolution
704 |           ],
705 | +         outputs=[output_video, seed_output]
706 |       )
707 |
708 |       gr.Markdown("""

715 |
716 |   # Launch the app
717 |   if __name__ == "__main__":
718 |       app.launch(share=True)
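Note: a minimal, hypothetical smoke test for the in-process pipeline this commit introduces (not part of the commit). It assumes this app.py is importable, that the checkpoints are present or downloadable under MODELS_DIR when the module-level download_models() call runs, and that a CUDA GPU is available, since load_model() initializes an NCCL process group. File paths below are placeholders.

# hypothetical_smoke_test.py -- illustrative only
import app  # importing app triggers download_models() at module import

app.set_seed(42)
pipeline = app.initialize_inference_pipeline()  # builds and caches WanInferencePipeline

video = pipeline(                                # nn.Module call -> WanInferencePipeline.forward
    prompt="A person speaking calmly to the camera",
    image_path="examples/reference.jpeg",        # placeholder reference image
    audio_path="examples/speech.wav",            # placeholder driving audio
    seq_len=200,
)
print(video.shape)  # frame tensor, trimmed to the driving audio length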