Spaces:
				
			
			
	
			
			
		Running
		
			on 
			
			Zero
	
	
	
			
			
	
	
	
	
		
		
		Running
		
			on 
			
			Zero
	update-inference (#4)
Browse files
- Update inference.py (7b9f053b980d37b4af43b808e6cfada81caee869)
- Update app.py (9742f923e6f1247a0c029af3dce3613733f5530c)
- app.py +1 -1
- inference.py +11 -15
    	
        app.py
    CHANGED
    
    | @@ -346,7 +346,7 @@ with gr.Blocks(css=css) as demo: | |
| 346 | 
             
                    with gr.Column():
         | 
| 347 | 
             
                        with gr.Tab("image-to-video") as image_tab:
         | 
| 348 | 
             
                            video_i_hidden = gr.Textbox(label="video_i", visible=False, value=None)
         | 
| 349 | 
            -
                            image_i2v = gr.Image(label="Input Image", type="filepath", sources=["upload", "webcam"])
         | 
| 350 | 
             
                            i2v_prompt = gr.Textbox(label="Prompt", value="The creature from the image starts to move", lines=3)
         | 
| 351 | 
             
                            i2v_button = gr.Button("Generate Image-to-Video", variant="primary")
         | 
| 352 | 
             
                        with gr.Tab("text-to-video") as text_tab:
         | 
|  | |
| 346 | 
             
                    with gr.Column():
         | 
| 347 | 
             
                        with gr.Tab("image-to-video") as image_tab:
         | 
| 348 | 
             
                            video_i_hidden = gr.Textbox(label="video_i", visible=False, value=None)
         | 
| 349 | 
            +
                            image_i2v = gr.Image(label="Input Image", type="filepath", sources=["upload", "webcam", "clipboard"])
         | 
| 350 | 
             
                            i2v_prompt = gr.Textbox(label="Prompt", value="The creature from the image starts to move", lines=3)
         | 
| 351 | 
             
                            i2v_button = gr.Button("Generate Image-to-Video", variant="primary")
         | 
| 352 | 
             
                        with gr.Tab("text-to-video") as text_tab:
         | 
    	
        inference.py
    CHANGED
    
    | @@ -11,6 +11,7 @@ import imageio | |
| 11 | 
             
            import json
         | 
| 12 | 
             
            import numpy as np
         | 
| 13 | 
             
            import torch
         | 
|  | |
| 14 | 
             
            from safetensors import safe_open
         | 
| 15 | 
             
            from PIL import Image
         | 
| 16 | 
             
            from transformers import (
         | 
| @@ -35,6 +36,7 @@ from ltx_video.pipelines.pipeline_ltx_video import ( | |
| 35 | 
             
            from ltx_video.schedulers.rf import RectifiedFlowScheduler
         | 
| 36 | 
             
            from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
         | 
| 37 | 
             
            from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
         | 
|  | |
| 38 |  | 
| 39 | 
             
            MAX_HEIGHT = 720
         | 
| 40 | 
             
            MAX_WIDTH = 1280
         | 
| @@ -96,7 +98,12 @@ def load_image_to_tensor_with_resize_and_crop( | |
| 96 | 
             
                image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
         | 
| 97 | 
             
                if not just_crop:
         | 
| 98 | 
             
                    image = image.resize((target_width, target_height))
         | 
| 99 | 
            -
             | 
|  | |
|  | |
|  | |
|  | |
|  | |
| 100 | 
             
                frame_tensor = (frame_tensor / 127.5) - 1.0
         | 
| 101 | 
             
                # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
         | 
| 102 | 
             
                return frame_tensor.unsqueeze(0).unsqueeze(2)
         | 
| @@ -266,13 +273,6 @@ def main(): | |
| 266 | 
             
                    help="Path to the input video (or imaage) to be modified using the video-to-video pipeline",
         | 
| 267 | 
             
                )
         | 
| 268 |  | 
| 269 | 
            -
                parser.add_argument(
         | 
| 270 | 
            -
                    "--strength",
         | 
| 271 | 
            -
                    type=float,
         | 
| 272 | 
            -
                    default=1.0,
         | 
| 273 | 
            -
                    help="Editing strength (noising level) for video-to-video pipeline.",
         | 
| 274 | 
            -
                )
         | 
| 275 | 
            -
             | 
| 276 | 
             
                # Conditioning arguments
         | 
| 277 | 
             
                parser.add_argument(
         | 
| 278 | 
             
                    "--conditioning_media_paths",
         | 
| @@ -407,7 +407,6 @@ def infer( | |
| 407 | 
             
                negative_prompt: str,
         | 
| 408 | 
             
                offload_to_cpu: bool,
         | 
| 409 | 
             
                input_media_path: Optional[str] = None,
         | 
| 410 | 
            -
                strength: Optional[float] = 1.0,
         | 
| 411 | 
             
                conditioning_media_paths: Optional[List[str]] = None,
         | 
| 412 | 
             
                conditioning_strengths: Optional[List[float]] = None,
         | 
| 413 | 
             
                conditioning_start_frames: Optional[List[int]] = None,
         | 
| @@ -422,12 +421,10 @@ def infer( | |
| 422 |  | 
| 423 | 
             
                models_dir = "MODEL_DIR"
         | 
| 424 |  | 
| 425 | 
            -
                 | 
| 426 | 
            -
                ltxv_model_name_or_path = "ltxv-13b-0.9.7-distilled-rc3.safetensors"
         | 
| 427 | 
             
                if not os.path.isfile(ltxv_model_name_or_path):
         | 
| 428 | 
             
                    ltxv_model_path = hf_hub_download(
         | 
| 429 | 
            -
                        repo_id=" | 
| 430 | 
            -
                        #repo_id="Lightricks/LTX-Video",
         | 
| 431 | 
             
                        filename=ltxv_model_name_or_path,
         | 
| 432 | 
             
                        local_dir=models_dir,
         | 
| 433 | 
             
                        repo_type="model",
         | 
| @@ -616,7 +613,6 @@ def infer( | |
| 616 | 
             
                    frame_rate=frame_rate,
         | 
| 617 | 
             
                    **sample,
         | 
| 618 | 
             
                    media_items=media_item,
         | 
| 619 | 
            -
                    strength=strength,
         | 
| 620 | 
             
                    conditioning_items=conditioning_items,
         | 
| 621 | 
             
                    is_video=True,
         | 
| 622 | 
             
                    vae_per_channel_normalize=True,
         | 
| @@ -775,4 +771,4 @@ def load_media_file( | |
| 775 |  | 
| 776 |  | 
| 777 | 
             
            if __name__ == "__main__":
         | 
| 778 | 
            -
                main()
         | 
|  | |
| 11 | 
             
            import json
         | 
| 12 | 
             
            import numpy as np
         | 
| 13 | 
             
            import torch
         | 
| 14 | 
            +
            import cv2
         | 
| 15 | 
             
            from safetensors import safe_open
         | 
| 16 | 
             
            from PIL import Image
         | 
| 17 | 
             
            from transformers import (
         | 
|  | |
| 36 | 
             
            from ltx_video.schedulers.rf import RectifiedFlowScheduler
         | 
| 37 | 
             
            from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
         | 
| 38 | 
             
            from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
         | 
| 39 | 
            +
            import ltx_video.pipelines.crf_compressor as crf_compressor
         | 
| 40 |  | 
| 41 | 
             
            MAX_HEIGHT = 720
         | 
| 42 | 
             
            MAX_WIDTH = 1280
         | 
|  | |
| 98 | 
             
                image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
         | 
| 99 | 
             
                if not just_crop:
         | 
| 100 | 
             
                    image = image.resize((target_width, target_height))
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                image = np.array(image)
         | 
| 103 | 
            +
                image = cv2.GaussianBlur(image, (3, 3), 0)
         | 
| 104 | 
            +
                frame_tensor = torch.from_numpy(image).float()
         | 
| 105 | 
            +
                frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
         | 
| 106 | 
            +
                frame_tensor = frame_tensor.permute(2, 0, 1)
         | 
| 107 | 
             
                frame_tensor = (frame_tensor / 127.5) - 1.0
         | 
| 108 | 
             
                # Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
         | 
| 109 | 
             
                return frame_tensor.unsqueeze(0).unsqueeze(2)
         | 
|  | |
| 273 | 
             
                    help="Path to the input video (or imaage) to be modified using the video-to-video pipeline",
         | 
| 274 | 
             
                )
         | 
| 275 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 276 | 
             
                # Conditioning arguments
         | 
| 277 | 
             
                parser.add_argument(
         | 
| 278 | 
             
                    "--conditioning_media_paths",
         | 
|  | |
| 407 | 
             
                negative_prompt: str,
         | 
| 408 | 
             
                offload_to_cpu: bool,
         | 
| 409 | 
             
                input_media_path: Optional[str] = None,
         | 
|  | |
| 410 | 
             
                conditioning_media_paths: Optional[List[str]] = None,
         | 
| 411 | 
             
                conditioning_strengths: Optional[List[float]] = None,
         | 
| 412 | 
             
                conditioning_start_frames: Optional[List[int]] = None,
         | 
|  | |
| 421 |  | 
| 422 | 
             
                models_dir = "MODEL_DIR"
         | 
| 423 |  | 
| 424 | 
            +
                ltxv_model_name_or_path = pipeline_config["checkpoint_path"]
         | 
|  | |
| 425 | 
             
                if not os.path.isfile(ltxv_model_name_or_path):
         | 
| 426 | 
             
                    ltxv_model_path = hf_hub_download(
         | 
| 427 | 
            +
                        repo_id="Lightricks/LTX-Video",
         | 
|  | |
| 428 | 
             
                        filename=ltxv_model_name_or_path,
         | 
| 429 | 
             
                        local_dir=models_dir,
         | 
| 430 | 
             
                        repo_type="model",
         | 
|  | |
| 613 | 
             
                    frame_rate=frame_rate,
         | 
| 614 | 
             
                    **sample,
         | 
| 615 | 
             
                    media_items=media_item,
         | 
|  | |
| 616 | 
             
                    conditioning_items=conditioning_items,
         | 
| 617 | 
             
                    is_video=True,
         | 
| 618 | 
             
                    vae_per_channel_normalize=True,
         | 
|  | |
| 771 |  | 
| 772 |  | 
| 773 | 
             
            if __name__ == "__main__":
         | 
| 774 | 
            +
                main()
         | 
 
			

