jmanhype committed
Commit 8dc54dd · 1 Parent(s): 99e8bea

Update Dockerfile and app for Hugging Face Space deployment

scripts/gradio/Dockerfile CHANGED
@@ -1,17 +1,50 @@
- FROM ubuntu:22.04
-
- # This is a test Dockerfile to see if it's actually being used
- RUN echo "TESTING IF THIS DOCKERFILE IS ACTUALLY USED" > /test_file.txt && \
-     echo "If you see this message, the correct Dockerfile is being used!" && \
-     cat /test_file.txt
-
- # Create a very obvious test directory
- RUN mkdir -p /THIS_IS_A_TEST_DIRECTORY && \
-     echo "TEST CONTENT" > /THIS_IS_A_TEST_DIRECTORY/test.txt
-
- # Print test message during build
- RUN echo "=================================================" && \
-     echo "THIS IS A TEST BUILD - IF YOU SEE THIS, THE DOCKERFILE IS BEING USED" && \
-     echo "================================================="
-
- CMD ["cat", "/test_file.txt"]
+ FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+ # Install basic dependencies
+ RUN apt-get update && apt-get install -y \
+     wget \
+     git \
+     python3 \
+     python3-pip \
+     ffmpeg \
+     libsm6 \
+     libxext6 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create non-root user
+ RUN useradd -m -s /bin/bash huggingface
+ WORKDIR /home/huggingface
+
+ # Set up git config
+ RUN git config --global user.email "[email protected]" && \
+     git config --global user.name "jmanhype"
+
+ # Copy application code
+ COPY . /home/huggingface/app/
+ WORKDIR /home/huggingface/app
+
+ # Install Python dependencies
+ RUN pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+ RUN pip3 install --no-cache-dir -r requirements.txt
+ RUN pip3 install --no-cache-dir gradio spaces
+
+ # Install additional dependencies for controlnet_aux
+ RUN pip3 install --no-cache-dir openmim && \
+     mim install "mmcv>=2.0.1" && \
+     mim install "mmdet>=3.1.0" && \
+     mim install "mmpose>=1.1.0"
+
+ # Set Python path
+ ENV PYTHONPATH=/home/huggingface/app:/home/huggingface/app/MMCM:/home/huggingface/app/diffusers/src:/home/huggingface/app/controlnet_aux/src
+
+ # Set ownership
+ RUN chown -R huggingface:huggingface /home/huggingface
+
+ # Switch to non-root user
+ USER huggingface
+
+ # Expose port
+ EXPOSE 7860
+
+ # Run the app
+ CMD ["python3", "scripts/gradio/app_gradio_space.py"]
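
The new image layers cu118 PyTorch wheels and the OpenMMLab stack (installed via mim) on top of the CUDA 11.8 runtime base, so the usual failure modes after a rebuild are a CPU-only torch install or a missing mm* package. The snippet below is a minimal smoke-test sketch that could be run inside a container built from this Dockerfile; the filename and the exact list of checks are illustrative, not part of this commit.

# smoke_test_env.py - illustrative check, not part of this commit.
# Confirms that the packages the Dockerfile installs are importable and CUDA is visible.
import importlib
import sys

import torch

print("Python:", sys.version.split()[0])
print("CUDA available:", torch.cuda.is_available())  # False if the container has no GPU access

# Packages installed by the pip3/mim RUN steps above.
for name in ("gradio", "spaces", "mmcv", "mmdet", "mmpose"):
    try:
        module = importlib.import_module(name)
        print(f"{name}: {getattr(module, '__version__', 'imported')}")
    except ImportError as err:
        print(f"{name}: MISSING ({err})")

# Directories listed in the ENV PYTHONPATH line should show up on sys.path.
for path in (
    "/home/huggingface/app",
    "/home/huggingface/app/MMCM",
    "/home/huggingface/app/diffusers/src",
    "/home/huggingface/app/controlnet_aux/src",
):
    print(f"{path} on sys.path: {path in sys.path}")

Run it with GPU access enabled (for example, docker run --rm --gpus all <image> python3 smoke_test_env.py); torch.cuda.is_available() reports False when the container is started without a GPU, even though the cu118 wheels are present.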
scripts/gradio/app_gradio_space.py CHANGED
@@ -1,52 +1,25 @@
  import os
  import time
- import pdb

  import cuid
  import gradio as gr
  import spaces
  import numpy as np
- import sys

  from huggingface_hub import snapshot_download
- import subprocess
-

  ProjectDir = os.path.abspath(os.path.dirname(__file__))
- CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
-
  sys.path.insert(0, ProjectDir)
- sys.path.insert(0, f"{ProjectDir}/MMCM")
- sys.path.insert(0, f"{ProjectDir}/diffusers/src")
- sys.path.insert(0, f"{ProjectDir}/controlnet_aux/src")
- sys.path.insert(0, f"{ProjectDir}/scripts/gradio")
-
- # Install dependencies first
- def install_dependencies():
-     dependencies = [
-         "openmim",
-         "mmengine",
-         "mmcv>=2.0.1",
-         "mmdet>=3.1.0",
-         "mmpose>=1.1.0"
-     ]
-     for dep in dependencies:
-         try:
-             subprocess.run(
-                 ["pip", "install", "--no-cache-dir", "-U", dep],
-                 check=True,
-                 capture_output=True,
-                 text=True
-             )
-         except subprocess.CalledProcessError as e:
-             print(f"Warning: Failed to install {dep}: {e.stderr}")
-
- install_dependencies()

  ignore_video2video = True
  max_image_edge = 960

-
  def download_model():
      if not os.path.exists(CheckpointsDir):
          print("Checkpoint Not Downloaded, start downloading...")
@@ -62,13 +35,13 @@ def download_model():
      else:
          print("Already download the model.")


- download_model() # for huggingface deployment.
- if not ignore_video2video:
-     from gradio_video2video import online_v2v_inference
  from gradio_text2video import online_t2v_inference

-
  @spaces.GPU(duration=180)
  def hf_online_t2v_inference(
      prompt,
@@ -80,164 +53,16 @@ def hf_online_t2v_inference(
      video_len,
      img_edge_ratio,
  ):
-     img_edge_ratio, _, _ = limit_shape(
-         image_np, w, h, img_edge_ratio, max_image_edge=max_image_edge
-     )
      if not isinstance(image_np, np.ndarray): # None
          raise gr.Error("Need input reference image")
      return online_t2v_inference(
          prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
      )

-
- @spaces.GPU(duration=180)
- def hg_online_v2v_inference(
-     prompt,
-     image_np,
-     video,
-     processor,
-     seed,
-     fps,
-     w,
-     h,
-     video_length,
-     img_edge_ratio,
- ):
-     img_edge_ratio, _, _ = limit_shape(
-         image_np, w, h, img_edge_ratio, max_image_edge=max_image_edge
-     )
-     if not isinstance(image_np, np.ndarray): # None
-         raise gr.Error("Need input reference image")
-     return online_v2v_inference(
-         prompt,
-         image_np,
-         video,
-         processor,
-         seed,
-         fps,
-         w,
-         h,
-         video_length,
-         img_edge_ratio,
-     )
-
-
- def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=max_image_edge):
-     """limite generation video shape to avoid gpu memory overflow"""
-     if input_h == -1 and input_w == -1:
-         if isinstance(image, np.ndarray):
-             input_h, input_w, _ = image.shape
-         elif isinstance(image, PIL.Image.Image):
-             input_w, input_h = image.size
-         else:
-             raise ValueError(
-                 f"image should be in [image, ndarray], but given {type(image)}"
-             )
-     if img_edge_ratio == 0:
-         img_edge_ratio = 1
-     img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
-     # print(
-     #     image.shape,
-     #     input_w,
-     #     input_h,
-     #     img_edge_ratio,
-     #     max_image_edge,
-     #     img_edge_ratio_infact,
-     # )
-     if img_edge_ratio != 1:
-         return (
-             img_edge_ratio_infact,
-             input_w * img_edge_ratio_infact,
-             input_h * img_edge_ratio_infact,
-         )
-     else:
-         return img_edge_ratio_infact, -1, -1
-
-
- def limit_length(length):
-     """limite generation video frames numer to avoid gpu memory overflow"""
-
-     if length > 24 * 6:
-         gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
-         length = 24 * 6
-     return length
-
-
- class ConcatenateBlock(gr.blocks.Block):
-     def __init__(self, options):
-         self.options = options
-         self.current_string = ""
-
-     def update_string(self, new_choice):
-         if new_choice and new_choice not in self.current_string.split(", "):
-             if self.current_string == "":
-                 self.current_string = new_choice
-             else:
-                 self.current_string += ", " + new_choice
-         return self.current_string
-
-
- def process_input(new_choice):
-     return concatenate_block.update_string(new_choice), ""
-
-
- control_options = [
-     "pose",
-     "pose_body",
-     "pose_hand",
-     "pose_face",
-     "pose_hand_body",
-     "pose_hand_face",
-     "dwpose",
-     "dwpose_face",
-     "dwpose_hand",
-     "dwpose_body",
-     "dwpose_body_hand",
-     "canny",
-     "tile",
-     "hed",
-     "hed_scribble",
-     "depth",
-     "pidi",
-     "normal_bae",
-     "lineart",
-     "lineart_anime",
-     "zoe",
-     "sam",
-     "mobile_sam",
-     "leres",
-     "content",
-     "face_detector",
- ]
- concatenate_block = ConcatenateBlock(control_options)
-
-
- css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
-
-
- with gr.Blocks(css=css) as demo:
-     gr.Markdown(
-         "<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
- <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
- </br>\
- Zhiqiang Xia <sup>*</sup>,\
- Zhaokang Chen<sup>*</sup>,\
- Bin Wu<sup>†</sup>,\
- Chao Li,\
- Kwok-Wai Hung,\
- Chao Zhan,\
- Yingjie He,\
- Wenjiang Zhou\
- (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, [email protected])\
- </br>\
- Lyra Lab, Tencent Music Entertainment\
- </h2> \
- <a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
- <a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
- <a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
- <a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
- <a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
-     )
      with gr.Tab("Text to Video"):
          with gr.Row():
              with gr.Column():
@@ -248,62 +73,18 @@ with gr.Blocks(css=css) as demo:
                      value=-1,
                  )
                  video_length = gr.Number(
-                     label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
                      value=12,
                  )
                  fps = gr.Number(label="Generate Video FPS", value=6)
-                 gr.Markdown(
-                     (
-                         "If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
-                         "The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
-                         "The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
-                         "Due to the GPU VRAM limits, the W&H need smaller than 960px"
-                     )
-                 )
                  with gr.Row():
                      w = gr.Number(label="Width", value=-1)
                      h = gr.Number(label="Height", value=-1)
                      img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
-                 with gr.Row():
-                     out_w = gr.Number(label="Output Width", value=0, interactive=False)
-                     out_h = gr.Number(label="Output Height", value=0, interactive=False)
-                     img_edge_ratio_infact = gr.Number(
-                         label="img_edge_ratio in fact",
-                         value=1.0,
-                         interactive=False,
-                     )
-             btn1 = gr.Button("Generate")
-             out = gr.Video()
-         # pdb.set_trace()
-         i2v_examples_256 = [
-             [
-                 "(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
-                 "../../data/images/yongen.jpeg",
-             ],
-             [
-                 "(masterpiece, best quality, highres:1), peaceful beautiful sea scene",
-                 "../../data/images/seaside4.jpeg",
-             ],
-         ]
-         with gr.Row():
-             gr.Examples(
-                 examples=i2v_examples_256,
-                 inputs=[prompt, image],
-                 outputs=[out],
-                 fn=hf_online_t2v_inference,
-                 cache_examples=False,
-             )
-         img_edge_ratio.change(
-             fn=limit_shape,
-             inputs=[image, w, h, img_edge_ratio],
-             outputs=[img_edge_ratio_infact, out_w, out_h],
-         )
-
-         video_length.change(
-             fn=limit_length, inputs=[video_length], outputs=[video_length]
-         )
-
-         btn1.click(
              fn=hf_online_t2v_inference,
              inputs=[
                  prompt,
@@ -313,116 +94,15 @@ with gr.Blocks(css=css) as demo:
                  w,
                  h,
                  video_length,
-                 img_edge_ratio_infact,
              ],
-             outputs=out,
          )

      with gr.Tab("Video to Video"):
-         if ignore_video2video:
-             gr.Markdown(
-                 (
-                     "Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
-                     "We are trying to support video2video in the future. Thanks for your understanding."
-                 )
-             )
-         else:
-             with gr.Row():
-                 with gr.Column():
-                     prompt = gr.Textbox(label="Prompt")
-                     gr.Markdown(
-                         (
-                             "pose of VisionCondImage should be same as of the first frame of the video. "
-                             "its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
-                         )
-                     )
-                     image = gr.Image(label="VisionCondImage")
-                     video = gr.Video(label="ReferVideo")
-                     # radio = gr.inputs.Radio(, label="Select an option")
-                     # ctr_button = gr.inputs.Button(label="Add ControlNet List")
-                     # output_text = gr.outputs.Textbox()
-                     processor = gr.Textbox(
-                         label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
-                         value="dwpose_body_hand",
-                     )
-                     gr.Markdown("seed=-1 means that seeds are different in every run")
-                     seed = gr.Number(
-                         label="Seed (seed=-1 means that the seeds run each time are different)",
-                         value=-1,
-                     )
-                     video_length = gr.Number(label="Video Length", value=12)
-                     fps = gr.Number(label="Generate Video FPS", value=6)
-                     gr.Markdown(
-                         (
-                             "If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
-                             "The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
-                             "The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
-                             "Due to the GPU VRAM limits, the W&H need smaller than 2000px"
-                         )
-                     )
-                     with gr.Row():
-                         w = gr.Number(label="Width", value=-1)
-                         h = gr.Number(label="Height", value=-1)
-                         img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
-
-                     with gr.Row():
-                         out_w = gr.Number(label="Width", value=0, interactive=False)
-                         out_h = gr.Number(label="Height", value=0, interactive=False)
-                         img_edge_ratio_infact = gr.Number(
-                             label="img_edge_ratio in fact",
-                             value=1.0,
-                             interactive=False,
-                         )
-                 btn2 = gr.Button("Generate")
-                 out1 = gr.Video()
-
-             v2v_examples_256 = [
-                 [
-                     "(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
-                     "../../data/demo/cyber_girl.png",
-                     "../../data/demo/video1.mp4",
-                 ],
-             ]
-             with gr.Row():
-                 gr.Examples(
-                     examples=v2v_examples_256,
-                     inputs=[prompt, image, video],
-                     outputs=[out],
-                     fn=hg_online_v2v_inference,
-                     cache_examples=False,
-                 )
-
-             img_edge_ratio.change(
-                 fn=limit_shape,
-                 inputs=[image, w, h, img_edge_ratio],
-                 outputs=[img_edge_ratio_infact, out_w, out_h],
-             )
-             video_length.change(
-                 fn=limit_length, inputs=[video_length], outputs=[video_length]
-             )
-             btn2.click(
-                 fn=hg_online_v2v_inference,
-                 inputs=[
-                     prompt,
-                     image,
-                     video,
-                     processor,
-                     seed,
-                     fps,
-                     w,
-                     h,
-                     video_length,
-                     img_edge_ratio_infact,
-                 ],
-                 outputs=out1,
-             )
-
-
- # Set the IP and port
- ip_address = "0.0.0.0" # Replace with your desired IP address
- port_number = 7860 # Replace with your desired port number
-

- demo.queue().launch(
-     share=True, debug=True, server_name=ip_address, server_port=port_number
- )
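For reference, the removed limit_shape helper above clamps the longer edge of the generated video to max_image_edge (960 px here) by shrinking the effective scale factor. A quick check of that arithmetic, using a hypothetical 1280x720 reference image and a requested 2x upscale:

# Hypothetical numbers reproducing the core arithmetic of the removed limit_shape().
max_image_edge = 960
input_w, input_h = 1280, 720   # assumed reference image size
img_edge_ratio = 2.0           # requested scale factor

img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
print(img_edge_ratio_infact)                                             # 0.75
print(input_w * img_edge_ratio_infact, input_h * img_edge_ratio_infact)  # 960.0 540.0

The rewritten file below drops this clamp (and the companion limit_length cap of 144 frames) and passes the user-supplied img_edge_ratio straight through to online_t2v_inference.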
  import os
  import time
+ import sys

  import cuid
  import gradio as gr
  import spaces
  import numpy as np

  from huggingface_hub import snapshot_download

+ # Add necessary paths
  ProjectDir = os.path.abspath(os.path.dirname(__file__))
  sys.path.insert(0, ProjectDir)
+ sys.path.insert(0, os.path.join(ProjectDir, "MMCM"))
+ sys.path.insert(0, os.path.join(ProjectDir, "diffusers/src"))
+ sys.path.insert(0, os.path.join(ProjectDir, "controlnet_aux/src"))

+ CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
  ignore_video2video = True
  max_image_edge = 960

  def download_model():
      if not os.path.exists(CheckpointsDir):
          print("Checkpoint Not Downloaded, start downloading...")

      else:
          print("Already download the model.")

+ # Download model first
+ print("Starting model download...")
+ download_model()

+ # Import after model download to ensure all dependencies are ready
  from gradio_text2video import online_t2v_inference

  @spaces.GPU(duration=180)
  def hf_online_t2v_inference(
      prompt,

      video_len,
      img_edge_ratio,
  ):
      if not isinstance(image_np, np.ndarray): # None
          raise gr.Error("Need input reference image")
      return online_t2v_inference(
          prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
      )

+ # Create Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# MuseV Demo")
+
      with gr.Tab("Text to Video"):
          with gr.Row():
              with gr.Column():

                      value=-1,
                  )
                  video_length = gr.Number(
+                     label="Video Length(need smaller than 144)",
                      value=12,
                  )
                  fps = gr.Number(label="Generate Video FPS", value=6)
                  with gr.Row():
                      w = gr.Number(label="Width", value=-1)
                      h = gr.Number(label="Height", value=-1)
                      img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
+             btn = gr.Button("Generate")
+             video_output = gr.Video()
+
+         btn.click(
              fn=hf_online_t2v_inference,
              inputs=[
                  prompt,

                  w,
                  h,
                  video_length,
+                 img_edge_ratio,
              ],
+             outputs=video_output,
          )

      with gr.Tab("Video to Video"):
+         gr.Markdown(
+             "Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally."
+         )

+ # Launch the app
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860)
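
The body of download_model() is collapsed in this diff; only its first and last lines appear in the hunks above. For orientation, a hedged sketch of the usual snapshot_download-based pattern follows; the repo_id and max_workers values are assumptions, not taken from this commit.

# Sketch only - the real download_model() body is not shown in this diff.
import os

from huggingface_hub import snapshot_download

CheckpointsDir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "checkpoints")

def download_model():
    if not os.path.exists(CheckpointsDir):
        print("Checkpoint Not Downloaded, start downloading...")
        snapshot_download(
            repo_id="TMElyralab/MuseV",  # assumed checkpoint repository
            local_dir=CheckpointsDir,    # matches the CheckpointsDir defined in the app
            max_workers=8,               # assumed download parallelism
        )
    else:
        print("Already download the model.")

Calling download_model() before importing gradio_text2video, as the updated file does, ensures the checkpoint files are already on disk when that module is imported, which is what the "# Import after model download to ensure all dependencies are ready" comment in the new code points at.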