jmanhype committed
Commit 8dc54dd · 1 Parent(s): 99e8bea

Update Dockerfile and app for Hugging Face Space deployment

scripts/gradio/Dockerfile CHANGED
@@ -1,17 +1,50 @@
- FROM ubuntu:22.04
-
- # This is a test Dockerfile to see if it's actually being used
- RUN echo "TESTING IF THIS DOCKERFILE IS ACTUALLY USED" > /test_file.txt && \
-     echo "If you see this message, the correct Dockerfile is being used!" && \
-     cat /test_file.txt
-
- # Create a very obvious test directory
- RUN mkdir -p /THIS_IS_A_TEST_DIRECTORY && \
-     echo "TEST CONTENT" > /THIS_IS_A_TEST_DIRECTORY/test.txt
-
- # Print test message during build
- RUN echo "=================================================" && \
-     echo "THIS IS A TEST BUILD - IF YOU SEE THIS, THE DOCKERFILE IS BEING USED" && \
-     echo "================================================="
-
- CMD ["cat", "/test_file.txt"]
+ FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu22.04
+
+ # Install basic dependencies
+ RUN apt-get update && apt-get install -y \
+     wget \
+     git \
+     python3 \
+     python3-pip \
+     ffmpeg \
+     libsm6 \
+     libxext6 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create non-root user
+ RUN useradd -m -s /bin/bash huggingface
+ WORKDIR /home/huggingface
+
+ # Set up git config
+ RUN git config --global user.email "[email protected]" && \
+     git config --global user.name "jmanhype"
+
+ # Copy application code
+ COPY . /home/huggingface/app/
+ WORKDIR /home/huggingface/app
+
+ # Install Python dependencies
+ RUN pip3 install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+ RUN pip3 install --no-cache-dir -r requirements.txt
+ RUN pip3 install --no-cache-dir gradio spaces
+
+ # Install additional dependencies for controlnet_aux
+ RUN pip3 install --no-cache-dir openmim && \
+     mim install "mmcv>=2.0.1" && \
+     mim install "mmdet>=3.1.0" && \
+     mim install "mmpose>=1.1.0"
+
+ # Set Python path
+ ENV PYTHONPATH=/home/huggingface/app:/home/huggingface/app/MMCM:/home/huggingface/app/diffusers/src:/home/huggingface/app/controlnet_aux/src
+
+ # Set ownership
+ RUN chown -R huggingface:huggingface /home/huggingface
+
+ # Switch to non-root user
+ USER huggingface
+
+ # Expose port
+ EXPOSE 7860
+
+ # Run the app
+ CMD ["python3", "scripts/gradio/app_gradio_space.py"]
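
The new image layers cu118 PyTorch wheels and the OpenMMLab stack (installed via mim) on top of the CUDA 11.8 runtime base, so the usual failure modes after a rebuild are a CPU-only torch install or a missing mm* package. The snippet below is a minimal smoke-test sketch that could be run inside a container built from this Dockerfile; the filename and the exact list of checks are illustrative, not part of this commit.

# smoke_test_env.py - illustrative check, not part of this commit.
# Confirms that the packages the Dockerfile installs are importable and CUDA is visible.
import importlib
import sys

import torch

print("Python:", sys.version.split()[0])
print("CUDA available:", torch.cuda.is_available())  # False if the container has no GPU access

# Packages installed by the pip3/mim RUN steps above.
for name in ("gradio", "spaces", "mmcv", "mmdet", "mmpose"):
    try:
        module = importlib.import_module(name)
        print(f"{name}: {getattr(module, '__version__', 'imported')}")
    except ImportError as err:
        print(f"{name}: MISSING ({err})")

# Directories listed in the ENV PYTHONPATH line should show up on sys.path.
for path in (
    "/home/huggingface/app",
    "/home/huggingface/app/MMCM",
    "/home/huggingface/app/diffusers/src",
    "/home/huggingface/app/controlnet_aux/src",
):
    print(f"{path} on sys.path: {path in sys.path}")

Run it with GPU access enabled (for example, docker run --rm --gpus all <image> python3 smoke_test_env.py); torch.cuda.is_available() reports False when the container is started without a GPU, even though the cu118 wheels are present.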
scripts/gradio/app_gradio_space.py CHANGED
@@ -1,52 +1,25 @@
  import os
  import time
- import pdb

  import cuid
  import gradio as gr
  import spaces
  import numpy as np
- import sys

  from huggingface_hub import snapshot_download
- import subprocess
-

  ProjectDir = os.path.abspath(os.path.dirname(__file__))
- CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
-
  sys.path.insert(0, ProjectDir)
- sys.path.insert(0, f"{ProjectDir}/MMCM")
- sys.path.insert(0, f"{ProjectDir}/diffusers/src")
- sys.path.insert(0, f"{ProjectDir}/controlnet_aux/src")
- sys.path.insert(0, f"{ProjectDir}/scripts/gradio")
-
- # Install dependencies first
- def install_dependencies():
-     dependencies = [
-         "openmim",
-         "mmengine",
-         "mmcv>=2.0.1",
-         "mmdet>=3.1.0",
-         "mmpose>=1.1.0"
-     ]
-     for dep in dependencies:
-         try:
-             subprocess.run(
-                 ["pip", "install", "--no-cache-dir", "-U", dep],
-                 check=True,
-                 capture_output=True,
-                 text=True
-             )
-         except subprocess.CalledProcessError as e:
-             print(f"Warning: Failed to install {dep}: {e.stderr}")
-
- install_dependencies()

  ignore_video2video = True
  max_image_edge = 960

-
  def download_model():
      if not os.path.exists(CheckpointsDir):
          print("Checkpoint Not Downloaded, start downloading...")
@@ -62,13 +35,13 @@ def download_model():
      else:
          print("Already download the model.")


- download_model() # for huggingface deployment.
- if not ignore_video2video:
-     from gradio_video2video import online_v2v_inference
  from gradio_text2video import online_t2v_inference

-
  @spaces.GPU(duration=180)
  def hf_online_t2v_inference(
      prompt,
@@ -80,164 +53,16 @@ def hf_online_t2v_inference(
      video_len,
      img_edge_ratio,
  ):
-     img_edge_ratio, _, _ = limit_shape(
-         image_np, w, h, img_edge_ratio, max_image_edge=max_image_edge
-     )
      if not isinstance(image_np, np.ndarray): # None
          raise gr.Error("Need input reference image")
      return online_t2v_inference(
          prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
      )

-
- @spaces.GPU(duration=180)
- def hg_online_v2v_inference(
-     prompt,
-     image_np,
-     video,
-     processor,
-     seed,
-     fps,
-     w,
-     h,
-     video_length,
-     img_edge_ratio,
- ):
-     img_edge_ratio, _, _ = limit_shape(
-         image_np, w, h, img_edge_ratio, max_image_edge=max_image_edge
-     )
-     if not isinstance(image_np, np.ndarray): # None
-         raise gr.Error("Need input reference image")
-     return online_v2v_inference(
-         prompt,
-         image_np,
-         video,
-         processor,
-         seed,
-         fps,
-         w,
-         h,
-         video_length,
-         img_edge_ratio,
-     )
-
-
- def limit_shape(image, input_w, input_h, img_edge_ratio, max_image_edge=max_image_edge):
-     """limite generation video shape to avoid gpu memory overflow"""
-     if input_h == -1 and input_w == -1:
-         if isinstance(image, np.ndarray):
-             input_h, input_w, _ = image.shape
-         elif isinstance(image, PIL.Image.Image):
-             input_w, input_h = image.size
-         else:
-             raise ValueError(
-                 f"image should be in [image, ndarray], but given {type(image)}"
-             )
-     if img_edge_ratio == 0:
-         img_edge_ratio = 1
-     img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
-     # print(
-     #     image.shape,
-     #     input_w,
-     #     input_h,
-     #     img_edge_ratio,
-     #     max_image_edge,
-     #     img_edge_ratio_infact,
-     # )
-     if img_edge_ratio != 1:
-         return (
-             img_edge_ratio_infact,
-             input_w * img_edge_ratio_infact,
-             input_h * img_edge_ratio_infact,
-         )
-     else:
-         return img_edge_ratio_infact, -1, -1
-
-
- def limit_length(length):
-     """limite generation video frames numer to avoid gpu memory overflow"""
-
-     if length > 24 * 6:
-         gr.Warning("Length need to smaller than 144, dute to gpu memory limit")
-         length = 24 * 6
-     return length
-
-
- class ConcatenateBlock(gr.blocks.Block):
-     def __init__(self, options):
-         self.options = options
-         self.current_string = ""
-
-     def update_string(self, new_choice):
-         if new_choice and new_choice not in self.current_string.split(", "):
-             if self.current_string == "":
-                 self.current_string = new_choice
-             else:
-                 self.current_string += ", " + new_choice
-         return self.current_string
-
-
- def process_input(new_choice):
-     return concatenate_block.update_string(new_choice), ""
-
-
- control_options = [
-     "pose",
-     "pose_body",
-     "pose_hand",
-     "pose_face",
-     "pose_hand_body",
-     "pose_hand_face",
-     "dwpose",
-     "dwpose_face",
-     "dwpose_hand",
-     "dwpose_body",
-     "dwpose_body_hand",
-     "canny",
-     "tile",
-     "hed",
-     "hed_scribble",
-     "depth",
-     "pidi",
-     "normal_bae",
-     "lineart",
-     "lineart_anime",
-     "zoe",
-     "sam",
-     "mobile_sam",
-     "leres",
-     "content",
-     "face_detector",
- ]
- concatenate_block = ConcatenateBlock(control_options)
-
-
- css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""
-
-
- with gr.Blocks(css=css) as demo:
-     gr.Markdown(
-         "<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</span> </h1> \
- <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
- </br>\
- Zhiqiang Xia <sup>*</sup>,\
- Zhaokang Chen<sup>*</sup>,\
- Bin Wu<sup>†</sup>,\
- Chao Li,\
- Kwok-Wai Hung,\
- Chao Zhan,\
- Yingjie He,\
- Wenjiang Zhou\
- (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, [email protected])\
- </br>\
- Lyra Lab, Tencent Music Entertainment\
- </h2> \
- <a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[Github Repo]</a>\
- <a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
- <a style='font-size:18px;color: #000000' href=''> [ArXiv(Coming Soon)] </a>\
- <a style='font-size:18px;color: #000000' href=''> [Project Page(Coming Soon)] </a> \
- <a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo~ </a> </div>"
-     )
      with gr.Tab("Text to Video"):
          with gr.Row():
              with gr.Column():
@@ -248,62 +73,18 @@ with gr.Blocks(css=css) as demo:
                      value=-1,
                  )
                  video_length = gr.Number(
-                     label="Video Length(need smaller than 144,If you want to be able to generate longer videos, run it locally )",
                      value=12,
                  )
                  fps = gr.Number(label="Generate Video FPS", value=6)
-                 gr.Markdown(
-                     (
-                         "If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
-                         "The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
-                         "The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
-                         "Due to the GPU VRAM limits, the W&H need smaller than 960px"
-                     )
-                 )
                  with gr.Row():
                      w = gr.Number(label="Width", value=-1)
                      h = gr.Number(label="Height", value=-1)
                      img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
-                 with gr.Row():
-                     out_w = gr.Number(label="Output Width", value=0, interactive=False)
-                     out_h = gr.Number(label="Output Height", value=0, interactive=False)
-                     img_edge_ratio_infact = gr.Number(
-                         label="img_edge_ratio in fact",
-                         value=1.0,
-                         interactive=False,
-                     )
-             btn1 = gr.Button("Generate")
-             out = gr.Video()
-         # pdb.set_trace()
-         i2v_examples_256 = [
-             [
-                 "(masterpiece, best quality, highres:1),(1boy, solo:1),(eye blinks:1.8),(head wave:1.3)",
-                 "../../data/images/yongen.jpeg",
-             ],
-             [
-                 "(masterpiece, best quality, highres:1), peaceful beautiful sea scene",
-                 "../../data/images/seaside4.jpeg",
-             ],
-         ]
-         with gr.Row():
-             gr.Examples(
-                 examples=i2v_examples_256,
-                 inputs=[prompt, image],
-                 outputs=[out],
-                 fn=hf_online_t2v_inference,
-                 cache_examples=False,
-             )
-         img_edge_ratio.change(
-             fn=limit_shape,
-             inputs=[image, w, h, img_edge_ratio],
-             outputs=[img_edge_ratio_infact, out_w, out_h],
-         )
-
-         video_length.change(
-             fn=limit_length, inputs=[video_length], outputs=[video_length]
-         )
-
-         btn1.click(
              fn=hf_online_t2v_inference,
              inputs=[
                  prompt,
@@ -313,116 +94,15 @@ with gr.Blocks(css=css) as demo:
                  w,
                  h,
                  video_length,
-                 img_edge_ratio_infact,
              ],
-             outputs=out,
          )

      with gr.Tab("Video to Video"):
-         if ignore_video2video:
-             gr.Markdown(
-                 (
-                     "Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally. \n"
-                     "We are trying to support video2video in the future. Thanks for your understanding."
-                 )
-             )
-         else:
-             with gr.Row():
-                 with gr.Column():
-                     prompt = gr.Textbox(label="Prompt")
-                     gr.Markdown(
-                         (
-                             "pose of VisionCondImage should be same as of the first frame of the video. "
-                             "its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL."
-                         )
-                     )
-                     image = gr.Image(label="VisionCondImage")
-                     video = gr.Video(label="ReferVideo")
-                     # radio = gr.inputs.Radio(, label="Select an option")
-                     # ctr_button = gr.inputs.Button(label="Add ControlNet List")
-                     # output_text = gr.outputs.Textbox()
-                     processor = gr.Textbox(
-                         label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}",
-                         value="dwpose_body_hand",
-                     )
-                     gr.Markdown("seed=-1 means that seeds are different in every run")
-                     seed = gr.Number(
-                         label="Seed (seed=-1 means that the seeds run each time are different)",
-                         value=-1,
-                     )
-                     video_length = gr.Number(label="Video Length", value=12)
-                     fps = gr.Number(label="Generate Video FPS", value=6)
-                     gr.Markdown(
-                         (
-                             "If W&H is -1, then use the Reference Image's Size. Size of target video is $(W, H)*img\_edge\_ratio$. \n"
-                             "The shorter the image size, the larger the motion amplitude, and the lower video quality.\n"
-                             "The longer the W&H, the smaller the motion amplitude, and the higher video quality.\n"
-                             "Due to the GPU VRAM limits, the W&H need smaller than 2000px"
-                         )
-                     )
-                     with gr.Row():
-                         w = gr.Number(label="Width", value=-1)
-                         h = gr.Number(label="Height", value=-1)
-                         img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
-
-                     with gr.Row():
-                         out_w = gr.Number(label="Width", value=0, interactive=False)
-                         out_h = gr.Number(label="Height", value=0, interactive=False)
-                         img_edge_ratio_infact = gr.Number(
-                             label="img_edge_ratio in fact",
-                             value=1.0,
-                             interactive=False,
-                         )
-                 btn2 = gr.Button("Generate")
-                 out1 = gr.Video()
-
-             v2v_examples_256 = [
-                 [
-                     "(masterpiece, best quality, highres:1), harley quinn is dancing, animation, by joshua klein",
-                     "../../data/demo/cyber_girl.png",
-                     "../../data/demo/video1.mp4",
-                 ],
-             ]
-             with gr.Row():
-                 gr.Examples(
-                     examples=v2v_examples_256,
-                     inputs=[prompt, image, video],
-                     outputs=[out],
-                     fn=hg_online_v2v_inference,
-                     cache_examples=False,
-                 )
-
-             img_edge_ratio.change(
-                 fn=limit_shape,
-                 inputs=[image, w, h, img_edge_ratio],
-                 outputs=[img_edge_ratio_infact, out_w, out_h],
-             )
-             video_length.change(
-                 fn=limit_length, inputs=[video_length], outputs=[video_length]
-             )
-             btn2.click(
-                 fn=hg_online_v2v_inference,
-                 inputs=[
-                     prompt,
-                     image,
-                     video,
-                     processor,
-                     seed,
-                     fps,
-                     w,
-                     h,
-                     video_length,
-                     img_edge_ratio_infact,
-                 ],
-                 outputs=out1,
-             )
-
-
- # Set the IP and port
- ip_address = "0.0.0.0" # Replace with your desired IP address
- port_number = 7860 # Replace with your desired port number
-

- demo.queue().launch(
-     share=True, debug=True, server_name=ip_address, server_port=port_number
- )
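For reference, the removed limit_shape helper above clamps the longer edge of the generated video to max_image_edge (960 px here) by shrinking the effective scale factor. A quick check of that arithmetic, using a hypothetical 1280x720 reference image and a requested 2x upscale:

# Hypothetical numbers reproducing the core arithmetic of the removed limit_shape().
max_image_edge = 960
input_w, input_h = 1280, 720   # assumed reference image size
img_edge_ratio = 2.0           # requested scale factor

img_edge_ratio_infact = min(max_image_edge / max(input_h, input_w), img_edge_ratio)
print(img_edge_ratio_infact)                                             # 0.75
print(input_w * img_edge_ratio_infact, input_h * img_edge_ratio_infact)  # 960.0 540.0

The rewritten file below drops this clamp (and the companion limit_length cap of 144 frames) and passes the user-supplied img_edge_ratio straight through to online_t2v_inference.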
  import os
  import time
+ import sys

  import cuid
  import gradio as gr
  import spaces
  import numpy as np

  from huggingface_hub import snapshot_download

+ # Add necessary paths
  ProjectDir = os.path.abspath(os.path.dirname(__file__))
  sys.path.insert(0, ProjectDir)
+ sys.path.insert(0, os.path.join(ProjectDir, "MMCM"))
+ sys.path.insert(0, os.path.join(ProjectDir, "diffusers/src"))
+ sys.path.insert(0, os.path.join(ProjectDir, "controlnet_aux/src"))

+ CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
  ignore_video2video = True
  max_image_edge = 960

  def download_model():
      if not os.path.exists(CheckpointsDir):
          print("Checkpoint Not Downloaded, start downloading...")

      else:
          print("Already download the model.")

+ # Download model first
+ print("Starting model download...")
+ download_model()

+ # Import after model download to ensure all dependencies are ready
  from gradio_text2video import online_t2v_inference

  @spaces.GPU(duration=180)
  def hf_online_t2v_inference(
      prompt,

      video_len,
      img_edge_ratio,
  ):
      if not isinstance(image_np, np.ndarray): # None
          raise gr.Error("Need input reference image")
      return online_t2v_inference(
          prompt, image_np, seed, fps, w, h, video_len, img_edge_ratio
      )

+ # Create Gradio interface
+ with gr.Blocks() as demo:
+     gr.Markdown("# MuseV Demo")
+
      with gr.Tab("Text to Video"):
          with gr.Row():
              with gr.Column():

                      value=-1,
                  )
                  video_length = gr.Number(
+                     label="Video Length(need smaller than 144)",
                      value=12,
                  )
                  fps = gr.Number(label="Generate Video FPS", value=6)
                  with gr.Row():
                      w = gr.Number(label="Width", value=-1)
                      h = gr.Number(label="Height", value=-1)
                      img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
+             btn = gr.Button("Generate")
+             video_output = gr.Video()
+
+         btn.click(
              fn=hf_online_t2v_inference,
              inputs=[
                  prompt,

                  w,
                  h,
                  video_length,
+                 img_edge_ratio,
              ],
+             outputs=video_output,
          )

      with gr.Tab("Video to Video"):
+         gr.Markdown(
+             "Due to GPU limit, MuseVDemo now only support Text2Video. If you want to try Video2Video, please run it locally."
+         )

+ # Launch the app
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860)
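
The body of download_model() is collapsed in this diff; only its first and last lines appear in the hunks above. For orientation, a hedged sketch of the usual snapshot_download-based pattern follows; the repo_id and max_workers values are assumptions, not taken from this commit.

# Sketch only - the real download_model() body is not shown in this diff.
import os

from huggingface_hub import snapshot_download

CheckpointsDir = os.path.join(os.path.abspath(os.path.dirname(__file__)), "checkpoints")

def download_model():
    if not os.path.exists(CheckpointsDir):
        print("Checkpoint Not Downloaded, start downloading...")
        snapshot_download(
            repo_id="TMElyralab/MuseV",  # assumed checkpoint repository
            local_dir=CheckpointsDir,    # matches the CheckpointsDir defined in the app
            max_workers=8,               # assumed download parallelism
        )
    else:
        print("Already download the model.")

Calling download_model() before importing gradio_text2video, as the updated file does, ensures the checkpoint files are already on disk when that module is imported, which is what the "# Import after model download to ensure all dependencies are ready" comment in the new code points at.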