diqiu7 committed
Commit 8f86798 · verified · 1 Parent(s): 6e29211

Update app.py

Files changed (1): app.py (+60, -24)
app.py CHANGED
@@ -1,6 +1,7 @@
 import os
 os.system("pip install pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu121_pyt221/download.html")
 import shutil
+import math
 
 from huggingface_hub import snapshot_download
 
@@ -103,6 +104,22 @@ def save_video_with_audio(video_path, audio_path, save_path):
     audio_clip.close()
     return save_path
 
+def pad_video(driving_frames, fps=25):
+    video_length = len(driving_frames)
+
+    duration = video_length / fps
+    target_times = np.arange(0, duration, 1/12)
+    frame_indices = (target_times * fps).astype(np.int32)
+
+    frame_indices = frame_indices[frame_indices < video_length]
+    new_driving_frames = []
+    for idx in frame_indices:
+        new_driving_frames.append(driving_frames[idx])
+
+    pad_length = math.ceil(len(new_driving_frames) / 48) * 48 - len(new_driving_frames)
+    new_driving_frames.extend([new_driving_frames[-1]] * pad_length)
+    return new_driving_frames, pad_length
+
 # Global parameters
 model_name = "pretrained_models/SkyReels-A1-5B/"
 siglip_name = "pretrained_models/SkyReels-A1-5B/siglip-so400m-patch14-384"
@@ -204,9 +221,13 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     out_frames = processor.preprocess_lmk3d_from_coef(
         source_outputs, source_tform, image_original.shape, driving_outputs
     )
-    out_frames = parse_video(out_frames, max_frame_num)
+    out_frames, pad_length = pad_video(out_frames)
+    print(len(out_frames), pad_length)
+
+
+    # out_frames = parse_video(out_frames, max_frame_num)
 
-    rescale_motions = np.zeros_like(image)[np.newaxis, :].repeat(48, axis=0)
+    rescale_motions = np.zeros_like(image)[np.newaxis, :].repeat(len(out_frames), axis=0)
     for ii in range(rescale_motions.shape[0]):
         rescale_motions[ii][y1:y1+face_h, x1:x1+face_w] = out_frames[ii]
 
@@ -222,8 +243,8 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     first_motion[y1:y1+face_h, x1:x1+face_w] = ref_img
     first_motion = first_motion[np.newaxis, :]
 
-    motions = np.concatenate([first_motion, rescale_motions])
-    input_video = motions[:max_frame_num]
+    # motions = np.concatenate([first_motion, rescale_motions])
+    # input_video = motions[:max_frame_num]
 
     # Face alignment
     face_helper.clean_all()
@@ -234,29 +255,44 @@ def process_image_audio(image_path, audio_path, guidance_scale=3.0, steps=10, pr
     image_face = align_face[:, :, ::-1]
 
     # Prepare input video
-    input_video = torch.from_numpy(np.array(input_video)).permute([3, 0, 1, 2]).unsqueeze(0)
-    input_video = input_video / 255
+    # input_video = torch.from_numpy(np.array(input_video)).permute([3, 0, 1, 2]).unsqueeze(0)
+    # input_video = input_video / 255
 
     progress(0.6, desc="Generating animation (this may take a while)...")
     # Generate video
-    with torch.no_grad():
-        sample = pipe(
-            image=image,
-            image_face=image_face,
-            control_video=input_video,
-            prompt="",
-            negative_prompt="",
-            height=480,
-            width=720,
-            num_frames=49,
-            # generator=generator,
-            guidance_scale=guidance_scale,
-            num_inference_steps=steps,
-        )
-    out_samples = sample.frames[0]
-
-    out_samples = out_samples[2:] # Skip first two frames
-
+    out_samples = []
+    for i in range(0, len(rescale_motions), 48):
+        motions = np.concatenate([first_motion, rescale_motions[i:i+48]])
+        input_video = motions
+        input_video = torch.from_numpy(np.array(input_video)).permute([3, 0, 1, 2]).unsqueeze(0)
+        input_video = input_video / 255
+
+        with torch.no_grad():
+            sample = pipe(
+                image=image,
+                image_face=image_face,
+                control_video=input_video,
+                prompt="",
+                negative_prompt="",
+                height=480,
+                width=720,
+                num_frames=49,
+                # generator=generator,
+                guidance_scale=guidance_scale,
+                num_inference_steps=steps,
+            )
+        if i == 0:
+            out_samples.extend(sample.frames[0])
+        else:
+            out_samples.extend(sample.frames[0][1:])
+        # out_samples = sample.frames[0]
+
+    # out_samples = out_samples[2:] # Skip first two frames
+    if pad_length == 0:
+        out_samples = out_samples[1:]
+    else:
+        out_samples = out_samples[1:-pad_length]
+
    progress(0.8, desc="Creating output video...")
     # Export video
     export_to_video(out_samples, temp_video_path, fps=12)
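
Net effect of the diff: the single 49-frame pipe() call is replaced by a loop over 48-frame motion chunks, with pad_video() resampling the 25 fps driving frames to 12 fps and padding to a multiple of 48, and the final if/else trimming the leading reference frame plus the padded tail. Below is a minimal, standalone sketch of that bookkeeping with the diffusion pipeline stubbed out; run_pipe_stub and generate_chunked are illustrative names only, not functions in app.py.

import math
import numpy as np

def pad_video(driving_frames, fps=25):
    # Resample driving frames from `fps` to 12 fps, then pad with the last
    # frame so the total is a multiple of 48 (one pipeline call per chunk).
    video_length = len(driving_frames)
    duration = video_length / fps
    target_times = np.arange(0, duration, 1 / 12)
    frame_indices = (target_times * fps).astype(np.int32)
    frame_indices = frame_indices[frame_indices < video_length]
    new_frames = [driving_frames[i] for i in frame_indices]
    pad_length = math.ceil(len(new_frames) / 48) * 48 - len(new_frames)
    new_frames.extend([new_frames[-1]] * pad_length)
    return new_frames, pad_length

def run_pipe_stub(control_video):
    # Stand-in for pipe(..., num_frames=49): one output frame per control frame.
    return [("frame", i) for i in range(len(control_video))]

def generate_chunked(first_motion, rescale_motions, pad_length):
    # Mirrors the loop in app.py: each chunk is first_motion + 48 motion frames,
    # and every chunk after the first drops its leading (reference) frame.
    out_samples = []
    for i in range(0, len(rescale_motions), 48):
        control_video = [first_motion] + list(rescale_motions[i:i + 48])
        frames = run_pipe_stub(control_video)
        out_samples.extend(frames if i == 0 else frames[1:])
    # Drop the leading reference frame, then any padded tail frames.
    return out_samples[1:] if pad_length == 0 else out_samples[1:-pad_length]

if __name__ == "__main__":
    # 100 driving frames at 25 fps -> 48 frames at 12 fps, so no padding is needed.
    driving, pad_length = pad_video(list(range(100)), fps=25)
    print(len(driving), pad_length)   # 48 0
    out = generate_chunked("ref", driving, pad_length)
    print(len(out))                   # 48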