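"""Create a promotional montage for the AirLetters dataset.

The script tiles hundreds of looping clips into a grid, slowly zooms in,
and overlays two fading title cards. Written against the moviepy 1.x
``moviepy.editor`` API and OpenCV.
"""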
import os
import cv2
import numpy as np
from moviepy.editor import VideoFileClip, CompositeVideoClip, ColorClip, ImageClip
from tqdm import tqdm
import glob
import concurrent.futures
import time
import random
def create_text_overlay(text, subtitle, width, height, start_time, duration):
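    """Render `text` and `subtitle` on a translucent box and return a
    timed moviepy overlay clip that cross-fades in and out."""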
    overlay = np.zeros((height, width, 4), dtype=np.uint8)
    box_width = int(width * 0.75)
    box_x_start = (width - box_width) // 2
    cv2.rectangle(overlay,
                  (box_x_start, height // 3),
                  (box_x_start + box_width, 2 * height // 3),
                  (0, 0, 0, 180), -1)
    font = cv2.FONT_HERSHEY_DUPLEX
    if "AirLetters" in text:
        title_scale = 3.0
        subtitle_scale = 1.5
        envelope_emoji = cv2.imread("emoji/envelope.png", cv2.IMREAD_UNCHANGED)
        wind_emoji = cv2.imread("emoji/wind.png", cv2.IMREAD_UNCHANGED)
        target_height = int(title_scale * 30)
        envelope_aspect = envelope_emoji.shape[1] / envelope_emoji.shape[0]
        wind_aspect = wind_emoji.shape[1] / wind_emoji.shape[0]
        envelope_emoji = cv2.resize(envelope_emoji,
                                    (int(target_height * envelope_aspect), target_height))
        wind_emoji = cv2.resize(wind_emoji,
                                (int(target_height * wind_aspect), target_height))
    else:
        title_scale = 2.0
        subtitle_scale = 1.0
    title_color = (138, 223, 178, 255)
    subtitle_color = (255, 255, 255, 255)
    title_size = cv2.getTextSize(text, font, title_scale, 2)[0]
    # Center text within the box
    title_x = box_x_start + (box_width - title_size[0]) // 2
    title_y = height // 2
    if "AirLetters" in text:
        emoji_y = title_y - target_height + 5
        envelope_x = title_x - envelope_emoji.shape[1] - 20
        wind_x = title_x + title_size[0] + 20
        def overlay_image_with_alpha(background, foreground, x, y):
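            """Alpha-blend `foreground` onto `background` in place at (x, y)."""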
            if x >= background.shape[1] or y >= background.shape[0]:
                return
            h, w = foreground.shape[:2]
            if len(foreground.shape) == 2:
                # Single-channel image: treat intensity as the alpha mask.
                alpha = foreground / 255.0
                foreground = cv2.cvtColor(foreground, cv2.COLOR_GRAY2BGR)
            else:
                alpha = foreground[:, :, 3] / 255.0
                foreground = foreground[:, :, :3]
            # Clip the paste region to the background bounds.
            y1, y2 = max(0, y), min(background.shape[0], y + h)
            x1, x2 = max(0, x), min(background.shape[1], x + w)
            alpha_slice = alpha[y1 - y:y2 - y, x1 - x:x2 - x]
            alpha_expanded = np.expand_dims(alpha_slice, axis=-1)
            background_slice = background[y1:y2, x1:x2, :3]
            foreground_slice = foreground[y1 - y:y2 - y, x1 - x:x2 - x]
            background[y1:y2, x1:x2, :3] = background_slice * (1 - alpha_expanded) + foreground_slice * alpha_expanded
            background[y1:y2, x1:x2, 3] = background[y1:y2, x1:x2, 3] * (1 - alpha_slice) + 255 * alpha_slice
        overlay_image_with_alpha(overlay, envelope_emoji, envelope_x, emoji_y)
        overlay_image_with_alpha(overlay, wind_emoji, wind_x, emoji_y)
    else:
        # Wrap long subtitles onto two roughly equal lines.
        if len(subtitle) > 50:
            words = subtitle.split()
            mid = len(words) // 2
            subtitle = " ".join(words[:mid]) + "\n" + " ".join(words[mid:])
    cv2.putText(overlay, text, (title_x, title_y), font, title_scale, title_color, 2)
if "\n" in subtitle: | |
subtitle_lines = subtitle.split("\n") | |
subtitle_y = title_y + 50 | |
for line in subtitle_lines: | |
subtitle_size = cv2.getTextSize(line, font, subtitle_scale, 2)[0] | |
subtitle_x = box_x_start + (box_width - subtitle_size[0]) // 2 | |
cv2.putText(overlay, line, (subtitle_x, subtitle_y), font, subtitle_scale, subtitle_color, 2) | |
subtitle_y += 50 | |
else: | |
subtitle_size = cv2.getTextSize(subtitle, font, subtitle_scale, 2)[0] | |
subtitle_x = box_x_start + (box_width - subtitle_size[0]) // 2 | |
cv2.putText(overlay, subtitle, (subtitle_x, title_y + 60), font, subtitle_scale, subtitle_color, 2) | |
    # Wrap the rendered BGRA image in an ImageClip and use its alpha channel
    # as the clip mask, so the overlay composites and cross-fades correctly.
    overlay_clip = ImageClip(overlay[:, :, :3])
    overlay_clip.mask = ImageClip(overlay[:, :, 3] / 255.0, ismask=True)
    overlay_clip = overlay_clip.set_start(start_time).set_duration(duration)
    overlay_clip = overlay_clip.crossfadein(0.5).crossfadeout(0.5)
    return overlay_clip
def load_video(args):
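    """Load one grid cell: resize, center-crop to a square, loop, add a white
    border, and position the tile at its grid slot. Returns None on failure."""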
    video_path, target_size, padding, idx, grid_width = args
    try:
        clip = VideoFileClip(video_path, audio=False)
        clip = clip.resize(height=target_size)
        if clip.w > target_size:
            clip = clip.crop(x1=(clip.w - target_size) // 2,
                             x2=(clip.w + target_size) // 2)
        clip = clip.loop()
        bg = ColorClip(size=(target_size + padding * 2, target_size + padding * 2),
                       color=(255, 255, 255))
        clip = clip.set_position((padding, padding))
        clip = CompositeVideoClip([bg, clip])
        x = (idx % grid_width) * (target_size + padding * 2)
        y = (idx // grid_width) * (target_size + padding * 2)
        clip = clip.set_position((x, y))
        return clip
    except Exception as e:
        print(f"\nError processing {video_path}: {e}")
        return None
def create_montage(video_dir, output_path, width=1920, height=1080, fps=30):
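    """Build the montage: 5 s of the full grid, a 5 s zoom transition, and
    5 s of the zoomed view, with a title card over each static phase."""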
print("Starting video creation...") | |
start_time = time.time() | |
TOTAL_DURATION = 15 | |
FIRST_PHASE = 5 | |
TRANSITION = 5 | |
FINAL_PHASE = 5 | |
video_paths = glob.glob(os.path.join(video_dir, "*.mp4")) | |
base_grid_videos = 400 | |
aspect_ratio = 16/9 | |
grid_width = int(np.sqrt(base_grid_videos * aspect_ratio)) | |
grid_height = int(np.sqrt(base_grid_videos / aspect_ratio)) | |
padding = 1 | |
target_size = min(width // grid_width, height // grid_height) - padding*2 | |
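    # The grid targets roughly base_grid_videos square tiles at the frame's
    # 16:9 aspect ratio (26 x 15 = 390 tiles with the defaults above).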
print(f"Creating grid of {grid_width}x{grid_height} videos") | |
print(f"Video size: {target_size}x{target_size} pixels") | |
needed_videos = grid_width * grid_height | |
if len(video_paths) > needed_videos: | |
video_paths = random.sample(video_paths, needed_videos) | |
args_list = [(path, target_size, padding, idx, grid_width) | |
for idx, path in enumerate(video_paths)] | |
    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        futures = list(tqdm(
            executor.map(load_video, args_list),
            total=len(args_list),
            desc="Loading videos"
        ))
    clips = [clip for clip in futures if clip is not None]
    if not clips:
        raise ValueError("No videos were successfully loaded!")
    bg = ColorClip((width, height), color=(0, 0, 0))
    video_clips = [bg] + clips
    print("Creating video composition...")
    video_comp = CompositeVideoClip(video_clips, size=(width, height))
    w, h = video_comp.size
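    # Crop window size over time: full frame during the first phase, then a
    # linear shrink to one third of the frame (an effective 3x zoom).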
    def get_zoom_crop(t):
        if t < FIRST_PHASE:
            return (w, h)
        elif t < FIRST_PHASE + TRANSITION:
            progress = (t - FIRST_PHASE) / TRANSITION
            zoom_factor = 1 + progress * 2
        else:
            zoom_factor = 3
        return (int(w / zoom_factor), int(h / zoom_factor))
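    # Frame filter: center-crop each frame to the current zoom window and
    # scale it back up to the full output size.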
    def apply_zoom(gf, t):
        frame = gf(t)
        cw, ch = get_zoom_crop(t)
        if cw >= w or ch >= h:
            return frame
        x = (w - cw) // 2
        y = (h - ch) // 2
        cropped = frame[y:y + ch, x:x + cw]
        return cv2.resize(cropped, (w, h), interpolation=cv2.INTER_LINEAR)

    video_comp = video_comp.fl(apply_zoom)
    video_comp = video_comp.set_duration(TOTAL_DURATION)
    text1 = create_text_overlay(
        "AirLetters",
        "\nAn Open Video Dataset of Characters Drawn in the Air",
        width, height, 0, FIRST_PHASE
    )
    text2 = create_text_overlay(
        "Novel Video Understanding Benchmark",
        "for evaluating the understanding of articulated motions, a task that demands strong temporal reasoning and remains very challenging for current models",
        width, height, FIRST_PHASE + TRANSITION, FINAL_PHASE
    )
    final = CompositeVideoClip([video_comp, text1, text2])
    print("Writing final video...")
    final.write_videofile(
        output_path,
        fps=fps,
        codec='libx264',
        audio=False,
        threads=16,
        logger='bar'
    )
    print("Cleaning up...")
    final.close()
    for clip in clips:
        if clip is not None:
            clip.close()
    print(f"\nTotal processing time: {time.time() - start_time:.2f} seconds")
    print(f"Output saved to: {output_path}")
if __name__ == "__main__":
    create_montage(
        video_dir="airletters/videos",
        output_path="30fps.mp4",
        fps=30,
    )