MEMO / memo /utils /vision_utils.py
fffiloni's picture
Update memo/utils/vision_utils.py
9645faf verified
import logging
import cv2
import numpy as np
import torch
from insightface.app import FaceAnalysis
from moviepy.editor import AudioFileClip, VideoClip
from PIL import Image
from torchvision import transforms
logger = logging.getLogger(__name__)
def tensor_to_video(tensor, output_video_path, input_audio_path, fps=30):
    """
    Converts a Tensor with shape [c, f, h, w] into a video and adds an audio track from the specified audio file.

    Pixel values are multiplied by 255 and clipped to [0, 255], so the tensor is
    expected to hold values in [0, 1]. The output duration is the shorter of the
    video length (f / fps) and the audio length; the longer stream is truncated.

    Args:
        tensor (Tensor): The Tensor to be converted, shaped [c, f, h, w].
        output_video_path (str): The file path where the output video will be saved.
        input_audio_path (str): The path to the audio file (WAV file) that contains the audio track to be added.
        fps (int): The frame rate of the output video. Default is 30 fps.
    """
    frames = tensor.detach()  # .numpy() refuses tensors that still require grad
    if frames.dtype == torch.bfloat16:
        # NumPy has no bfloat16 dtype, so upcast before leaving torch.
        frames = frames.float()
    frames = frames.permute(1, 2, 3, 0).cpu().numpy()  # convert to [f, h, w, c]
    frames = np.clip(frames * 255, 0, 255).astype(np.uint8)  # to [0, 255]
    num_frames = frames.shape[0]

    def make_frame(t):
        # The small epsilon guards against t * fps landing a hair below an
        # integer (floating-point rounding), which would repeat the previous frame.
        frame_index = min(int(t * fps + 1e-6), num_frames - 1)
        return frames[frame_index]

    video_duration = num_frames / fps
    audio_clip = AudioFileClip(input_audio_path)
    video_clip = None
    try:
        final_duration = min(video_duration, audio_clip.duration)
        trimmed_audio = audio_clip.subclip(0, final_duration)
        video_clip = VideoClip(make_frame, duration=final_duration).set_audio(trimmed_audio)
        video_clip.write_videofile(output_video_path, fps=fps, codec="libx264", audio_codec="aac")
    finally:
        # Release the audio reader even when encoding fails.
        if video_clip is not None:
            video_clip.close()
        audio_clip.close()
@torch.no_grad()
def preprocess_image(face_analysis_model: str, image_path: str, image_size: int = 512):
    """
    Preprocess the image and extract face embedding.

    The image is loaded as RGB, resized to (image_size, image_size), converted to
    a tensor and normalized with mean 0.5 / std 0.5. Face detection runs on the
    original-resolution image; when several faces are found, the one with the
    largest bounding-box area is used. When no face is found, a warning is logged
    and a 512-dimensional zero vector is returned as the embedding.

    Args:
        face_analysis_model (str): Path to the FaceAnalysis model directory.
        image_path (str): Path to the image file.
        image_size (int, optional): Target size for resizing the image. Default is 512.

    Returns:
        tuple: A tuple containing:
            - pixel_values (torch.Tensor): Tensor of the preprocessed image,
              shaped [1, 3, image_size, image_size].
            - face_emb (torch.Tensor): Tensor of the face embedding, shaped [1, D]
              (D is 512 for the zero-vector fallback).
    """
    # Define the image transformation
    transform = transforms.Compose(
        [
            transforms.Resize((image_size, image_size)),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
    )

    # Initialize the FaceAnalysis model
    face_analysis = FaceAnalysis(
        root=face_analysis_model,
        providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
    )
    face_analysis.prepare(ctx_id=0, det_size=(640, 640))

    # Load the image; the context manager closes the underlying file promptly,
    # while convert() returns an independent, fully loaded copy.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")
    pixel_values = transform(image).unsqueeze(0)

    # Detect faces and extract the face embedding
    image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    faces = face_analysis.get(image_bgr)
    del face_analysis  # drop the detector reference as soon as it is no longer needed

    if not faces:
        logger.warning("No faces detected in the image. Using a zero vector as the face embedding.")
        # float32 keeps the fallback dtype consistent with model-produced embeddings
        # (NOTE(review): assumes insightface embeddings are float32 — confirm).
        face_emb = np.zeros(512, dtype=np.float32)
    else:
        # Select the face with the largest bounding-box area
        largest_face = max(
            faces,
            key=lambda x: (x["bbox"][2] - x["bbox"][0]) * (x["bbox"][3] - x["bbox"][1]),
        )
        face_emb = largest_face["embedding"]

    # Convert face embedding to a PyTorch tensor of shape [1, D]
    face_emb = torch.tensor(face_emb.reshape(1, -1))
    return pixel_values, face_emb