# Spaces: Runtime error
import os
import sys

import torch
from PIL import Image
from huggingface_hub import snapshot_download, hf_hub_download

from videogen_hub import MODEL_PATH
class SEINE():
    """Image-to-video wrapper that drives ``SEINEPipeline``."""

    def __init__(self):
        """
        1. Download the pretrained model and put it inside MODEL_PATH/SEINE
        2. Create Pipeline.

        Fetches ``seine.pt`` from the ``Vchitect/SEINE`` repo and a snapshot of
        ``CompVis/stable-diffusion-v1-4`` (skipping ``*pytorch_model.bin``,
        ``*fp16*`` and ``*non_ema*`` files), then builds the pipeline from
        both paths plus the ``sample_i2v.yaml`` config.
        """
        # Imported lazily so importing this module does not pull in the
        # pipeline until a model is actually instantiated.
        from videogen_hub.pipelines.seine.SEINEPipeline import SEINEPipeline

        model_dir = os.path.join(MODEL_PATH, "SEINE")
        seine_path = hf_hub_download(
            repo_id="Vchitect/SEINE",
            filename="seine.pt",
            local_dir=model_dir,
        )
        pretrained_model_path = snapshot_download(
            repo_id="CompVis/stable-diffusion-v1-4",
            local_dir=os.path.join(model_dir, "stable-diffusion-v1-4"),
            ignore_patterns=["*pytorch_model.bin", "*fp16*", "*non_ema*"],
        )

        # The original hard-coded path only resolves when the process is
        # started from the repository root.  Prefer the YAML that sits next to
        # the pipeline module; keep the legacy relative path as a fallback so
        # existing setups behave exactly as before.
        config_path = 'src/videogen_hub/pipelines/seine/sample_i2v.yaml'
        pipeline_module = sys.modules.get(SEINEPipeline.__module__)
        module_file = getattr(pipeline_module, "__file__", None)
        if module_file:
            packaged_config = os.path.join(
                os.path.dirname(os.path.abspath(module_file)),
                "sample_i2v.yaml",
            )
            if os.path.isfile(packaged_config):
                config_path = packaged_config

        self.pipeline = SEINEPipeline(seine_path, pretrained_model_path,
                                      config_path)

    def infer_one_video(self,
                        input_image: "Image.Image",
                        prompt: str = None,
                        size: list = None,
                        seconds: int = 2,
                        fps: int = 8,
                        seed: int = 42):
        """
        Generates a single video from a first-frame image and an optional text prompt.

        Args:
            input_image (PIL.Image.Image): The input image to use as the basis for video generation.
            prompt (str, optional): The text prompt that guides the video generation. Forwarded to the
                pipeline as ``text_prompt``. Defaults to None.
            size (list, optional): Resolution of the output video, documented as [height, width].
                Defaults to [320, 512] when omitted or None. A copy is handed to the pipeline, so the
                caller's sequence is never mutated.
            seconds (int, optional): The duration of the video in seconds. Defaults to 2.
            fps (int, optional): The number of frames per second in the generated video. Defaults to 8.
            seed (int, optional): Seed forwarded to the pipeline for reproducibility. Defaults to 42.

        Returns:
            Whatever ``self.pipeline.infer_one_video`` returns, unchanged; expected to be a
            torch.Tensor structured as (time, channel, height, width).

        Raises:
            ValueError: If ``seconds * fps`` yields fewer than one frame.
        """
        # Build a fresh list per call instead of sharing a mutable default.
        output_size = [320, 512] if size is None else list(size)

        # The pipeline is asked for a frame count; coerce so fractional
        # durations (e.g. 2.5 s) still produce an integer.
        num_frames = int(seconds * fps)
        if num_frames < 1:
            raise ValueError(
                f"seconds * fps must yield at least one frame, got {seconds} * {fps}"
            )

        video = self.pipeline.infer_one_video(input_image=input_image,
                                              text_prompt=prompt,
                                              output_size=output_size,
                                              num_frames=num_frames,
                                              seed=seed)
        return video