import gradio as gr
import torch
import numpy as np
from diffusers import I2VGenXLPipeline
from transformers import MusicgenForConditionalGeneration, AutoProcessor
from PIL import Image
from moviepy.editor import ImageSequenceClip
import scipy.io.wavfile
import ffmpeg
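# This script wires three stages into a single Gradio demo:
#   1. generate_video(): turns an input image and a text prompt into a short
#      silent clip with the ali-vilab/i2vgen-xl image-to-video pipeline.
#   2. generate_music(): synthesizes a soundtrack from a text prompt (or
#      unconditionally) with facebook/musicgen-small.
#   3. combine_audio_video(): muxes the two files together with ffmpeg.
# Note: ffmpeg-python only builds the command line; the ffmpeg binary itself
# must be installed and available on PATH.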
def generate_video(image, prompt, negative_prompt, video_length):
    generator = torch.manual_seed(8888)

    # Use Apple's MPS backend when available, otherwise fall back to the CPU
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print(f"Using device: {device}")

    # Load the image-to-video pipeline and move it to the selected device
    pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float32)
    pipeline.to(device)

    # Generate the frames one at a time so progress can be reported
    fps = 30  # Frames per second, used both for frame count and for the output clip
    frames = []
    total_frames = int(video_length) * fps
    for i in range(total_frames):
        output = pipeline(
            prompt=prompt,
            image=image,
            num_inference_steps=1,
            negative_prompt=negative_prompt,
            guidance_scale=9.0,
            generator=generator,
            num_frames=1,
        )
        frame = output.frames[0]
        # Recent diffusers releases return a nested list (one frame list per prompt)
        if isinstance(frame, list):
            frame = frame[0]
        frames.append(np.array(frame))
        print(f"Generated frame {i + 1}/{total_frames}")

    # Create a video clip from the frames
    output_file = "output_video.mp4"
    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(output_file, codec='libx264', audio=False)
    return output_file
def generate_music(prompt, unconditional=False):
    model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Generate music, either unconditionally or from the text prompt
    if unconditional:
        unconditional_inputs = model.get_unconditional_inputs(num_samples=1)
        audio_values = model.generate(**unconditional_inputs, do_sample=True, max_new_tokens=256)
    else:
        processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        inputs = processor(
            text=prompt,
            padding=True,
            return_tensors="pt",
        )
        audio_values = model.generate(**inputs.to(device), do_sample=True, guidance_scale=3, max_new_tokens=256)

    sampling_rate = model.config.audio_encoder.sampling_rate
    audio_file = "musicgen_out.wav"

    # Ensure the audio is 1D before writing it out
    audio_data = audio_values[0].cpu().numpy()
    if audio_data.ndim > 1:
        audio_data = audio_data[0]  # Take the first channel if stereo

    # Scale the audio to 16-bit PCM
    audio_data = np.clip(audio_data, -1.0, 1.0)  # Ensure values are in the range [-1, 1]
    audio_data = (audio_data * 32767).astype(np.int16)

    # Save the generated audio
    scipy.io.wavfile.write(audio_file, sampling_rate, audio_data)
    return audio_file
def combine_audio_video(audio_file, video_file):
    output_file = "combined_output.mp4"
    audio = ffmpeg.input(audio_file)
    video = ffmpeg.input(video_file)
    # Copy the video stream as-is and encode the music track as AAC
    output = ffmpeg.output(video, audio, output_file, vcodec='copy', acodec='aac')
    ffmpeg.run(output, overwrite_output=True)  # Overwrite any previous result without prompting
    return output_file
def interface(image_path, prompt, negative_prompt, video_length, music_prompt, unconditional):
    image = Image.open(image_path)
    video_file = generate_video(image, prompt, negative_prompt, video_length)
    audio_file = generate_music(music_prompt, unconditional)
    combined_file = combine_audio_video(audio_file, video_file)
    return combined_file
with gr.Blocks() as demo:
    gr.Markdown("# AI-Powered Video and Music Generation")
    with gr.Row():
        image_input = gr.Image(type="filepath", label="Upload Image")
        prompt_input = gr.Textbox(label="Enter the Video Prompt")
        negative_prompt_input = gr.Textbox(label="Enter the Negative Prompt")
        video_length_input = gr.Number(label="Video Length (seconds)", value=10, precision=0)
        music_prompt_input = gr.Textbox(label="Enter the Music Prompt")
        unconditional_checkbox = gr.Checkbox(label="Generate Unconditional Music")
    generate_button = gr.Button("Generate Video and Music")
    output_video = gr.Video(label="Output Video with Sound")

    generate_button.click(
        interface,
        inputs=[image_input, prompt_input, negative_prompt_input, video_length_input, music_prompt_input, unconditional_checkbox],
        outputs=output_video,
        show_progress=True,
    )

demo.launch()
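# Running this file starts a local Gradio server (by default at http://127.0.0.1:7860)
# where the image, prompts, and music options can be entered interactively.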