import gradio as gr
import os
from gradio_client import Client, handle_file
import numpy as np 
import tempfile
import imageio

import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

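# Load Zeroscope v2 576w in fp16, switch to the DPM-Solver multistep
# scheduler for faster sampling, and offload submodules to CPU to
# reduce GPU memory usage.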
pipe = DiffusionPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe.enable_model_cpu_offload()

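# Optional HF token read from the environment, used to authenticate the
# call to the Kosmos-2 captioning Space.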
hf_token = os.environ.get("HF_TOKEN")

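# Caption an image by calling the fffiloni/Kosmos-2-API Space through
# gradio_client, then join the returned tokens into a single sentence.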
def get_caption(image_in):
    kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
    kosmos2_result = kosmos2_client.predict(
        image_input=handle_file(image_in),
        text_input="Detailed",
        api_name="/generate_predictions"
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")

    data = kosmos2_result[1]

    # Extract and combine tokens starting from the second element
    sentence = ''.join(item['token'] for item in data[1:])

    return sentence

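# Encode an array of float frames (values in [0, 1]) into a temporary MP4
# file and return its path.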
def export_to_video(frames: np.ndarray, fps: int) -> str:
    frames = np.clip((frames * 255), 0, 255).astype(np.uint8)
    out_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    writer = imageio.get_writer(out_file.name, format="FFMPEG", fps=fps)
    for frame in frames:
        writer.append_data(frame)
    writer.close()
    return out_file.name

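# End-to-end inference: caption the uploaded image with Kosmos-2, then feed
# that caption to Zeroscope to generate a 24-frame 576x320 clip at 12 fps.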
def infer(image_init, progress=gr.Progress(track_tqdm=True)):
    prompt = get_caption(image_init)
    video_frames = pipe(prompt, num_inference_steps=40, height=320, width=576, num_frames=24).frames[0]
    video_path = export_to_video(video_frames, 12)
    print(video_path)
    return prompt, video_path

css = """
#col-container {max-width: 510px; margin-left: auto; margin-right: auto;}
a {text-decoration-line: underline; font-weight: 600;}
.animate-spin {
  animation: spin 1s linear infinite;
}

@keyframes spin {
  from {
      transform: rotate(0deg);
  }
  to {
      transform: rotate(360deg);
  }
}

#share-btn-container {
  display: flex; 
  padding-left: 0.5rem !important; 
  padding-right: 0.5rem !important; 
  background-color: #000000; 
  justify-content: center; 
  align-items: center; 
  border-radius: 9999px !important; 
  max-width: 13rem;
}

#share-btn-container:hover {
  background-color: #060606;
}

#share-btn {
  all: initial; 
  color: #ffffff;
  font-weight: 600; 
  cursor:pointer; 
  font-family: 'IBM Plex Sans', sans-serif; 
  margin-left: 0.5rem !important; 
  padding-top: 0.5rem !important; 
  padding-bottom: 0.5rem !important;
  right:0;
}

#share-btn * {
  all: unset;
}

#share-btn-container div:nth-child(-n+2){
  width: auto !important;
  min-height: 0px !important;
}

#share-btn-container .wrap {
  display: none !important;
}

#share-btn-container.hidden {
  display: none!important;
}
img[src*='#center'] { 
    display: block;
    margin: auto;
}
"""

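# Single-column UI: image upload, submit button, caption readout, and video output.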
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(
            """
            <h1 style="text-align: center;">Zeroscope Image-to-Video</h1>
            <p style="text-align: center;">
            A watermark-free ModelScope-based video model optimized for producing high-quality 16:9 compositions and smooth video output. <br />
            This demo is a variation that lets you upload an image as a reference for video generation.
            </p>
            
            [![Duplicate this Space](https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm.svg#center)](https://huggingface.co/spaces/fffiloni/zeroscope-img-to-video?duplicate=true)
            
            """
        )

        image_init = gr.Image(label="Image Init", type="filepath", sources=["upload"], elem_id="image-init")
        submit_btn = gr.Button("Submit")
        coca_cap = gr.Textbox(label="Caption", placeholder="Kosmos-2 caption will be displayed here", elem_id="coca-cap-in")
        video_result = gr.Video(label="Video Output", elem_id="video-output")

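    # Generate the caption and video on click; keep this endpoint out of the public API.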
    submit_btn.click(
        fn=infer,
        inputs=[image_init],
        outputs=[coca_cap, video_result],
        show_api=False
    )

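# Queue requests (up to 12 waiting) and hide the auto-generated API docs.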
demo.queue(max_size=12).launch(show_api=False)