symlink committed on
Commit 7055c14 · 1 Parent(s): 50cc1b6
Files changed (1)
app.py +46 -18
app.py CHANGED
@@ -1,24 +1,52 @@
- # Load model directly
- #from transformers import AutoProcessor, AutoModelForCausalLM
-
- #processor = AutoProcessor.from_pretrained("lmms-lab/LLaVA-NeXT-Video-32B-Qwen")
- #model = AutoModelForCausalLM.from_pretrained("lmms-lab/LLaVA-NeXT-Video-32B-Qwen")
-
- import gradio as gr
- from transformers import pipeline
-
- pipeline = pipeline(task="image-classification", model="llms-lab/LLaVA-NeXT-Video-32BQwen")
-
- def predict(input_img):
-     predictions = pipeline(input_img)
-     return input_img, {p["label"]: p["score"] for p in predictions}
-
- gradio_app = gr.Interface(
-     predict,
-     inputs=gr.Image(label="Select hot dog candidate", sources=['upload', 'webcam'], type="pil"),
-     outputs=[gr.Image(label="Processed Image"), gr.Label(label="Result", num_top_classes=2)],
-     title="Hot Dog? Or Not?",
- )
-
- if __name__ == "__main__":
-     gradio_app.launch()
+ import av
+ import torch
+ import numpy as np
+ from huggingface_hub import hf_hub_download
+ from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
+
+ def read_video_pyav(container, indices):
+     '''
+     Decode the video with PyAV decoder.
+     Args:
+         container (`av.container.input.InputContainer`): PyAV container.
+         indices (`List[int]`): List of frame indices to decode.
+     Returns:
+         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+     '''
+     frames = []
+     container.seek(0)
+     start_index = indices[0]
+     end_index = indices[-1]
+     for i, frame in enumerate(container.decode(video=0)):
+         if i > end_index:
+             break
+         if i >= start_index and i in indices:
+             frames.append(frame)
+     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+ # Load the model in half-precision
+ model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf", torch_dtype=torch.float16, device_map="auto")
+ processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
+
+ # Load the video as an np.array, sampling uniformly 8 frames (can sample more for longer videos)
+ video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
+ container = av.open(video_path)
+ total_frames = container.streams.video[0].frames
+ indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+ video = read_video_pyav(container, indices)
+
+ conversation = [
+     {
+         "role": "user",
+         "content": [
+             {"type": "text", "text": "Why is this video funny?"},
+             {"type": "video"},
+         ],
+     },
+ ]
+
+ prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+ inputs = processor(text=prompt, videos=video, return_tensors="pt")
+
+ out = model.generate(**inputs, max_new_tokens=60)
+ processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
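
As committed, app.py runs a single hard-coded generation, discards the decoded text, and no longer launches a Gradio interface the way the replaced version did, so the Space has nothing to serve. The sketch below is not part of this commit: it shows one way the new model could be put back behind a Gradio front end, reusing model, processor and read_video_pyav from the added code above; the helper name answer_about_video and the exact component layout are assumptions, not the author's method.

# Hypothetical follow-up, not in commit 7055c14.
# Assumes it is appended to app.py, where model, processor and
# read_video_pyav are already defined.
import av
import numpy as np
import gradio as gr

def answer_about_video(video_file, question):
    # Gradio passes the uploaded video as a file path.
    container = av.open(video_file)
    total_frames = container.streams.video[0].frames
    # Sample 8 frames uniformly across the clip, as in the script above.
    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
    clip = read_video_pyav(container, indices)

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question},
                {"type": "video"},
            ],
        },
    ]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(text=prompt, videos=clip, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=60)
    # Note: the decoded string still contains the prompt, as in the script above.
    return processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]

demo = gr.Interface(
    answer_about_video,
    inputs=[gr.Video(label="Video"), gr.Textbox(label="Question", value="Why is this video funny?")],
    outputs=gr.Textbox(label="Answer"),
    title="LLaVA-NeXT-Video demo",
)

if __name__ == "__main__":
    demo.launch()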