ManishThota committed
Commit 3e1dcd9 · verified · 1 Parent(s): 854d6ec

Create app.py

Files changed (1):
  app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
+ from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
+ import torch
+ import numpy as np
+ import av
+ import spaces
+ import gradio as gr
+ import os
+
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.float16
+ )
+
+ model_name = 'llava-hf/LLaVA-NeXT-Video-7B-DPO-hf'
+
+ processor = LlavaNextVideoProcessor.from_pretrained(model_name)
+ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+     model_name,
+     quantization_config=quantization_config,
+     device_map='auto'
+ )
+
+ @spaces.GPU
+ def read_video_pyav(container, indices):
+     '''
+     Decode the video with the PyAV decoder.
+     Args:
+         container (av.container.input.InputContainer): PyAV container.
+         indices (List[int]): List of frame indices to decode.
+     Returns:
+         np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
+     '''
+     frames = []
+     container.seek(0)
+     start_index = indices[0]
+     end_index = indices[-1]
+     for i, frame in enumerate(container.decode(video=0)):
+         if i > end_index:
+             break
+         if i >= start_index and i in indices:
+             frames.append(frame)
+     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+ @spaces.GPU
+ def process_video(video_file, question):
+     # Open video and sample frames
+     with av.open(video_file.name) as container:  # Access file name from Gradio input
+         total_frames = container.streams.video[0].frames
+         indices = np.arange(0, total_frames, total_frames / 8).astype(int)
+         video_clip = read_video_pyav(container, indices)
+
+     # Prepare conversation
+     conversation = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "text", "text": f"{question}"},
+                 {"type": "video"},
+             ],
+         },
+     ]
+     prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+     # Prepare inputs for the model
+     inputs = processor([prompt], videos=[video_clip], padding=True, return_tensors="pt").to(model.device)
+
+     # Generate output
+     generate_kwargs = {"max_new_tokens": 100, "do_sample": True, "top_p": 0.9}
+     output = model.generate(**inputs, **generate_kwargs)
+     generated_text = processor.batch_decode(output, skip_special_tokens=True)[0]
+
+     return generated_text.split("ASSISTANT: ", 1)[-1].strip()
+
+ @spaces.GPU
+ def process_videos(video_files, question):
+     """Processes multiple videos and answers a single question for each."""
+     answers = []
+     for video_file in video_files:
+         video_name = os.path.basename(video_file.name)
+         answer = process_video(video_file, question)
+         answers.append(f"**Video: {video_name}**\n{answer}\n")
+     return "\n---\n".join(answers)
+
+ # Define Gradio interface for multiple videos
+ def gradio_interface(videos, question):
+     answers = process_videos(videos, question)
+     return answers
+
+ iface = gr.Interface(
+     fn=gradio_interface,
+     inputs=[
+         gr.File(label="Upload Videos", file_count="multiple"),
+         gr.Textbox(label="Enter Your Question")
+     ],
+     outputs=gr.Textbox(label="Generated Answers"),
+     title="Video Question Answering",
+     description="Upload multiple videos and ask a single question to receive answers tailored to each video."
+ )
+
+ if __name__ == "__main__":
+     iface.launch(debug=True)
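
Note: the commit adds only app.py. A minimal requirements.txt sketch implied by the imports above (the exact package set is an inference, not part of this commit; bitsandbytes backs the 4-bit quantization_config and accelerate backs device_map='auto'):

transformers
torch
numpy
av
gradio
spaces
bitsandbytes
accelerate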