Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -20,19 +20,51 @@ model = AutoModelForCausalLM.from_pretrained("ManishThota/SparrowVQE",
|
|
20 |
tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
|
21 |
|
22 |
|
23 |
-
def process_video(video_bytes):
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
def predict_answer(image, video, question, max_tokens=100):
|
38 |
|
@@ -53,7 +85,7 @@ def predict_answer(image, video, question, max_tokens=100):
|
|
53 |
|
54 |
elif video:
|
55 |
# Process as a video
|
56 |
-
frames =
|
57 |
answers = []
|
58 |
for frame in frames:
|
59 |
frame = Image.open(frame).convert("RGB")
|
|
|
20 |
tokenizer = AutoTokenizer.from_pretrained("ManishThota/SparrowVQE", trust_remote_code=True)
|
21 |
|
22 |
|
23 |
+
# def process_video(video_bytes):
|
24 |
+
# """Extracts frames from the video, 1 per second."""
|
25 |
+
# video = cv2.VideoCapture(io.BytesIO(video_bytes))
|
26 |
+
# fps = video.get(cv2.CAP_PROP_FPS)
|
27 |
+
# frames = []
|
28 |
+
# success, frame = video.read()
|
29 |
+
# while success:
|
30 |
+
# frames.append(frame)
|
31 |
+
# for _ in range(int(fps)): # Skip fps frames
|
32 |
+
# success, frame = video.read()
|
33 |
+
# video.release()
|
34 |
+
# return frames[:4] # Return the first 4 frames
|
35 |
|
36 |
+
def video_to_frames(video_path, max_frames=None):
    """Decode a video file into a list of PNG-encoded frames.

    Args:
        video_path: Path (or any other source accepted by
            ``cv2.VideoCapture``) of the video to decode.
        max_frames: Optional upper bound on the number of frames collected.
            ``None`` (the default) keeps every decoded frame, which matches
            the previous behaviour but can use a lot of memory on long clips.

    Returns:
        A list of ``bytes`` objects, one PNG-encoded image per decoded frame,
        in playback order. The list is empty when the video cannot be opened.
        The elements are raw bytes, not file objects: wrap each one in
        ``io.BytesIO`` before handing it to ``PIL.Image.open``.
    """
    # Frames encoded as PNG byte strings, in the order they were read.
    frames_png = []

    cap = cv2.VideoCapture(video_path)
    try:
        # Bail out early (with an empty result) if the source is unreadable.
        if not cap.isOpened():
            print("Error opening video file")
            return frames_png

        while cap.isOpened():
            # Stop before decoding another frame once the cap is reached.
            if max_frames is not None and len(frames_png) >= max_frames:
                break

            ret, frame = cap.read()
            # ret is False at end of stream or on a read error.
            if not ret:
                print("Can't receive frame (stream end?). Exiting ...")
                break

            # imencode already returns a NumPy buffer; no extra copy needed.
            is_success, buffer = cv2.imencode(".png", frame)
            if is_success:
                frames_png.append(buffer.tobytes())
    finally:
        # Always free the capture handle, even if decoding/encoding raised.
        cap.release()

    return frames_png
|
68 |
|
69 |
def predict_answer(image, video, question, max_tokens=100):
|
70 |
|
|
|
85 |
|
86 |
elif video:
|
87 |
# Process as a video
|
88 |
+
frames = video_to_frames(video)
|
89 |
answers = []
|
90 |
for frame in frames:
|
91 |
frame = Image.open(frame).convert("RGB")
|