Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -83,34 +83,46 @@ def predict_answer(video, image, question):
|
|
83 |
return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
84 |
|
85 |
elif video:
|
86 |
-
# Process as a video
|
87 |
frames = video_to_frames(video)
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
image_tensor = model.image_preprocess([image])
|
92 |
-
|
93 |
-
# Generate the answer
|
94 |
output_ids = model.generate(
|
95 |
input_ids,
|
96 |
max_new_tokens=25,
|
97 |
images=image_tensor,
|
98 |
use_cache=True)[0]
|
99 |
|
100 |
-
|
101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
102 |
|
103 |
-
# Modify this logic based on your specific needs
|
104 |
-
most_common_answer = Counter(answers).most_common(1)[0][0]
|
105 |
|
106 |
-
# Safely evaluate the most common answer assuming it's a string representation of a Python literal
|
107 |
-
try:
|
108 |
-
|
109 |
-
except (ValueError, SyntaxError):
|
110 |
-
|
111 |
-
|
112 |
|
113 |
-
return evaluated_answer
|
114 |
|
115 |
# return ast.literal_eval(answers[0])
|
116 |
|
|
|
83 |
return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
84 |
|
85 |
elif video:
|
|
|
86 |
frames = video_to_frames(video)
|
87 |
+
image = extract_frames(frames[2])
|
88 |
+
image_tensor = model.image_preprocess([image])
|
89 |
+
# Generate the answer
|
|
|
|
|
|
|
90 |
output_ids = model.generate(
|
91 |
input_ids,
|
92 |
max_new_tokens=25,
|
93 |
images=image_tensor,
|
94 |
use_cache=True)[0]
|
95 |
|
96 |
+
return tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
97 |
+
|
98 |
+
# # Process as a video
|
99 |
+
# frames = video_to_frames(video)
|
100 |
+
# answers = []
|
101 |
+
# for frame in frames:
|
102 |
+
# image = extract_frames(frame)
|
103 |
+
# image_tensor = model.image_preprocess([image])
|
104 |
+
|
105 |
+
# # Generate the answer
|
106 |
+
# output_ids = model.generate(
|
107 |
+
# input_ids,
|
108 |
+
# max_new_tokens=25,
|
109 |
+
# images=image_tensor,
|
110 |
+
# use_cache=True)[0]
|
111 |
+
|
112 |
+
# answer = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
|
113 |
+
# answers.append(answer)
|
114 |
|
115 |
+
# # Modify this logic based on your specific needs
|
116 |
+
# most_common_answer = Counter(answers).most_common(1)[0][0]
|
117 |
|
118 |
+
# # Safely evaluate the most common answer assuming it's a string representation of a Python literal
|
119 |
+
# try:
|
120 |
+
# evaluated_answer = ast.literal_eval(most_common_answer)
|
121 |
+
# except (ValueError, SyntaxError):
|
122 |
+
# # Handle malformed answer string
|
123 |
+
# evaluated_answer = f"Error evaluating answer: {most_common_answer}"
|
124 |
|
125 |
+
# return evaluated_answer
|
126 |
|
127 |
# return ast.literal_eval(answers[0])
|
128 |
|