"""Caption videos with a LLaVA-style multimodal model.

Each input video is read as a folder of pre-extracted frames, uniformly
sampled to a fixed number of segments, and described by the model.
"""

import argparse
import codecs
import copy
import json
import logging
import os
import random

import decord
import numpy as np
import requests
import torch
from decord import VideoReader, cpu
from PIL import Image
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, CLIPImageProcessor

from llava.constants import MM_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, DEFAULT_VIDEO_START_TOKEN, DEFAULT_VIDEO_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria, process_images_v2
from llava.model import *
from llava.model.builder import load_pretrained_model
from llava.model.multimodal_encoder.processor import Blip2ImageTrainProcessor
from llava.utils import disable_torch_init

# Have decord return torch tensors rather than numpy arrays.
decord.bridge.set_bridge("torch")


def get_image(image_path):
    """Load a single frame from disk as an RGB PIL image."""
    image = Image.open(image_path).convert('RGB')
    return image


def load_frames(frames_dir):
    """Load every frame in ``frames_dir`` (skipping ``cuttime*`` files), ordered by the integer filename stem."""
    results = []
    image_files = [(int(os.path.splitext(img)[0]), img) for img in os.listdir(frames_dir) if not img.startswith('cuttime')]
    image_files = sorted(image_files, key=lambda img: img[0])
    for frame_name in image_files:
        image_path = os.path.join(frames_dir, frame_name[1])
        image = get_image(image_path)
        results.append(image)
    return results


def uniform_sample(frames, num_segments):
    """Uniformly sample (or repeat) frames so that exactly ``num_segments`` frames are returned."""
    indices = np.linspace(start=0, stop=len(frames) - 1, num=num_segments).astype(int)
    frames = [frames[ind] for ind in indices]
    return frames


def run_inference(args, frame_folders):
    """Generate a detailed description for each folder of video frames."""
    disable_torch_init()

    # Load the tokenizer/model and build the BLIP-2 style image processor used at inference time.
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, _, context_len = load_pretrained_model(model_path, args.model_base, model_name, device_map={"": 0})
    image_processor = Blip2ImageTrainProcessor(
        image_size=model.config.img_size,
        is_training=False)
    model_cfgs = model.config

    for frame_folder in frame_folders:
        question = "Describe the video in detail."

        # Prepend the video placeholder token(s) to the question.
        if model.config.mm_use_start_end:
            qs = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_TOKEN + DEFAULT_VIDEO_END_TOKEN + '\n' + question
        else:
            qs = DEFAULT_VIDEO_TOKEN + '\n' + question

        # Wrap the question in the conversation template and tokenize, mapping
        # the multimodal placeholder to MM_TOKEN_INDEX.
        conv = conv_templates[args.conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()
        input_ids = tokenizer_image_token(prompt, tokenizer, MM_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        # Load the frames and resample them to exactly args.num_segments.
        images = load_frames(frame_folder)
        if len(images) != args.num_segments:
            images = uniform_sample(images, args.num_segments)

        # Preprocess frames into a single tensor; disable padding-based aspect-ratio handling.
        if model_cfgs.image_aspect_ratio == 'pad':
            model_cfgs.image_aspect_ratio = 'no_padding'
        images_tensor = process_images_v2(images, image_processor, model_cfgs).half().cuda()

        # Stop generation at the conversation separator; replicate the video tensor once per beam.
        stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
        keywords = [stop_str]
        stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
        images_tensors = [images_tensor.clone() for _ in range(args.num_beams)]

        with torch.inference_mode():
            output_ids = model.generate(
                input_ids,
                images=images_tensors,
                do_sample=True,
                temperature=args.temperature,
                top_p=args.top_p,
                num_beams=args.num_beams,
                no_repeat_ngram_size=args.no_repeat_ngram_size,
                pad_token_id=tokenizer.eos_token_id,
                max_new_tokens=1024,
                use_cache=True,
                stopping_criteria=[stopping_criteria])

        # Decode, strip the trailing stop string, and print the caption.
        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
        if outputs.endswith(stop_str):
            outputs = outputs[:-len(stop_str)]
        outputs = outputs.strip()
        print(outputs)
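

# The parser below defines --output_file, but the loop in run_inference only
# prints each caption. A minimal helper for persisting results, assuming you
# collect them into a {frame_folder: caption} dict (this helper and that
# convention are my own addition, not part of the original script):
def save_results(results, output_file):
    # Write the folder -> caption mapping as JSON.
    with open(output_file, 'w') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)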


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--video_dir', help='Directory containing video files.', type=str, default="")
    parser.add_argument('--validation_data', type=str,
                        default="/mnt/bn/yukunfeng-nasdrive/xiangchen/repo/benchmark_data/refine_chair_eval_gt_neg_1k.json")
    parser.add_argument('--num_samples', help='Number of samples to predict', type=int, default=-1)
    parser.add_argument("--model_path", type=str,
                        default="/mnt/bn/algo-masp-nas-2/xiangchen/model/masp_models/checkpoints/llava-mistral_gpt4v_adso185k_unfreeze_qformer_data_sampler/")
    parser.add_argument("--model_base", type=str, default=None)
    parser.add_argument("--conv_mode", type=str, default="v1")
    parser.add_argument("--output_file", type=str, default="vid_top1k_res.json")
    parser.add_argument("--num_segments", type=int, default=10)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--no_repeat_ngram_size", type=int, default=3)

    args = parser.parse_args()
    frame_folders = ['/mnt/bn/algo-masp-nas-2/xiangchen/repo/LLaVA/tmp/cases/yj']
    run_inference(args, frame_folders)
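# Example invocation (script filename and checkpoint path are placeholders;
# point --model_path at your own checkpoint and edit frame_folders above):
#   python run_frame_caption.py --model_path /path/to/checkpoint \
#       --conv_mode v1 --num_segments 10 --temperature 0.2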