from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

model_path = "/home/zbz5349/WorkSpace/aigeeks/Qwen2.5-VL/ckpt"
# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path, torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processor
processor = AutoProcessor.from_pretrained(model_path)

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
{'role': 'user', 'content': [{'type': 'video', 'video': '/home/zbz5349/WorkSpace/aigeeks/Qwen2.5-VL/dataset/data/new_Psychology_5.mp4', 'max_pixels': 151200, 'fps': 1.0}, {'type': 'image', 'image': '/home/zbz5349/WorkSpace/aigeeks/Qwen2.5-VL/dataset/images/new_Psychology_5.png', 'text': '<video> <image>Evaluate the following statements based on the video about different psychology research methods. Identify which statements are correct.\nStatements:\n1. A natural quasi-experiment involves the manipulation of independent variables by the researcher in a controlled environment to determine their effect on dependent variables.\n2. Naturalistic observation requires researchers to observe subjects in their natural environments without any interference or manipulation, thereby ensuring high ecological validity.\n3. Laboratory experiments allow researchers to control and manipulate variables precisely, which increases the internal validity of the study but may reduce its ecological validity.\n4. In a natural quasi-experiment, the independent variable is manipulated by the researcher to observe its effects on the dependent variable in a real-world setting.\n5. Correlational studies can establish causal relationships between variables by measuring the strength and direction of their association. this is a multiple-choice, options contain A : [Only Statements 2 and 3 are correct] B : [Statements 1, 2, and 3 are correct] C : [Statements 2, 3, and 4 are correct] D : [Statements 1, 3, and 5 are correct] E : [Only Statement 4 is correct] F : [Statements 2 and 4 are correct] G : [Statements 1, 2, 3, and 4 are correct] H : [Statements 3 and 5 are correct] I : [All statements are correct] J : [Statements 2, 3, 4, and 5 are correct] '}, {'type': 'text', 'text': 'Describe this video.'}]}
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    fps=fps,
    padding=True,
    return_tensors="pt",
    **video_kwargs,
)
inputs = inputs.to("cuda")

# Inference
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)