TianheWu
/

VisualQuality-R1-7B-preview

@@ -21,47 +21,265 @@ This is a demo version of VisualQuality-R1 which is trained on the combination o
 Paper link: [arXiv](https://arxiv.org/abs/2505.14460)
-## Quick Start
 ```python
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
-import json
-import numpy as np
 import torch
 import random
 import re
 import os
-def score_image(model_path, image_path):
-    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-        model_path,
-        torch_dtype=torch.bfloat16,
-        attn_implementation="flash_attention_2",
-        device_map=device,
     )
-    processor = AutoProcessor.from_pretrained(MODEL_PATH)
-    processor.tokenizer.padding_side = "left"
     PROMPT = (
         "You are doing the image quality assessment task. Here is the question: "
         "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
         "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
     )
-    x = {
-        "image": [image_path],
-        "question": PROMPT,
-    }
     QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
     message = [
         {
             "role": "user",
             "content": [
-                *({'type': 'image', 'image': img_path} for img_path in x['image']),
-                {"type": "text", "text": QUESTION_TEMPLATE.format(Question=x['question'])}
             ],
         }
     ]
@@ -81,7 +299,7 @@ def score_image(model_path, image_path):
     inputs = inputs.to(device)
     # Inference: Generation of the output
-    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=256, do_sample=True)
     generated_ids_trimmed = [
         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
@@ -92,25 +310,287 @@ def score_image(model_path, image_path):
     reasoning = re.findall(r'<think>(.*?)</think>', batch_output_text[0], re.DOTALL)
     reasoning = reasoning[-1].strip()
-    model_output_matches = re.findall(r'<answer>(.*?)</answer>', batch_output_text[0], re.DOTALL)
-    model_answer = model_output_matches[-1].strip()
-    score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
     return reasoning, score
-random.seed(42)
-device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
-### Modify here
-model_path = ""
 image_path = ""
 reasoning, score = score_image(
-    model_path=model_path,
-    image_path=image_path
 )
 print(reasoning)
 print(score)
-```

 Paper link: [arXiv](https://arxiv.org/abs/2505.14460)
+## ⚡Quick Start
+### Non-Thinking Inference
+When you execute inference with VisualQuality-R1 as a reward/evaluation model, you can only use **non-thinking** mode to reduce inference time, generating only a single output token with the following prompt:
+```
+PROMPT = (
+    "You are doing the image quality assessment task. Here is the question: "
+    "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
+    "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
+)
+QUESTION_TEMPLATE = "{Question} Please only output the final answer with only one score in <answer> </answer> tags."
+```
+For single image quality rating, the code is:
+<details>
+<summary>Example Code (VisualQuality-R1: Image Quality Rating with non-thinking mode)</summary>
 ```python
 from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
 from qwen_vl_utils import process_vision_info
 import torch
 import random
 import re
 import os
+def score_image(image_path, model, processor):
+    PROMPT = (
+        "You are doing the image quality assessment task. Here is the question: "
+        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
+        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality. "
+        "First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
     )
+    QUESTION_TEMPLATE = "{Question} Please only output the final answer with only one score in <answer> </answer> tags."
+    message = [
+        {
+            "role": "user",
+            "content": [
+                {'type': 'image', 'image': image_path},
+                {"type": "text", "text": PROMPT}
+            ],
+        }
+    ]
+    batch_messages = [message]
+    # Preparation for inference
+    text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]
+    image_inputs, video_inputs = process_vision_info(batch_messages)
+    inputs = processor(
+        text=text,
+        images=image_inputs,
+        videos=video_inputs,
+        padding=True,
+        return_tensors="pt",
+    )
+    inputs = inputs.to(device)
+    # Inference: Generation of the output
+    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=2048, do_sample=True, top_k=50, top_p=1)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    batch_output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+    )
+    reasoning = None
+    try:
+        model_output_matches = re.findall(r'<answer>(.*?)</answer>', batch_output_text[0], re.DOTALL)
+        model_answer = model_output_matches[-1].strip() if model_output_matches else batch_output_text[0].strip()
+        score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
+    except:
+        print(f"================= Meet error with {img_path}, please generate again. =================")
+        score = random.randint(1, 5)
+    return reasoning, score
+random.seed(1)
+MODEL_PATH = ""
+device = torch.device("cuda:5") if torch.cuda.is_available() else torch.device("cpu")
+image_path = ""
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    device_map=device,
+)
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+processor.tokenizer.padding_side = "left"
+reasoning, score = score_image(
+    image_path, model, processor
+)
+print(score)
+```
+</details>
+<details>
+<summary>Example Code (VisualQuality-R1: Batch Images Quality Rating with non-thinking mode)</summary>
+```python
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+from tqdm import tqdm
+import torch
+import random
+import re
+import os
+def get_image_paths(folder_path):
+    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}
+    image_paths = []
+    for root, dirs, files in os.walk(folder_path):
+        for file in files:
+            _, ext = os.path.splitext(file)
+            if ext.lower() in image_extensions:
+                image_paths.append(os.path.join(root, file))
+    return image_paths
+def score_batch_image(image_paths, model, processor):
     PROMPT = (
         "You are doing the image quality assessment task. Here is the question: "
         "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
         "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
     )
+    QUESTION_TEMPLATE = "{Question} Please only output the final answer with only one score in <answer> </answer> tags."
+    messages = []
+    for img_path in image_paths:
+        message = [
+            {
+                "role": "user",
+                "content": [
+                    {'type': 'image', 'image': img_path},
+                    {"type": "text", "text": QUESTION_TEMPLATE.format(Question=PROMPT)}
+                ],
+            }
+        ]
+        messages.append(message)
+    BSZ = 32
+    all_outputs = []  # List to store all answers
+    for i in tqdm(range(0, len(messages), BSZ)):
+        batch_messages = messages[i:i + BSZ]
+        # Preparation for inference
+        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]
+        image_inputs, video_inputs = process_vision_info(batch_messages)
+        inputs = processor(
+            text=text,
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(device)
+        # Inference: Generation of the output
+        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=512, do_sample=True, top_k=50, top_p=1)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        batch_output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        all_outputs.extend(batch_output_text)
+    path_score_dict = {}
+    for img_path, model_output in zip(image_paths, all_outputs):
+        try:
+            model_output_matches = re.findall(r'<answer>(.*?)</answer>', model_output, re.DOTALL)
+            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()
+            score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
+        except:
+            print(f"Meet error with {img_path}, please generate again.")
+            score = random.randint(1, 5)
+        path_score_dict[img_path] = score
+    return path_score_dict
+random.seed(1)
+MODEL_PATH = ""
+device = torch.device("cuda:3") if torch.cuda.is_available() else torch.device("cpu")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    device_map=device,
+)
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+processor.tokenizer.padding_side = "left"
+image_root = ""
+image_paths = get_image_paths(image_root) # It should be a list
+path_score_dict = score_batch_image(
+    image_paths, model, processor
+)
+file_name = "output.txt"
+with open(file_name, "w") as file:
+    for key, value in path_score_dict.items():
+        file.write(f"{key} {value}\n")
+print("Done!")
+```
+</details>
+### Thinking mode for inference
+<details>
+<summary>Example Code (VisualQuality-R1: Single Image Quality Rating with thinking)</summary>
+```python
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+import torch
+import random
+import re
+import os
+def score_image(image_path, model, processor):
+    PROMPT = (
+        "You are doing the image quality assessment task. Here is the question: "
+        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
+        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality. "
+        "First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
+    )
     QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
+    # QUESTION_TEMPLATE = "Please describe the quality of this image."
     message = [
         {
             "role": "user",
             "content": [
+                {'type': 'image', 'image': image_path},
+                {"type": "text", "text": PROMPT}
             ],
         }
     ]
     inputs = inputs.to(device)
     # Inference: Generation of the output
+    generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=2048, do_sample=True, top_k=50, top_p=1)
     generated_ids_trimmed = [
         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
     ]
     reasoning = re.findall(r'<think>(.*?)</think>', batch_output_text[0], re.DOTALL)
     reasoning = reasoning[-1].strip()
+    try:
+        model_output_matches = re.findall(r'<answer>(.*?)</answer>', batch_output_text[0], re.DOTALL)
+        model_answer = model_output_matches[-1].strip() if model_output_matches else batch_output_text[0].strip()
+        score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
+    except:
+        print(f"================= Meet error with {img_path}, please generate again. =================")
+        score = random.randint(1, 5)
     return reasoning, score
+random.seed(1)
+MODEL_PATH = ""
+device = torch.device("cuda:5") if torch.cuda.is_available() else torch.device("cpu")
 image_path = ""
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    device_map=device,
+)
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+processor.tokenizer.padding_side = "left"
 reasoning, score = score_image(
+    image_path, model, processor
 )
 print(reasoning)
 print(score)
+```
+</details>
+<details>
+<summary>Example Code (VisualQuality-R1: Batch Images Quality Rating with thinking)</summary>
+```python
+from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
+from tqdm import tqdm
+import torch
+import random
+import re
+import os
+def get_image_paths(folder_path):
+    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}
+    image_paths = []
+    for root, dirs, files in os.walk(folder_path):
+        for file in files:
+            _, ext = os.path.splitext(file)
+            if ext.lower() in image_extensions:
+                image_paths.append(os.path.join(root, file))
+    return image_paths
+def score_batch_image(image_paths, model, processor):
+    PROMPT = (
+        "You are doing the image quality assessment task. Here is the question: "
+        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
+        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
+    )
+    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
+    messages = []
+    for img_path in image_paths:
+        message = [
+            {
+                "role": "user",
+                "content": [
+                    {'type': 'image', 'image': img_path},
+                    {"type": "text", "text": QUESTION_TEMPLATE.format(Question=PROMPT)}
+                ],
+            }
+        ]
+        messages.append(message)
+    BSZ = 32
+    all_outputs = []  # List to store all answers
+    for i in tqdm(range(0, len(messages), BSZ)):
+        batch_messages = messages[i:i + BSZ]
+        # Preparation for inference
+        text = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in batch_messages]
+        image_inputs, video_inputs = process_vision_info(batch_messages)
+        inputs = processor(
+            text=text,
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(device)
+        # Inference: Generation of the output
+        generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=512, do_sample=True, top_k=50, top_p=1)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        batch_output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        all_outputs.extend(batch_output_text)
+    path_score_dict = {}
+    for img_path, model_output in zip(image_paths, all_outputs):
+        reasoning = re.findall(r'<think>(.*?)</think>', model_output, re.DOTALL)
+        reasoning = reasoning[-1].strip()
+        try:
+            model_output_matches = re.findall(r'<answer>(.*?)</answer>', model_output, re.DOTALL)
+            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()
+            score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
+        except:
+            print(f"Meet error with {img_path}, please generate again.")
+            score = random.randint(1, 5)
+        path_score_dict[img_path] = score
+    return path_score_dict
+random.seed(1)
+MODEL_PATH = ""
+device = torch.device("cuda:3") if torch.cuda.is_available() else torch.device("cpu")
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    torch_dtype=torch.bfloat16,
+    attn_implementation="flash_attention_2",
+    device_map=device,
+)
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+processor.tokenizer.padding_side = "left"
+image_root = ""
+image_paths = get_image_paths(image_root) # It should be a list
+path_score_dict = score_batch_image(
+    image_paths, model, processor
+)
+file_name = "output.txt"
+with open(file_name, "w") as file:
+    for key, value in path_score_dict.items():
+        file.write(f"{key} {value}\n")
+print("Done!")
+```
+</details>
+## 🚀 Updated: VisualQuality-R1 high efficiency inference script with vLLM
+<details>
+<summary>Example Code (VisualQuality-R1: Batch Images Quality Rating with thinking, using vLLM)</summary>
+```python
+# Please install vLLM first: https://docs.vllm.ai/en/stable/getting_started/installation/gpu.html
+from transformers import Qwen2_5_VLProcessor, AutoProcessor
+from vllm import LLM, RequestOutput, SamplingParams
+from qwen_vl_utils import process_vision_info
+import torch
+import random
+import re
+import os
+IMAGE_PATH = "./images"
+MODEL_PATH = "TianheWu/VisualQuality-R1-7B"
+def get_image_paths(folder_path):
+    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif', '.tiff', '.webp'}
+    image_paths = []
+    for root, dirs, files in os.walk(folder_path):
+        for file in files:
+            _, ext = os.path.splitext(file)
+            if ext.lower() in image_extensions:
+                image_paths.append(os.path.join(root, file))
+    return image_paths
+def score_batch_image(image_paths, model: LLM, processor: Qwen2_5_VLProcessor):
+    PROMPT = (
+        "You are doing the image quality assessment task. Here is the question: "
+        "What is your overall rating on the quality of this picture? The rating should be a float between 1 and 5, "
+        "rounded to two decimal places, with 1 representing very poor quality and 5 representing excellent quality."
+    )
+    QUESTION_TEMPLATE = "{Question} First output the thinking process in <think> </think> tags and then output the final answer with only one score in <answer> </answer> tags."
+    messages = []
+    for img_path in image_paths:
+        message = [
+            {
+                "role": "user",
+                "content": [
+                    {'type': 'image', 'image': img_path},
+                    {"type": "text", "text": QUESTION_TEMPLATE.format(Question=PROMPT)}
+                ],
+            }
+        ]
+        messages.append(message)
+    all_outputs = []  # List to store all answers
+    # Preparation for inference
+    print("preprocessing ...")
+    texts = [processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True, add_vision_id=True) for msg in messages]
+    image_inputs, video_inputs = process_vision_info(messages)
+    inputs = [{
+        "prompt": texts[i],
+        "multi_modal_data": {
+            "image": image_inputs[i]
+        },
+    } for i in range(len(messages))]
+    output: list[RequestOutput] = model.generate(
+        inputs,
+        sampling_params=SamplingParams(
+            max_tokens=512,
+            temperature=0.1,
+            top_k=50,
+            top_p=1.0,
+            stop_token_ids=[processor.tokenizer.eos_token_id],
+        ),
+    )
+    batch_output_text = [o.outputs[0].text for o in output]
+    all_outputs.extend(batch_output_text)
+    path_score_dict = {}
+    for img_path, model_output in zip(image_paths, all_outputs):
+        print(f"{model_output = }")
+        try:
+            model_output_matches = re.findall(r'<answer>(.*?)</answer>', model_output, re.DOTALL)
+            model_answer = model_output_matches[-1].strip() if model_output_matches else model_output.strip()
+            score = float(re.search(r'\d+(\.\d+)?', model_answer).group())
+        except:
+            print(f"Meet error with {img_path}, please generate again.")
+            score = random.randint(1, 5)
+        path_score_dict[img_path] = score
+    return path_score_dict
+random.seed(1)
+model = LLM(
+    model=MODEL_PATH,
+    tensor_parallel_size=1,
+    trust_remote_code=True,
+    seed=1,
+)
+processor = AutoProcessor.from_pretrained(MODEL_PATH)
+processor.tokenizer.padding_side = "left"
+image_paths = get_image_paths(IMAGE_PATH) # It should be a list
+path_score_dict = score_batch_image(
+    image_paths, model, processor
+)
+file_name = "output.txt"
+with open(file_name, "w") as file:
+    for key, value in path_score_dict.items():
+        file.write(f"{key} {value}\n")
+print("Done!")
+```
+</details>