Quantized with GPTQModel 4.0.0 (dev) using the following code:
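(Note: 4.0.0 was unreleased at the time, so a development build is needed; assuming the upstream repository, something like `pip install -v --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git` should reproduce it.)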

```python
import base64
from io import BytesIO
from random import seed, shuffle

from datasets import concatenate_datasets, load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer

seed(0)

MODEL_ID = "XiaomiMiMo/MiMo-VL-7B-RL-2508"
SAVE_DIR = "MiMo-VL-7B-RL-2508-gptq-q4"

NUM_TEXT_SAMPLES = 128
NUM_IMAGE_SAMPLES = 128
MAX_TOKENS = 1024


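# Serialize a PIL image into a base64-encoded PNG data URI so it can be
# embedded directly in a chat-template message.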
def encode_pil_to_data_uri(pil_image) -> str:
    buff = BytesIO()
    pil_image.save(buff, format="PNG")
    encoded = base64.b64encode(buff.getvalue()).decode("utf-8")
    return f"data:image;base64,{encoded}"


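# Wrap raw text samples in single-turn, user-only conversations,
# truncating each sample to max_tokens tokens.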
def make_text_conversations(texts, tok, max_tokens=1024):
    convs = []
    for t in texts:
        if not isinstance(t, str):
            continue
        tt = t.strip()
        if not tt:
            continue
        ids = tok.encode(tt, add_special_tokens=False)[:max_tokens]
        if not ids:
            continue
        trunc = tok.decode(ids, skip_special_tokens=True)
        convs.append(
            [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": trunc}],
                }
            ]
        )
    return convs


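# Pair each image with a generic question to build single-turn
# image+text conversations.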
def make_image_conversations(hf_dataset, num_samples=64):
    convs = []
    for ex in hf_dataset.select(range(min(num_samples, len(hf_dataset)))):
        data_uri = encode_pil_to_data_uri(ex["image"])
        convs.append(
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": data_uri},
                        {"type": "text", "text": "What does the image show?"},
                    ],
                }
            ]
        )
    return convs


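# Text calibration corpus: one English shard and one Spanish shard of C4.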
en_ds = load_dataset(
    "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split="train"
).shuffle(seed=0)
es_ds = load_dataset(
    "allenai/c4", data_files="multilingual/c4-es.tfrecord-00001-of-02048.json.gz", split="train"
).shuffle(seed=0)

texts = [x["text"] for x in concatenate_datasets([en_ds, es_ds])]
texts = [t for t in texts if isinstance(t, str) and t.strip()]
shuffle(texts)
texts = texts[:NUM_TEXT_SAMPLES]

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
text_conversations = make_text_conversations(texts, tok, max_tokens=MAX_TOKENS)

img_ds = load_dataset("lmms-lab/flickr30k", split="test[:512]").shuffle(seed=42)
image_conversations = make_image_conversations(img_ds, num_samples=NUM_IMAGE_SAMPLES)

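# Mix text-only and image+text conversations so calibration covers both
# modalities, then shuffle the combined set.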
calibration_conversations = text_conversations + image_conversations
shuffle(calibration_conversations)

print(
    f"Prepared {len(text_conversations)} text-only and "
    f"{len(image_conversations)} image+text conversations "
    f"(total {len(calibration_conversations)})."
)

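# 4-bit GPTQ with per-group quantization parameters over groups of 128 weights.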
qconf = QuantizeConfig(
    bits=4,
    group_size=128,
    device="cuda:0",
    v2=False,  # GPTQ v2 gave noticeably worse results for this model
)

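# Load the unquantized model with the quantization config attached.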
model = GPTQModel.load(MODEL_ID, qconf)

model.quantize(
    calibration_conversations,
    batch_size=1,
)

model.save(SAVE_DIR)
print(f"Saved quantized model to: {SAVE_DIR}")