Quantized with GPTQModel 4.0.0 (dev) using the following code:
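(Note: 4.0.0 was unreleased at the time, so a development build is needed; assuming the upstream repository, something like `pip install -v --no-build-isolation git+https://github.com/ModelCloud/GPTQModel.git` should reproduce it.)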

```python
import base64
from io import BytesIO
from random import seed, shuffle

from datasets import concatenate_datasets, load_dataset
from gptqmodel import GPTQModel, QuantizeConfig
from transformers import AutoTokenizer

seed(0)

MODEL_ID = "XiaomiMiMo/MiMo-VL-7B-RL-2508"
SAVE_DIR = "MiMo-VL-7B-RL-2508-gptq-q4"

NUM_TEXT_SAMPLES = 128
NUM_IMAGE_SAMPLES = 128
MAX_TOKENS = 1024


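# Serialize a PIL image into a base64-encoded PNG data URI so it can be
# embedded directly in a chat-template message.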
def encode_pil_to_data_uri(pil_image) -> str:
    buff = BytesIO()
    pil_image.save(buff, format="PNG")
    encoded = base64.b64encode(buff.getvalue()).decode("utf-8")
    return f"data:image;base64,{encoded}"


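# Wrap raw text samples in single-turn, user-only conversations,
# truncating each sample to max_tokens tokens.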
def make_text_conversations(texts, tok, max_tokens=1024):
    convs = []
    for t in texts:
        if not isinstance(t, str):
            continue
        tt = t.strip()
        if not tt:
            continue
        ids = tok.encode(tt, add_special_tokens=False)[:max_tokens]
        if not ids:
            continue
        trunc = tok.decode(ids, skip_special_tokens=True)
        convs.append(
            [
                {
                    "role": "user",
                    "content": [{"type": "text", "text": trunc}],
                }
            ]
        )
    return convs


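# Pair each image with a generic question to build single-turn
# image+text conversations.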
def make_image_conversations(hf_dataset, num_samples=64):
    convs = []
    for ex in hf_dataset.select(range(min(num_samples, len(hf_dataset)))):
        data_uri = encode_pil_to_data_uri(ex["image"])
        convs.append(
            [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": data_uri},
                        {"type": "text", "text": "What does the image show?"},
                    ],
                }
            ]
        )
    return convs


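# Text calibration corpus: one English shard and one Spanish shard of C4.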
en_ds = load_dataset(
    "allenai/c4", data_files="en/c4-train.00001-of-01024.json.gz", split="train"
).shuffle(seed=0)
es_ds = load_dataset(
    "allenai/c4", data_files="multilingual/c4-es.tfrecord-00001-of-02048.json.gz", split="train"
).shuffle(seed=0)

texts = [x["text"] for x in concatenate_datasets([en_ds, es_ds])]
texts = [t for t in texts if isinstance(t, str) and t.strip()]
shuffle(texts)
texts = texts[:NUM_TEXT_SAMPLES]

tok = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)
text_conversations = make_text_conversations(texts, tok, max_tokens=MAX_TOKENS)

img_ds = load_dataset("lmms-lab/flickr30k", split="test[:512]").shuffle(seed=42)
image_conversations = make_image_conversations(img_ds, num_samples=NUM_IMAGE_SAMPLES)

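# Mix text-only and image+text conversations so calibration covers both
# modalities, then shuffle the combined set.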
calibration_conversations = text_conversations + image_conversations
shuffle(calibration_conversations)

print(
    f"Prepared {len(text_conversations)} text-only and "
    f"{len(image_conversations)} image+text conversations "
    f"(total {len(calibration_conversations)})."
)

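# 4-bit GPTQ with per-group quantization parameters over groups of 128 weights.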
qconf = QuantizeConfig(
    bits=4,
    group_size=128,
    device="cuda:0",
    v2=False,  # GPTQ v2 gave noticeably worse results for this model
)

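# Load the unquantized model with the quantization config attached.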
model = GPTQModel.load(MODEL_ID, qconf)

model.quantize(
    calibration_conversations,
    batch_size=1,
)

model.save(SAVE_DIR)
print(f"Saved quantized model to: {SAVE_DIR}")