Use it like this:
import pathlib

import torch

CACHE_ROOT = pathlib.Path("qwen-image-int8-quanto")  # where the INT8 modules are stored
TRANSFORMER_DIR = CACHE_ROOT / "qwen_image_transformer_int8"
TEXT_ENCODER_DIR = CACHE_ROOT / "qwen_text_encoder_int8"

# BASE_MODEL_ID should hold the repo id of the base (non-quantized) Qwen-Image checkpoint.

def load_quantized_modules(transformer_dir: pathlib.Path, text_encoder_dir: pathlib.Path):
    """
    Load the quantized modules (they were saved with the exact filenames the loaders expect).
    """
    tr = torch.load(str(transformer_dir / "pytorch_model.bin"), weights_only=False)
    te = torch.load(str(text_encoder_dir / "pytorch_model.bin"), weights_only=False)
    return tr, te

def build_pipe(cls, transformer_dir: pathlib.Path, text_encoder_dir: pathlib.Path):
    """
    Build a pipeline of class `cls` by loading the quantized modules from disk.
    Creating fresh module instances each time avoids hangs caused by reused
    offload hooks and state.
    """
    transformer, text_encoder = load_quantized_modules(transformer_dir, text_encoder_dir)
    pipe = cls.from_pretrained(
        BASE_MODEL_ID,
        transformer=transformer,
        text_encoder=text_encoder,
        torch_dtype=torch.bfloat16,
        use_safetensors=True,
        low_cpu_mem_usage=True,
    )
    pipe.enable_model_cpu_offload()
    pipe.set_progress_bar_config(disable=False)
    return pipe
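For example, assuming `BASE_MODEL_ID` points at the base Qwen-Image repo and that your diffusers version exposes `QwenImagePipeline` (both are assumptions here, not part of the code above), generation looks roughly like this:

# Sketch only: QwenImagePipeline, the BASE_MODEL_ID value, and the prompt are illustrative.
from diffusers import QwenImagePipeline

BASE_MODEL_ID = "Qwen/Qwen-Image"  # assumed repo id of the base checkpoint

pipe = build_pipe(QwenImagePipeline, TRANSFORMER_DIR, TEXT_ENCODER_DIR)
image = pipe(
    prompt="A cat sitting on a windowsill at sunset",
    num_inference_steps=30,
).images[0]
image.save("output.png")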
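The loader assumes each module was quantized with optimum-quanto and then serialized whole with `torch.save` under `pytorch_model.bin`. A minimal sketch of producing that cache under this assumption (the helper name is illustrative; `pathlib` and `torch` are imported as above):

# Assumed saving side (not shown in this card): quantize with optimum-quanto,
# freeze the weights, then torch.save the whole module under the filename
# that load_quantized_modules() expects.
from optimum.quanto import freeze, qint8, quantize

def save_quantized_module(module, out_dir: pathlib.Path):
    quantize(module, weights=qint8)  # swap Linear weights for INT8 quantized tensors
    freeze(module)                   # materialize the quantized weights
    out_dir.mkdir(parents=True, exist_ok=True)
    torch.save(module, out_dir / "pytorch_model.bin")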