|
import os |
|
import shutil |
|
import gradio as gr |
|
from huggingface_hub import HfApi, whoami, ModelCard, model_info |
|
from gradio_huggingfacehub_search import HuggingfaceHubSearch |
|
from textwrap import dedent |
|
from pathlib import Path |
|
|
|
from tempfile import TemporaryDirectory |
|
|
|
from huggingface_hub.file_download import repo_folder_name |
|
from optimum.exporters import TasksManager |
|
|
|
from optimum.intel.utils.modeling_utils import _find_files_matching_pattern |
|
from optimum.intel import ( |
|
OVModelForAudioClassification, |
|
OVModelForCausalLM, |
|
OVModelForFeatureExtraction, |
|
OVModelForImageClassification, |
|
OVModelForMaskedLM, |
|
OVModelForQuestionAnswering, |
|
OVModelForSeq2SeqLM, |
|
OVModelForSequenceClassification, |
|
OVModelForTokenClassification, |
|
OVStableDiffusionPipeline, |
|
OVStableDiffusionXLPipeline, |
|
OVLatentConsistencyModelPipeline, |
|
OVWeightQuantizationConfig, |
|
) |
|
from diffusers import ConfigMixin |
|
|
|
# Maps each supported task name to the *name* of the optimum-intel class that
# loads it. Values stay strings because the name is also interpolated into the
# generated model card. Insertion order is kept as-is.
_HEAD_TO_AUTOMODELS = dict(
    (
        ("feature-extraction", "OVModelForFeatureExtraction"),
        ("fill-mask", "OVModelForMaskedLM"),
        ("text-generation", "OVModelForCausalLM"),
        ("text-classification", "OVModelForSequenceClassification"),
        ("token-classification", "OVModelForTokenClassification"),
        ("question-answering", "OVModelForQuestionAnswering"),
        ("image-classification", "OVModelForImageClassification"),
        ("audio-classification", "OVModelForAudioClassification"),
        ("stable-diffusion", "OVStableDiffusionPipeline"),
        ("stable-diffusion-xl", "OVStableDiffusionXLPipeline"),
        ("latent-consistency", "OVLatentConsistencyModelPipeline"),
    )
)
|
|
|
# Values of ``dtype`` that select 8-bit weights. "8-bit" is what the UI
# dropdown sends; "int8" is kept for backward compatibility.
_INT8_DTYPES = ("int8", "8-bit")

# Sub-folders of the export directory whose files are pushed to the Hub
# ("" is the export root itself).
_UPLOADED_SUBFOLDERS = (
    "",
    "vae_encoder",
    "vae_decoder",
    "text_encoder",
    "text_encoder_2",
    "unet",
    "tokenizer",
    "tokenizer_2",
    "scheduler",
    "feature_extractor",
)


def _infer_task(model_id, token):
    """Return the task name used to pick the OpenVINO class for ``model_id``."""
    library_name = TasksManager.infer_library_from_model(model_id, token=token)
    if library_name != "diffusers":
        return TasksManager.infer_task_from_model(model_id, token=token)

    # Diffusers pipelines are told apart by the class name in model_index.json.
    ConfigMixin.config_name = "model_index.json"
    class_name = ConfigMixin.load_config(model_id, token=token)["_class_name"].lower()
    if "xl" in class_name:
        return "stable-diffusion-xl"
    if "consistency" in class_name:
        return "latent-consistency"
    return "stable-diffusion"


def _build_quantization_config(dtype, calibration_dataset, ratio):
    """Build the weight-only quantization config for the requested precision."""
    # The UI uses the literal string "None" for "no calibration dataset".
    if calibration_dataset == "None":
        calibration_dataset = None

    is_int8 = dtype in _INT8_DTYPES
    # AWQ is only selected for 4-bit weights with a calibration dataset.
    use_awq = not is_int8 and calibration_dataset is not None
    if calibration_dataset is not None and not use_awq:
        print("Default quantization was selected, calibration dataset won't be used")

    return OVWeightQuantizationConfig(
        bits=8 if is_int8 else 4,
        quant_method="awq" if use_awq else "default",
        dataset=calibration_dataset if use_awq else None,
        ratio=1.0 if is_int8 else ratio,
        num_samples=50 if use_awq else None,
    )


def _upload_exported_files(api, folder, repo_id):
    """Upload the exported files to ``repo_id``; return an error message or None."""
    for dir_name in _UPLOADED_SUBFOLDERS:
        source_dir = folder / dir_name
        if not source_dir.is_dir():
            continue
        for file_path in source_dir.iterdir():
            if not file_path.is_file():
                continue
            # Hub paths always use "/", whatever the local OS separator is.
            path_in_repo = f"{dir_name}/{file_path.name}" if dir_name else file_path.name
            try:
                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=path_in_repo,
                    repo_id=repo_id,
                )
            except Exception as e:
                return f"Error uploading file {file_path}: {e}"
    return None


def _build_model_card(model_id, new_repo_id, auto_model_class, token):
    """Create the model card of the quantized repository from the source card."""
    try:
        card = ModelCard.load(model_id, token=token)
    except Exception:
        # Best effort: if the source card cannot be loaded, start from an empty one.
        card = ModelCard("")

    if card.data.tags is None:
        card.data.tags = []
    if "openvino" not in card.data.tags:
        card.data.tags.append("openvino")
    card.data.base_model = model_id
    card.text = dedent(
        f"""
        This model is a quantized version of [`{model_id}`](https://huggingface.co/{model_id}) and is converted to the OpenVINO format. This model was obtained via the [nncf-quantization](https://huggingface.co/spaces/echarlaix/nncf-quantization) space with [optimum-intel](https://github.com/huggingface/optimum-intel).

        First make sure you have `optimum-intel` installed:

        ```bash
        pip install optimum[openvino]
        ```

        To load your model you can do as follows:

        ```python
        from optimum.intel import {auto_model_class}

        model_id = "{new_repo_id}"
        model = {auto_model_class}.from_pretrained(model_id)
        ```
        """
    )
    return card


def quantize_model(
    model_id: str,
    dtype: str,
    calibration_dataset: str,
    ratio: float,
    private_repo: bool,
    overwritte: bool,
    oauth_token: gr.OAuthToken,
):
    """Quantize a Hub model with NNCF weight-only quantization and push the result.

    The model is exported to OpenVINO when the source repository has no
    OpenVINO model file, quantized, and uploaded to
    ``<username>/<model_name>-<suffix>`` together with a generated model card.

    Args:
        model_id: Hub id of the model to quantize.
        dtype: Weight precision. ``"8-bit"`` or ``"int8"`` selects 8-bit
            weights; any other value selects 4-bit weights.
        calibration_dataset: Dataset name, or the string ``"None"``. It is
            only used for 4-bit weights, where it switches the method to AWQ.
        ratio: Ratio passed to the quantization config for 4-bit weights;
            forced to ``1.0`` for 8-bit weights.
        private_repo: Whether the created repository is private.
        overwritte: Allow pushing to a repository that already exists.
        oauth_token: Token of the logged-in user. It is not among the
            interface inputs, so presumably gradio injects it from the
            annotation; confirm against the gradio OAuth documentation.

    Returns:
        A Markdown string: a success message with the repository URL, or a
        description of what went wrong. Exceptions are reported through the
        return value instead of being raised.
    """
    if oauth_token is None or oauth_token.token is None:
        return "You must be logged in to use this space"

    if not model_id:
        return f"### Invalid input 🐞 Please specify a model name, got {model_id}"

    token = oauth_token.token
    try:
        model_name = model_id.split("/")[-1]
        username = whoami(token)["name"]
        suffix = f"{dtype}" if model_name.endswith("openvino") else f"openvino-{dtype}"
        new_repo_id = f"{username}/{model_name}-{suffix}"

        task = _infer_task(model_id, token)
        if task == "text2text-generation":
            return "Export of Seq2Seq models is currently disabled."
        if task not in _HEAD_TO_AUTOMODELS:
            supported = ", ".join(_HEAD_TO_AUTOMODELS)
            return f"The task '{task}' is not supported, only {supported} tasks are supported"

        auto_model_class = _HEAD_TO_AUTOMODELS[task]
        ov_files = _find_files_matching_pattern(
            model_id,
            pattern=r"(.*)?openvino(.*)?\_model.xml",
            use_auth_token=token,
        )
        # Export only when the source repository holds no OpenVINO model yet.
        export = not ov_files

        quantization_config = _build_quantization_config(dtype, calibration_dataset, ratio)

        api = HfApi(token=token)
        if api.repo_exists(new_repo_id) and not overwritte:
            return f"Model {new_repo_id} already exist, please set overwritte=True to push on an existing repository"

        # TemporaryDirectory removes the whole tree on exit, including on error.
        with TemporaryDirectory() as d:
            folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
            os.makedirs(folder)

            api.snapshot_download(repo_id=model_id, local_dir=folder, allow_patterns=["*.json"])
            # Resolve the class by name from this module's imports rather than eval().
            model_class = globals()[auto_model_class]
            ov_model = model_class.from_pretrained(
                model_id,
                export=export,
                cache_dir=folder,
                token=token,
                quantization_config=quantization_config,
            )
            ov_model.save_pretrained(folder)

            new_repo_url = api.create_repo(repo_id=new_repo_id, exist_ok=True, private=private_repo)
            new_repo_id = new_repo_url.repo_id
            print("Repository created successfully!", new_repo_url)

            upload_error = _upload_exported_files(api, Path(folder), new_repo_id)
            if upload_error is not None:
                return upload_error

            card = _build_model_card(model_id, new_repo_id, auto_model_class, token)
            card_path = os.path.join(folder, "README.md")
            card.save(card_path)
            api.upload_file(
                path_or_fileobj=card_path,
                path_in_repo="README.md",
                repo_id=new_repo_id,
            )
            return f"This model was successfully quantized, find it under your repository {new_repo_url}"
    except Exception as e:
        return f"### Error: {e}"
|
|
|
# Markdown blurb passed to the interface as its description. The empty first
# and last items reproduce the leading and trailing newlines of the text.
DESCRIPTION = "\n".join(
    (
        "",
        "This Space uses [Optimum Intel](https://github.com/huggingface/optimum-intel) to automatically apply NNCF [Weight Only Quantization](https://huggingface.co/docs/optimum/main/en/intel/openvino/optimization) (WOQ) on your model and convert it to the [OpenVINO format](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) if not already.",
        "",
        "After conversion, a repository will be pushed under your namespace with the resulting model.",
        "",
        "The list of the supported architectures can be found in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/models)",
        "",
    )
)
|
|
|
# Input components, created in the order `quantize_model` takes its arguments.
# `oauth_token` is not listed among the interface inputs.

model_id = HuggingfaceHubSearch(
    search_type="model",
    label="Hub Model ID",
    placeholder="Search for model id on the hub",
)

dtype = gr.Dropdown(
    choices=["8-bit", "4-bit"],
    label="Weights precision",
    value="8-bit",
    visible=True,
    filterable=False,
)

# Disabled quantization-method selector. It was kept in the file as an inert
# string literal and is preserved here as a comment:
#
# quant_method = gr.Dropdown(
#     ["default", "awq", "hybrid"],
#     value="default",
#     label="Quantization method",
#     filterable=False,
#     visible=True,
# )

calibration_dataset = gr.Dropdown(
    choices=[
        "None",
        "wikitext2",
        "c4",
        "c4-new",
        "conceptual_captions",
        "laion/220k-GPT4Vision-captions-from-LIVIS",
        "laion/filtered-wit",
    ],
    label="Calibration dataset",
    value="None",
    visible=True,
    filterable=False,
)

ratio = gr.Slider(
    minimum=0.0,
    maximum=1.0,
    step=0.1,
    value=1.0,
    label="Ratio",
    info="Parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization",
)

private_repo = gr.Checkbox(
    label="Private repository",
    info="Create a private repository instead of a public one",
    value=False,
)

overwritte = gr.Checkbox(
    label="Overwrite repository content",
    info="Enable pushing files on existing repositories, potentially overwriting existing files",
    value=False,
)

# Form wiring: the six inputs above feed `quantize_model`, whose returned
# string is shown in a single Markdown output.
interface = gr.Interface(
    title="Quantize your model with NNCF",
    description=DESCRIPTION,
    fn=quantize_model,
    inputs=[
        model_id,
        dtype,
        calibration_dataset,
        ratio,
        private_repo,
        overwritte,
    ],
    outputs=[gr.Markdown(label="output")],
    api_name=False,
)
|
|
|
# Page layout: a login reminder and login button above the quantization form.
demo = gr.Blocks()
with demo:
    gr.Markdown("You must be logged in to use this space")
    gr.LoginButton(min_width=250)
    interface.render()

# Start the app as soon as the module is executed.
demo.launch()
|
|