import os
import shutil
import gradio as gr
from huggingface_hub import HfApi, whoami, ModelCard, model_info
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from textwrap import dedent
from pathlib import Path
from tempfile import TemporaryDirectory
from huggingface_hub.file_download import repo_folder_name
from optimum.exporters import TasksManager
from optimum.intel import (
    OVModelForAudioClassification,
    OVModelForCausalLM,
    OVModelForFeatureExtraction,
    OVModelForImageClassification,
    OVModelForMaskedLM,
    OVModelForQuestionAnswering,
    OVModelForSeq2SeqLM,
    OVModelForSequenceClassification,
    OVModelForTokenClassification,
    OVStableDiffusionPipeline,
    OVStableDiffusionXLPipeline,
    OVLatentConsistencyModelPipeline,
    OVWeightQuantizationConfig,
)
from diffusers import ConfigMixin
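
# Map each supported task to the optimum-intel class used to load, quantize and export the model.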
_HEAD_TO_AUTOMODELS = {
"feature-extraction": "OVModelForFeatureExtraction",
"fill-mask": "OVModelForMaskedLM",
"text-generation": "OVModelForCausalLM",
"text-classification": "OVModelForSequenceClassification",
"token-classification": "OVModelForTokenClassification",
"question-answering": "OVModelForQuestionAnswering",
"image-classification": "OVModelForImageClassification",
"audio-classification": "OVModelForAudioClassification",
"stable-diffusion": "OVStableDiffusionPipeline",
"stable-diffusion-xl": "OVStableDiffusionXLPipeline",
"latent-consistency": "OVLatentConsistencyModelPipeline",
}


def quantize_model(
    model_id: str,
    dtype: str,
    calibration_dataset: str,
    ratio: float,
    private_repo: bool,
    overwritte: bool,
    oauth_token: gr.OAuthToken,
):
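    """Apply NNCF weight-only quantization to `model_id`, export it to OpenVINO
    and push the result to a new repository under the user's namespace."""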
    if oauth_token.token is None:
        return "You must be logged in to use this space"
    if not model_id:
        return f"### Invalid input 🐞 Please specify a model name, got {model_id}"

    try:
        model_name = model_id.split("/")[-1]
        username = whoami(oauth_token.token)["name"]
        w_t = dtype.replace("-", "")
        suffix = f"{w_t}" if model_name.endswith("openvino") else f"openvino-{w_t}"
        new_repo_id = f"{username}/{model_name}-{suffix}"
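
        # Infer whether the model is a diffusers pipeline or a transformers model to determine the export task.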
        library_name = TasksManager.infer_library_from_model(model_id, token=oauth_token.token)

        if library_name == "diffusers":
            ConfigMixin.config_name = "model_index.json"
            class_name = ConfigMixin.load_config(model_id, token=oauth_token.token)["_class_name"].lower()
            if "xl" in class_name:
                task = "stable-diffusion-xl"
            elif "consistency" in class_name:
                task = "latent-consistency"
            else:
                task = "stable-diffusion"
        else:
            task = TasksManager.infer_task_from_model(model_id, token=oauth_token.token)

        if task == "text2text-generation":
            return "Export of Seq2Seq models is currently disabled."

        if task not in _HEAD_TO_AUTOMODELS:
            return f"The task '{task}' is not supported, only the following tasks are: {', '.join(_HEAD_TO_AUTOMODELS.keys())}"

        auto_model_class = _HEAD_TO_AUTOMODELS[task]
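
        # Data-aware AWQ is only used for 4-bit quantization with a calibration dataset;
        # otherwise data-free (default) weight quantization is applied and any provided dataset is ignored.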
if calibration_dataset == "None":
calibration_dataset = None
is_int8 = dtype == "8-bit"
# if library_name == "diffusers":
# quant_method = "hybrid"
if not is_int8 and calibration_dataset is not None:
quant_method = "awq"
else:
if calibration_dataset is not None:
print("Default quantization was selected, calibration dataset won't be used")
quant_method = "default"
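
        # Weight-only quantization config: for 8-bit the ratio is forced to 1.0, and the calibration
        # dataset and sample count are only passed for data-aware (AWQ) quantization.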
        quantization_config = OVWeightQuantizationConfig(
            bits=8 if is_int8 else 4,
            quant_method=quant_method,
            dataset=None if quant_method == "default" else calibration_dataset,
            ratio=1.0 if is_int8 else ratio,
            num_samples=None if quant_method == "default" else 20,
        )

        api = HfApi(token=oauth_token.token)
        if api.repo_exists(new_repo_id) and not overwritte:
            return f"Model {new_repo_id} already exists, please tick the overwrite box to push to an existing repository"
        with TemporaryDirectory() as d:
            folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
            os.makedirs(folder)
            try:
                api.snapshot_download(repo_id=model_id, local_dir=folder, allow_patterns=["*.json"])
                ov_model = eval(auto_model_class).from_pretrained(
                    model_id,
                    cache_dir=folder,
                    token=oauth_token.token,
                    quantization_config=quantization_config,
                )
                ov_model.save_pretrained(folder)

                new_repo_url = api.create_repo(repo_id=new_repo_id, exist_ok=True, private=private_repo)
                new_repo_id = new_repo_url.repo_id
                print("Repository created successfully!", new_repo_url)

                folder = Path(folder)
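                # Upload every generated file; the diffusers sub-folders only exist for diffusion pipelines.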
                for dir_name in (
                    "",
                    "vae_encoder",
                    "vae_decoder",
                    "text_encoder",
                    "text_encoder_2",
                    "unet",
                    "tokenizer",
                    "tokenizer_2",
                    "scheduler",
                    "feature_extractor",
                ):
                    if not (folder / dir_name).is_dir():
                        continue
                    for file_path in (folder / dir_name).iterdir():
                        if file_path.is_file():
                            try:
                                api.upload_file(
                                    path_or_fileobj=file_path,
                                    path_in_repo=os.path.join(dir_name, file_path.name),
                                    repo_id=new_repo_id,
                                )
                            except Exception as e:
                                return f"Error uploading file {file_path}: {e}"
                try:
                    card = ModelCard.load(model_id, token=oauth_token.token)
                except Exception:
                    card = ModelCard("")
                if card.data.tags is None:
                    card.data.tags = []
                if "openvino" not in card.data.tags:
                    card.data.tags.append("openvino")
                card.data.tags.append("nncf")
                card.data.tags.append(dtype)
                card.data.base_model = model_id

                card.text = dedent(
                    f"""
                    This model is a quantized version of [`{model_id}`](https://huggingface.co/{model_id}) and is converted to the OpenVINO format. This model was obtained via the [nncf-quantization](https://huggingface.co/spaces/echarlaix/nncf-quantization) space with [optimum-intel](https://github.com/huggingface/optimum-intel).

                    First make sure you have `optimum-intel` installed:

                    ```bash
                    pip install optimum[openvino]
                    ```

                    To load your model you can do as follows:

                    ```python
                    from optimum.intel import {auto_model_class}

                    model_id = "{new_repo_id}"
                    model = {auto_model_class}.from_pretrained(model_id)
                    ```
                    """
                )
                card_path = os.path.join(folder, "README.md")
                card.save(card_path)

                api.upload_file(
                    path_or_fileobj=card_path,
                    path_in_repo="README.md",
                    repo_id=new_repo_id,
                )
return f"This model was successfully quantized, find it under your repository {new_repo_url}"
finally:
shutil.rmtree(folder, ignore_errors=True)
except Exception as e:
return f"### Error: {e}"


DESCRIPTION = """
This Space uses [Optimum Intel](https://github.com/huggingface/optimum-intel) to automatically apply NNCF [Weight Only Quantization](https://huggingface.co/docs/optimum/main/en/intel/openvino/optimization) (WOQ) on your model and convert it to the [OpenVINO format](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) if it is not already.

After conversion, a repository will be pushed under your namespace with the resulting model.

The list of supported architectures can be found in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/openvino/models).
"""
model_id = HuggingfaceHubSearch(
    label="Hub Model ID",
    placeholder="Search for model id on the hub",
    search_type="model",
)

dtype = gr.Dropdown(
    ["8-bit", "4-bit"],
    value="8-bit",
    label="Weights precision",
    filterable=False,
    visible=True,
)

"""
quant_method = gr.Dropdown(
    ["default", "awq", "hybrid"],
    value="default",
    label="Quantization method",
    filterable=False,
    visible=True,
)
"""

calibration_dataset = gr.Dropdown(
    [
        "None",
        "wikitext2",
        "c4",
        "c4-new",
        "conceptual_captions",
        "laion/220k-GPT4Vision-captions-from-LIVIS",
        "laion/filtered-wit",
    ],
    value="None",
    label="Calibration dataset",
    filterable=False,
    visible=True,
)

ratio = gr.Slider(
    label="Ratio",
    info="Parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization",
    minimum=0.0,
    maximum=1.0,
    step=0.1,
    value=1.0,
)

private_repo = gr.Checkbox(
    value=False,
    label="Private repository",
    info="Create a private repository instead of a public one",
)

overwritte = gr.Checkbox(
    value=False,
    label="Overwrite repository content",
    info="Enable pushing files to existing repositories, potentially overwriting existing files",
)
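
# The `oauth_token` parameter of `quantize_model` is not listed in `inputs`:
# Gradio injects it automatically for functions that take a `gr.OAuthToken` argument.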
interface = gr.Interface(
    fn=quantize_model,
    inputs=[
        model_id,
        dtype,
        calibration_dataset,
        ratio,
        private_repo,
        overwritte,
    ],
    outputs=[
        gr.Markdown(label="output"),
    ],
    title="Quantize your model with NNCF",
    description=DESCRIPTION,
    api_name=False,
)

with gr.Blocks() as demo:
    gr.Markdown("You must be logged in to use this space")
    gr.LoginButton(min_width=250)
    interface.render()

demo.launch()