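"""Gradio Space that applies NNCF weight-only quantization to a model hosted on the
Hugging Face Hub using Optimum Intel, converting it to the OpenVINO format if needed,
and pushes the quantized model under the user's namespace."""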
import os
import shutil
import gradio as gr
from huggingface_hub import HfApi, whoami, ModelCard
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from textwrap import dedent
from pathlib import Path
from tempfile import TemporaryDirectory
from huggingface_hub.file_download import repo_folder_name
from optimum.exporters.tasks import TasksManager
from optimum.intel.utils.constant import _TASK_ALIASES
from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS
from optimum.intel.utils.modeling_utils import _find_files_matching_pattern
from optimum.intel import (
OVModelForAudioClassification,
OVModelForCausalLM,
OVModelForFeatureExtraction,
OVModelForImageClassification,
OVModelForMaskedLM,
OVModelForQuestionAnswering,
OVModelForSeq2SeqLM,
OVModelForSequenceClassification,
OVModelForTokenClassification,
OVStableDiffusionPipeline,
OVStableDiffusionXLPipeline,
OVLatentConsistencyModelPipeline,
OVModelForPix2Struct,
OVWeightQuantizationConfig,
)
def quantize_model(
model_id: str,
dtype: str,
calibration_dataset: str,
    ratio: float,
private_repo: bool,
    overwrite: bool,
oauth_token: gr.OAuthToken,
):
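    """Quantize the weights of `model_id` to `dtype` with NNCF, exporting the model
    to the OpenVINO format if needed, and push the result to a new Hub repo named
    `{username}/{model_name}-openvino-{dtype}`.
    """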
if oauth_token.token is None:
raise ValueError("You must be logged in to use this space")
model_name = model_id.split("/")[-1]
username = whoami(oauth_token.token)["name"]
new_repo_id = f"{username}/{model_name}-openvino-{dtype}"
task = TasksManager.infer_task_from_model(model_id)
library_name = TasksManager.infer_library_from_model(model_id)
# task = TasksManager.infer_task_from_model(model_id, token=oauth_token.token)
# library_name = TasksManager.infer_library_from_model(model_id, token=oauth_token.token)
    if task not in _HEAD_TO_AUTOMODELS:
        raise ValueError(
            f"The task '{task}' is not supported, only {list(_HEAD_TO_AUTOMODELS.keys())} tasks are supported"
        )
if task == "text2text-generation":
raise ValueError("Export of Seq2Seq models is currently disabled.")
auto_model_class = _HEAD_TO_AUTOMODELS[task]
ov_files = _find_files_matching_pattern(
model_id,
        pattern=r"(.*)?openvino(.*)?_model.xml",
use_auth_token=oauth_token.token,
)
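    # Export to OpenVINO only if the source repo does not already contain OpenVINO IR files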
export = len(ov_files) == 0
is_int8 = dtype == "int8"
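    # Choose the weight compression method: hybrid for diffusers pipelines,
    # data-aware AWQ for 4-bit, and the default data-free method for int8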
if library_name == "diffusers":
quant_method = "hybrid"
elif not is_int8:
quant_method = "awq"
else:
quant_method = "default"
quantization_config = OVWeightQuantizationConfig(
bits=8 if is_int8 else 4,
quant_method=quant_method,
        dataset=None if quant_method == "default" else calibration_dataset,
ratio=1.0 if is_int8 else ratio,
)
api = HfApi(token=oauth_token.token)
    if api.repo_exists(new_repo_id) and not overwrite:
        raise Exception(f"Model {new_repo_id} already exists, please set overwrite=True to push to an existing repo")
with TemporaryDirectory() as d:
folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
os.makedirs(folder)
try:
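            # Download the model's JSON configuration files into the working folder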
api.snapshot_download(repo_id=model_id, local_dir=folder, allow_patterns=["*.json"])
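            # Resolve the OVModel class from its name (all candidate classes are imported above)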
ov_model = eval(auto_model_class).from_pretrained(
model_id,
export=export,
cache_dir=folder,
token=oauth_token.token,
quantization_config=quantization_config
)
ov_model.save_pretrained(folder)
new_repo_url = api.create_repo(repo_id=new_repo_id, exist_ok=True, private=private_repo)
new_repo_id = new_repo_url.repo_id
print("Repo created successfully!", new_repo_url)
folder = Path(folder)
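            # Upload every model file, including the subfolders produced for diffusers pipelines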
for dir_name in (
"",
"vae_encoder",
"vae_decoder",
"text_encoder",
"text_encoder_2",
"unet",
"tokenizer",
"tokenizer_2",
"scheduler",
"feature_extractor",
):
if not (folder / dir_name).is_dir():
continue
for file_path in (folder / dir_name).iterdir():
if file_path.is_file():
try:
api.upload_file(
path_or_fileobj=file_path,
path_in_repo=os.path.join(dir_name, file_path.name),
repo_id=new_repo_id,
)
except Exception as e:
raise Exception(f"Error uploading file {file_path}: {e}")
            try:
                card = ModelCard.load(model_id, token=oauth_token.token)
            except Exception:
                # Fall back to an empty card if the source model has none
                card = ModelCard("")
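            # Tag the generated card and record the source model as its base model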
if card.data.tags is None:
card.data.tags = []
card.data.tags.append("openvino")
card.data.base_model = model_id
            card.text = dedent(
                f"""
                This model is a quantized version of [`{model_id}`](https://huggingface.co/{model_id}) and was exported to the OpenVINO format using [optimum-intel](https://github.com/huggingface/optimum-intel) via the [nncf-quantization](https://huggingface.co/spaces/echarlaix/nncf-quantization) space.

                First make sure you have optimum-intel installed:

                ```bash
                pip install optimum[openvino]
                ```

                To load your model, you can run the following:

                ```python
                from optimum.intel import {auto_model_class}

                model_id = "{new_repo_id}"
                model = {auto_model_class}.from_pretrained(model_id)
                ```
                """
            )
card_path = os.path.join(folder, "README.md")
card.save(card_path)
api.upload_file(
path_or_fileobj=card_path,
path_in_repo="README.md",
repo_id=new_repo_id,
)
            return f"This model was successfully quantized, find it under your repo {new_repo_url}"
finally:
shutil.rmtree(folder, ignore_errors=True)
DESCRIPTION = """
This Space uses [Optimum Intel](https://huggingface.co/docs/optimum/main/en/intel/openvino/optimization) to automatically apply NNCF weight-only quantization to a model hosted on the [Hub](https://huggingface.co/models), converting it to the [OpenVINO format](https://docs.openvino.ai/2024/documentation/openvino-ir-format.html) if it is not already.
The resulting model will then be pushed under your HF user namespace. For now, only models hosted on public repositories are supported.
"""
model_id = HuggingfaceHubSearch(
label="Hub Model ID",
placeholder="Search for model id on the hub",
search_type="model",
)
dtype = gr.Dropdown(
["int8", "int4"],
value="int8",
label="Precision data types",
filterable=False,
visible=True,
)
"""
quant_method = gr.Dropdown(
["default", "awq", "hybrid"],
value="default",
label="Quantization method",
filterable=False,
visible=True,
)
"""
calibration_dataset = gr.Dropdown(
[
"wikitext2",
"c4",
"c4-new",
"conceptual_captions",
"laion/220k-GPT4Vision-captions-from-LIVIS",
"laion/filtered-wit",
],
value="wikitext2",
label="Calibration dataset",
filterable=False,
visible=True,
)
ratio = gr.Slider(
label="Ratio",
info="Parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization",
minimum=0.0,
maximum=1.0,
step=0.1,
value=1.0,
)
private_repo = gr.Checkbox(
value=False,
label="Private Repo",
info="Create a private repo under your username",
)
overwrite = gr.Checkbox(
    value=False,
    label="Overwrite repo content",
    info="Push files to an existing repo, potentially overwriting existing files",
)
interface = gr.Interface(
fn=quantize_model,
inputs=[
model_id,
dtype,
calibration_dataset,
ratio,
private_repo,
        overwrite,
],
outputs=[
gr.Markdown(label="output"),
],
title="Quantize your model with NNCF",
description=DESCRIPTION,
api_name=False,
)
with gr.Blocks() as demo:
gr.Markdown("You must be logged in to use this space")
gr.LoginButton(min_width=250)
interface.render()
demo.launch()