From 056ed5810b0c9135683e6f79e527b959c9b5fd48 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Mon, 23 Feb 2026 17:53:58 +0400 Subject: [PATCH 01/16] init --- .../openvino/stable_diffusion/export_lcm.py | 141 +++++++++++++++++- 1 file changed, 135 insertions(+), 6 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index d0d678c0b75..32489be0a31 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -13,12 +13,17 @@ import torch from executorch.backends.openvino.partitioner import OpenvinoPartitioner +from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model from executorch.examples.models.stable_diffusion.model import ( # type: ignore[import-untyped] LCMModelLoader, ) from executorch.exir import ExecutorchBackendConfig, to_edge_transform_and_lower from executorch.exir.backend.backend_details import CompileSpec from torch.export import export +from torchao.quantization.pt2e.quantizer.quantizer import Quantizer +import nncf +import datasets +from tqdm import tqdm # Configure logging logging.basicConfig(level=logging.INFO) @@ -31,18 +36,117 @@ class LCMOpenVINOExporter: def __init__( self, model_id: str = "SimianLuo/LCM_Dreamshaper_v7", + is_quantization_enabled = False, dtype: torch.dtype = torch.float16, ): + if(is_quantization_enabled): + logger.warning("Quantization requires float32, overriding dtype from float16 to float32.") + dtype = torch.float32 + self.is_quantization_enabled = is_quantization_enabled self.model_loader = LCMModelLoader(model_id=model_id, dtype=dtype) def load_models(self) -> bool: """Load the LCM pipeline and extract components""" return self.model_loader.load_models() + def should_quantize_model(self, sd_model_type): + """ + If this is true, then we should quantize activations and weights. Otherwise, only compress the weights. 
+ + :param sd_model_type: the type of model in the stable diffusion pipeline such as Unet, text encoder, VAE etc. + """ + return sd_model_type == "unet" + + def get_ov_quantizer(self, sd_model_type: str) -> Quantizer: + quantization_mode = QuantizationMode.INT8WO_ASYM + if self.should_quantize_model(sd_model_type): + # Only Unet model will have both weights and activations quantized. + quantization_mode = QuantizationMode.INT8_TRANSFORMER + + quantizer = OpenVINOQuantizer(mode=quantization_mode) + return quantizer + + def get_unet_calibration_dataset(self, calibration_dataset_size=200, num_inference_steps=4): + class UNetWrapper(torch.nn.Module): + def __init__(self, model, config): + super().__init__() + self.model = model + self.config = config + self.captured_args = [] + + def _pick(self, name: str, args, kwargs, idx: int): + if name in kwargs and kwargs[name] is not None: + return kwargs[name] + if len(args) > idx: + return args[idx] + raise KeyError(f"Missing required UNet input: {name}") + + def forward(self, *args, **kwargs): + """ + obtain and pass each input individually to ensure the order is maintained + and the right values are being passed according to the expected inputs by + the OpenVINO LCM runner. 
+ """ + sample = self._pick("sample", args, kwargs, 0) + timestep = self._pick("timestep", args, kwargs, 1) + encoder_hidden_states = self._pick("encoder_hidden_states", args, kwargs, 2) + timestep = timestep.unsqueeze(0) if len(timestep.shape) == 0 and isinstance(timestep, torch.Tensor) else timestep + unet_args = (sample, timestep, encoder_hidden_states,) + self.captured_args.append(unet_args) + return self.model(*unet_args) + + pipeline = self.model_loader.pipeline + calibration_data = [] + dataset = datasets.load_dataset( + "google-research-datasets/conceptual_captions", + split="train", + trust_remote_code=True, + ).shuffle(seed=42) + + wrapped_unet = UNetWrapper(pipeline.unet, pipeline.unet.config) + original_unet = pipeline.unet + pipeline.unet = wrapped_unet + # Run inference for data collection + pbar = tqdm(total=calibration_dataset_size) + for batch in dataset: + prompt = batch["caption"] + if len(prompt.split()) > pipeline.tokenizer.model_max_length: + continue + # Run the pipeline + image = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512) + calibration_data.extend(wrapped_unet.captured_args) + wrapped_unet.captured_args = [] + pbar.update(len(calibration_data) - pbar.n) + if pbar.n >= calibration_dataset_size: + break + pipeline.unet = original_unet + return calibration_data + + def maybe_quantize_model(self, model, sd_model_type, is_quantization_enabled): + if(not is_quantization_enabled): + return model + quantized_model = model + ov_quantizer = self.get_ov_quantizer(sd_model_type) + if self.should_quantize_model(sd_model_type): + # Quantize activations for the Unet Model. Other models are weights-only quantized. 
+ calibration_dataset = self.get_unet_calibration_dataset() + from nncf.quantization.range_estimator import RangeEstimatorParametersSet + quantized_model = quantize_model(model, mode=QuantizationMode.INT8_TRANSFORMER, + calibration_dataset=calibration_dataset, + smooth_quant=True, + activations_range_estimator_params=RangeEstimatorParametersSet.MINMAX, + weights_range_estimator_params=RangeEstimatorParametersSet.MINMAX) + else: + quantized_model = nncf.experimental.torch.fx.compress_pt2e(model, quantizer=ov_quantizer) + return quantized_model + def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: """Export CLIP text encoder to PTE file""" try: logger.info("Exporting text encoder with OpenVINO backend...") + + sd_model_type = "text_encoder" + is_quantization_enabled = self.is_quantization_enabled # Get wrapped model and dummy inputs text_encoder_wrapper = self.model_loader.get_text_encoder_wrapper() @@ -50,7 +154,12 @@ def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: # Export to ATEN graph exported_program = export( - text_encoder_wrapper, dummy_inputs["text_encoder"] + text_encoder_wrapper, dummy_inputs[sd_model_type] + ) + exported_program_module = self.maybe_quantize_model(exported_program.module(), sd_model_type, is_quantization_enabled) + # Re-export the quantized torch.fx.GraphModule to ExportedProgram + exported_program = export( + exported_program_module, dummy_inputs[sd_model_type] ) # Configure OpenVINO compilation @@ -85,13 +194,20 @@ def export_unet(self, output_path: str, device: str = "CPU") -> bool: """Export UNet model to PTE file""" try: logger.info("Exporting UNet model with OpenVINO backend...") - + sd_model_type = "unet" + is_quantization_enabled = self.is_quantization_enabled + # Get wrapped model and dummy inputs unet_wrapper = self.model_loader.get_unet_wrapper() dummy_inputs = self.model_loader.get_dummy_inputs() # Export to ATEN graph - exported_program = export(unet_wrapper, 
dummy_inputs["unet"]) + exported_program = export(unet_wrapper, dummy_inputs[sd_model_type]) + exported_program_module = self.maybe_quantize_model(exported_program.module(), sd_model_type, is_quantization_enabled) + # Re-export the quantized torch.fx.GraphModule to ExportedProgram + exported_program = export( + exported_program_module, dummy_inputs[sd_model_type] + ) # Configure OpenVINO compilation compile_spec = [CompileSpec("device", device.encode())] @@ -125,13 +241,20 @@ def export_vae_decoder(self, output_path: str, device: str = "CPU") -> bool: """Export VAE decoder to PTE file""" try: logger.info("Exporting VAE decoder with OpenVINO backend...") + sd_model_type = "vae_decoder" + is_quantization_enabled = self.is_quantization_enabled # Get wrapped model and dummy inputs vae_decoder = self.model_loader.get_vae_decoder() dummy_inputs = self.model_loader.get_dummy_inputs() # Export to ATEN graph - exported_program = export(vae_decoder, dummy_inputs["vae_decoder"]) + exported_program = export(vae_decoder, dummy_inputs[sd_model_type]) + exported_program_module = self.maybe_quantize_model(exported_program.module(), sd_model_type, is_quantization_enabled) + # Re-export the quantized torch.fx.GraphModule to ExportedProgram + exported_program = export( + exported_program_module, dummy_inputs[sd_model_type] + ) # Configure OpenVINO compilation compile_spec = [CompileSpec("device", device.encode())] @@ -207,6 +330,12 @@ def create_argument_parser(): help="HuggingFace model ID for LCM (default: SimianLuo/LCM_Dreamshaper_v7)", ) + parser.add_argument( + "--quantize", + action="store_true", + help="Whether the Models should be quantized.", + ) + parser.add_argument( "--output_dir", type=str, @@ -245,7 +374,7 @@ def main() -> int: logger.info("=" * 60) logger.info("LCM Model Export") logger.info(f"Model: {args.model_id}") - logger.info(f"Device: {args.device} | Dtype: {args.dtype}") + logger.info(f"Device: {args.device} | Dtype: {args.dtype} | Quantize: {args.quantize}") 
logger.info("=" * 60) # Map dtype string to torch dtype @@ -253,7 +382,7 @@ def main() -> int: dtype = dtype_map[args.dtype] # Create exporter and load models - exporter = LCMOpenVINOExporter(args.model_id, dtype=dtype) + exporter = LCMOpenVINOExporter(args.model_id, is_quantization_enabled=args.quantize, dtype=dtype) if not exporter.load_models(): logger.error("Failed to load models") From 810214df354ad1841e5d3f3a3608618e25f025b9 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Mar 2026 11:15:47 +0400 Subject: [PATCH 02/16] update readme --- examples/openvino/stable_diffusion/README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/examples/openvino/stable_diffusion/README.md b/examples/openvino/stable_diffusion/README.md index 1f5882c53a7..5bfa365afa6 100644 --- a/examples/openvino/stable_diffusion/README.md +++ b/examples/openvino/stable_diffusion/README.md @@ -25,6 +25,16 @@ python export_lcm.py \ --device CPU \ --dtype fp16 ``` + +To quantize the Unet and compress other models +```bash +python export_lcm.py \ + --model_id SimianLuo/LCM_Dreamshaper_v7 \ + --output_dir ./lcm_models \ + --device CPU \ + --quantize +``` + This will create three files in `./lcm_models/`: - `text_encoder.pte` - `unet.pte` @@ -33,6 +43,7 @@ This will create three files in `./lcm_models/`: ### Generate Images Run inference with the exported model: +Note: For quantized models, we currently only support running the runtime dtype should be FP32 ```bash python openvino_lcm.py \ From a7a41e3e1b3ad5ddbcedefcd6aa2d3242245d641 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Mar 2026 12:02:20 +0400 Subject: [PATCH 03/16] lint --- .../openvino/stable_diffusion/export_lcm.py | 137 ++++++++++++------ 1 file changed, 91 insertions(+), 46 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 32489be0a31..223d277eaf0 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ 
b/examples/openvino/stable_diffusion/export_lcm.py @@ -10,10 +10,17 @@ import logging import os +import datasets # type: ignore[import-untyped] +import nncf # type: ignore[import-untyped] + import torch from executorch.backends.openvino.partitioner import OpenvinoPartitioner -from executorch.backends.openvino.quantizer import OpenVINOQuantizer, QuantizationMode, quantize_model +from executorch.backends.openvino.quantizer import ( + OpenVINOQuantizer, + QuantizationMode, + quantize_model, +) from executorch.examples.models.stable_diffusion.model import ( # type: ignore[import-untyped] LCMModelLoader, ) @@ -21,9 +28,7 @@ from executorch.exir.backend.backend_details import CompileSpec from torch.export import export from torchao.quantization.pt2e.quantizer.quantizer import Quantizer -import nncf -import datasets -from tqdm import tqdm +from tqdm import tqdm # type: ignore[import-untyped] # Configure logging logging.basicConfig(level=logging.INFO) @@ -36,11 +41,13 @@ class LCMOpenVINOExporter: def __init__( self, model_id: str = "SimianLuo/LCM_Dreamshaper_v7", - is_quantization_enabled = False, + is_quantization_enabled=False, dtype: torch.dtype = torch.float16, ): - if(is_quantization_enabled): - logger.warning("Quantization requires float32, overriding dtype from float16 to float32.") + if is_quantization_enabled: + logger.warning( + "Quantization requires float32, overriding dtype from float16 to float32." + ) dtype = torch.float32 self.is_quantization_enabled = is_quantization_enabled self.model_loader = LCMModelLoader(model_id=model_id, dtype=dtype) @@ -52,7 +59,7 @@ def load_models(self) -> bool: def should_quantize_model(self, sd_model_type): """ If this is true, then we should quantize activations and weights. Otherwise, only compress the weights. - + :param sd_model_type: the type of model in the stable diffusion pipeline such as Unet, text encoder, VAE etc. 
""" return sd_model_type == "unet" @@ -66,7 +73,9 @@ def get_ov_quantizer(self, sd_model_type: str) -> Quantizer: quantizer = OpenVINOQuantizer(mode=quantization_mode) return quantizer - def get_unet_calibration_dataset(self, calibration_dataset_size=200, num_inference_steps=4): + def get_unet_calibration_dataset( + self, calibration_dataset_size=200, num_inference_steps=4 + ): class UNetWrapper(torch.nn.Module): def __init__(self, model, config): super().__init__() @@ -83,15 +92,25 @@ def _pick(self, name: str, args, kwargs, idx: int): def forward(self, *args, **kwargs): """ - obtain and pass each input individually to ensure the order is maintained - and the right values are being passed according to the expected inputs by + obtain and pass each input individually to ensure the order is maintained + and the right values are being passed according to the expected inputs by the OpenVINO LCM runner. """ sample = self._pick("sample", args, kwargs, 0) timestep = self._pick("timestep", args, kwargs, 1) - encoder_hidden_states = self._pick("encoder_hidden_states", args, kwargs, 2) - timestep = timestep.unsqueeze(0) if len(timestep.shape) == 0 and isinstance(timestep, torch.Tensor) else timestep - unet_args = (sample, timestep, encoder_hidden_states,) + encoder_hidden_states = self._pick( + "encoder_hidden_states", args, kwargs, 2 + ) + timestep = ( + timestep.unsqueeze(0) + if len(timestep.shape) == 0 and isinstance(timestep, torch.Tensor) + else timestep + ) + unet_args = ( + sample, + timestep, + encoder_hidden_states, + ) self.captured_args.append(unet_args) return self.model(*unet_args) @@ -113,7 +132,9 @@ def forward(self, *args, **kwargs): if len(prompt.split()) > pipeline.tokenizer.model_max_length: continue # Run the pipeline - image = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512) + pipeline( + prompt, num_inference_steps=num_inference_steps, height=512, width=512 + ) calibration_data.extend(wrapped_unet.captured_args) 
wrapped_unet.captured_args = [] pbar.update(len(calibration_data) - pbar.n) @@ -123,43 +144,65 @@ def forward(self, *args, **kwargs): return calibration_data def maybe_quantize_model(self, model, sd_model_type, is_quantization_enabled): - if(not is_quantization_enabled): + model = ( + model.module() if isinstance(model, torch.export.ExportedProgram) else model + ) + if not is_quantization_enabled: return model quantized_model = model ov_quantizer = self.get_ov_quantizer(sd_model_type) if self.should_quantize_model(sd_model_type): # Quantize activations for the Unet Model. Other models are weights-only quantized. calibration_dataset = self.get_unet_calibration_dataset() - from nncf.quantization.range_estimator import RangeEstimatorParametersSet - quantized_model = quantize_model(model, mode=QuantizationMode.INT8_TRANSFORMER, - calibration_dataset=calibration_dataset, - smooth_quant=True, - activations_range_estimator_params=RangeEstimatorParametersSet.MINMAX, - weights_range_estimator_params=RangeEstimatorParametersSet.MINMAX) + quantized_model = quantize_model( + model, + mode=QuantizationMode.INT8_TRANSFORMER, + calibration_dataset=calibration_dataset, + smooth_quant=True, + ) else: - quantized_model = nncf.experimental.torch.fx.compress_pt2e(model, quantizer=ov_quantizer) + quantized_model = nncf.experimental.torch.fx.compress_pt2e( + model, quantizer=ov_quantizer + ) return quantized_model + def _export_and_maybe_quantize( + self, model, dummy_inputs, sd_model_type, is_quantization_enabled + ): + exported_program = export(model, dummy_inputs) + exported_program_module = self.maybe_quantize_model( + exported_program.module(), sd_model_type, is_quantization_enabled + ) + # Re-export the quantized torch.fx.GraphModule to ExportedProgram + exported_program = export(exported_program_module, dummy_inputs) + return exported_program + def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: """Export CLIP text encoder to PTE file""" try: 
logger.info("Exporting text encoder with OpenVINO backend...") - + sd_model_type = "text_encoder" - is_quantization_enabled = self.is_quantization_enabled # Get wrapped model and dummy inputs text_encoder_wrapper = self.model_loader.get_text_encoder_wrapper() dummy_inputs = self.model_loader.get_dummy_inputs() # Export to ATEN graph - exported_program = export( - text_encoder_wrapper, dummy_inputs[sd_model_type] + exported_program = self._export_and_maybe_quantize( + text_encoder_wrapper, + dummy_inputs[sd_model_type], + sd_model_type, + self.is_quantization_enabled, ) - exported_program_module = self.maybe_quantize_model(exported_program.module(), sd_model_type, is_quantization_enabled) - # Re-export the quantized torch.fx.GraphModule to ExportedProgram - exported_program = export( - exported_program_module, dummy_inputs[sd_model_type] + + # Configure OpenVINO compilation + compile_spec = [CompileSpec("device", device.encode())] + partitioner = OpenvinoPartitioner(compile_spec) + + # Lower to edge dialect and apply OpenVINO backend + edge_manager = to_edge_transform_and_lower( + exported_program, partitioner=[partitioner] ) # Configure OpenVINO compilation @@ -195,18 +238,17 @@ def export_unet(self, output_path: str, device: str = "CPU") -> bool: try: logger.info("Exporting UNet model with OpenVINO backend...") sd_model_type = "unet" - is_quantization_enabled = self.is_quantization_enabled - + # Get wrapped model and dummy inputs unet_wrapper = self.model_loader.get_unet_wrapper() dummy_inputs = self.model_loader.get_dummy_inputs() # Export to ATEN graph - exported_program = export(unet_wrapper, dummy_inputs[sd_model_type]) - exported_program_module = self.maybe_quantize_model(exported_program.module(), sd_model_type, is_quantization_enabled) - # Re-export the quantized torch.fx.GraphModule to ExportedProgram - exported_program = export( - exported_program_module, dummy_inputs[sd_model_type] + exported_program = self._export_and_maybe_quantize( + unet_wrapper, + 
dummy_inputs[sd_model_type], + sd_model_type, + self.is_quantization_enabled, ) # Configure OpenVINO compilation @@ -242,18 +284,17 @@ def export_vae_decoder(self, output_path: str, device: str = "CPU") -> bool: try: logger.info("Exporting VAE decoder with OpenVINO backend...") sd_model_type = "vae_decoder" - is_quantization_enabled = self.is_quantization_enabled # Get wrapped model and dummy inputs vae_decoder = self.model_loader.get_vae_decoder() dummy_inputs = self.model_loader.get_dummy_inputs() # Export to ATEN graph - exported_program = export(vae_decoder, dummy_inputs[sd_model_type]) - exported_program_module = self.maybe_quantize_model(exported_program.module(), sd_model_type, is_quantization_enabled) - # Re-export the quantized torch.fx.GraphModule to ExportedProgram - exported_program = export( - exported_program_module, dummy_inputs[sd_model_type] + exported_program = self._export_and_maybe_quantize( + vae_decoder, + dummy_inputs[sd_model_type], + sd_model_type, + self.is_quantization_enabled, ) # Configure OpenVINO compilation @@ -374,7 +415,9 @@ def main() -> int: logger.info("=" * 60) logger.info("LCM Model Export") logger.info(f"Model: {args.model_id}") - logger.info(f"Device: {args.device} | Dtype: {args.dtype} | Quantize: {args.quantize}") + logger.info( + f"Device: {args.device} | Dtype: {args.dtype} | Quantize: {args.quantize}" + ) logger.info("=" * 60) # Map dtype string to torch dtype @@ -382,7 +425,9 @@ def main() -> int: dtype = dtype_map[args.dtype] # Create exporter and load models - exporter = LCMOpenVINOExporter(args.model_id, is_quantization_enabled=args.quantize, dtype=dtype) + exporter = LCMOpenVINOExporter( + args.model_id, is_quantization_enabled=args.quantize, dtype=dtype + ) if not exporter.load_models(): logger.error("Failed to load models") From a16420979f8213b4bbed993006f7658b1a265b15 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Mar 2026 14:16:06 +0400 Subject: [PATCH 04/16] fix bugs; introduce support for fp16; add enum 
to maintain stable diffusion component model names reliable --- examples/models/stable_diffusion/__init__.py | 4 +- examples/models/stable_diffusion/model.py | 15 +- examples/openvino/stable_diffusion/README.md | 2 +- .../openvino/stable_diffusion/export_lcm.py | 170 +++++++++++------- .../stable_diffusion/requirements.txt | 1 + 5 files changed, 120 insertions(+), 72 deletions(-) diff --git a/examples/models/stable_diffusion/__init__.py b/examples/models/stable_diffusion/__init__.py index b749e67df66..eccf0e414af 100644 --- a/examples/models/stable_diffusion/__init__.py +++ b/examples/models/stable_diffusion/__init__.py @@ -4,6 +4,6 @@ # except in compliance with the License. See the license file found in the # LICENSE file in the root directory of this source tree. -from .model import LCMModelLoader, TextEncoderWrapper, UNetWrapper, VAEDecoder +from .model import LCMModelLoader, StableDiffusionComponent, TextEncoderWrapper, UNetWrapper, VAEDecoder -__all__ = ["LCMModelLoader", "TextEncoderWrapper", "UNetWrapper", "VAEDecoder"] +__all__ = ["LCMModelLoader", "StableDiffusionComponent", "TextEncoderWrapper", "UNetWrapper", "VAEDecoder"] diff --git a/examples/models/stable_diffusion/model.py b/examples/models/stable_diffusion/model.py index 6c55e2bb173..3590656d6d3 100644 --- a/examples/models/stable_diffusion/model.py +++ b/examples/models/stable_diffusion/model.py @@ -11,6 +11,7 @@ (OpenVINO, XNNPACK, etc.) for exporting Latent Consistency Models. """ +from enum import Enum import logging from typing import Any, Optional @@ -25,6 +26,12 @@ logger = logging.getLogger(__name__) +class StableDiffusionComponent(Enum): + """Maintain Stable Diffusion model components reliably""" + + TEXT_ENCODER = "text_encoder" + UNET = "unet" + VAE_DECODER = "vae_decoder" class TextEncoderWrapper(torch.nn.Module): """Wrapper for CLIP text encoder that extracts last_hidden_state""" @@ -150,7 +157,7 @@ def get_vae_decoder(self) -> VAEDecoder: raise ValueError("Models not loaded. 
Call load_models() first.") return VAEDecoder(self.vae) - def get_dummy_inputs(self): + def get_dummy_inputs(self) -> dict[StableDiffusionComponent, tuple[Any, ...]]: """ Get dummy inputs for each model component. @@ -187,7 +194,7 @@ def get_dummy_inputs(self): vae_input = torch.randn(1, 4, 64, 64, dtype=self.dtype) return { - "text_encoder": (text_encoder_input,), - "unet": unet_inputs, - "vae_decoder": (vae_input,), + StableDiffusionComponent.TEXT_ENCODER: (text_encoder_input,), + StableDiffusionComponent.UNET: unet_inputs, + StableDiffusionComponent.VAE_DECODER: (vae_input,), } diff --git a/examples/openvino/stable_diffusion/README.md b/examples/openvino/stable_diffusion/README.md index 5bfa365afa6..6afd0c8178b 100644 --- a/examples/openvino/stable_diffusion/README.md +++ b/examples/openvino/stable_diffusion/README.md @@ -32,6 +32,7 @@ python export_lcm.py \ --model_id SimianLuo/LCM_Dreamshaper_v7 \ --output_dir ./lcm_models \ --device CPU \ + --dtype fp16 \ --quantize ``` @@ -43,7 +44,6 @@ This will create three files in `./lcm_models/`: ### Generate Images Run inference with the exported model: -Note: For quantized models, we currently only support running the runtime dtype should be FP32 ```bash python openvino_lcm.py \ diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 223d277eaf0..e4ead860a16 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -23,6 +23,7 @@ ) from executorch.examples.models.stable_diffusion.model import ( # type: ignore[import-untyped] LCMModelLoader, + StableDiffusionComponent, ) from executorch.exir import ExecutorchBackendConfig, to_edge_transform_and_lower from executorch.exir.backend.backend_details import CompileSpec @@ -41,14 +42,9 @@ class LCMOpenVINOExporter: def __init__( self, model_id: str = "SimianLuo/LCM_Dreamshaper_v7", - is_quantization_enabled=False, + is_quantization_enabled: bool = 
False, dtype: torch.dtype = torch.float16, ): - if is_quantization_enabled: - logger.warning( - "Quantization requires float32, overriding dtype from float16 to float32." - ) - dtype = torch.float32 self.is_quantization_enabled = is_quantization_enabled self.model_loader = LCMModelLoader(model_id=model_id, dtype=dtype) @@ -56,74 +52,96 @@ def load_models(self) -> bool: """Load the LCM pipeline and extract components""" return self.model_loader.load_models() - def should_quantize_model(self, sd_model_type): + @staticmethod + def should_quantize_model(sd_model_component: StableDiffusionComponent) -> bool: """ If this is true, then we should quantize activations and weights. Otherwise, only compress the weights. - :param sd_model_type: the type of model in the stable diffusion pipeline such as Unet, text encoder, VAE etc. + :param sd_model_component: the type of model in the stable diffusion pipeline such as Unet, text encoder, VAE etc. """ - return sd_model_type == "unet" + return sd_model_component == StableDiffusionComponent.UNET - def get_ov_quantizer(self, sd_model_type: str) -> Quantizer: + def get_ov_quantizer(self, sd_model_component: StableDiffusionComponent) -> Quantizer: quantization_mode = QuantizationMode.INT8WO_ASYM - if self.should_quantize_model(sd_model_type): + if self.should_quantize_model(sd_model_component): # Only Unet model will have both weights and activations quantized. 
quantization_mode = QuantizationMode.INT8_TRANSFORMER quantizer = OpenVINOQuantizer(mode=quantization_mode) return quantizer + @staticmethod + def _set_pipeline_dtype(pipeline, dtype: torch.dtype) -> None: + """Set core pipeline models to a target dtype.""" + pipeline.text_encoder.to(dtype=dtype) + pipeline.unet.to(dtype=dtype) + pipeline.vae.to(dtype=dtype) + + @staticmethod def get_unet_calibration_dataset( - self, calibration_dataset_size=200, num_inference_steps=4 - ): + pipeline, calibration_dataset_size: int = 200, num_inference_steps: int = 4 + ) -> list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: + """Collect UNet calibration inputs from prompts.""" class UNetWrapper(torch.nn.Module): - def __init__(self, model, config): + def __init__(self, model: torch.nn.Module, config): super().__init__() self.model = model self.config = config - self.captured_args = [] - - def _pick(self, name: str, args, kwargs, idx: int): + self.captured_args: list[ + tuple[torch.Tensor, torch.Tensor, torch.Tensor] + ] = [] + + def _pick_correct_arg_or_kwarg( + self, + name: str, + args, + kwargs, + idx: int, + ): if name in kwargs and kwargs[name] is not None: return kwargs[name] if len(args) > idx: return args[idx] raise KeyError(f"Missing required UNet input: {name}") - def forward(self, *args, **kwargs): - """ - obtain and pass each input individually to ensure the order is maintained - and the right values are being passed according to the expected inputs by - the OpenVINO LCM runner. 
- """ - sample = self._pick("sample", args, kwargs, 0) - timestep = self._pick("timestep", args, kwargs, 1) - encoder_hidden_states = self._pick( + def _process_inputs( + self, *args, **kwargs + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + sample = self._pick_correct_arg_or_kwarg("sample", args, kwargs, 0) + timestep = self._pick_correct_arg_or_kwarg("timestep", args, kwargs, 1) + encoder_hidden_states = self._pick_correct_arg_or_kwarg( "encoder_hidden_states", args, kwargs, 2 ) timestep = ( timestep.unsqueeze(0) - if len(timestep.shape) == 0 and isinstance(timestep, torch.Tensor) + if timestep.dim() == 0 and isinstance(timestep, torch.Tensor) else timestep ) - unet_args = ( + processed_args = ( sample, timestep, encoder_hidden_states, ) + return processed_args + + def forward(self, *args, **kwargs): + """ + obtain and pass each input individually to ensure the order is maintained + and the right values are being passed according to the expected inputs by + the OpenVINO LCM runner. 
+ """ + unet_args = self._process_inputs(*args, **kwargs) self.captured_args.append(unet_args) - return self.model(*unet_args) + return self.model(*args, **kwargs) - pipeline = self.model_loader.pipeline calibration_data = [] dataset = datasets.load_dataset( "google-research-datasets/conceptual_captions", split="train", trust_remote_code=True, ).shuffle(seed=42) - - wrapped_unet = UNetWrapper(pipeline.unet, pipeline.unet.config) original_unet = pipeline.unet + wrapped_unet = UNetWrapper(pipeline.unet, pipeline.unet.config) pipeline.unet = wrapped_unet # Run inference for data collection pbar = tqdm(total=calibration_dataset_size) @@ -143,35 +161,57 @@ def forward(self, *args, **kwargs): pipeline.unet = original_unet return calibration_data - def maybe_quantize_model(self, model, sd_model_type, is_quantization_enabled): - model = ( - model.module() if isinstance(model, torch.export.ExportedProgram) else model - ) - if not is_quantization_enabled: + def maybe_quantize_model( + self, + model: torch.fx.GraphModule, + sd_model_component: StableDiffusionComponent, + is_quantization_enabled: bool, + ) -> torch.fx.GraphModule: + """Apply model quantization when enabled.""" + try: + if not is_quantization_enabled: + return model + quantized_model = model + ov_quantizer = self.get_ov_quantizer(sd_model_component) + if self.should_quantize_model(sd_model_component): + # Quantize activations for the Unet Model. Other models are weights-only quantized. 
+ pipeline = self.model_loader.pipeline + try: + # We need the models in FP32 to run inference for calibration data collection + self._set_pipeline_dtype(pipeline, torch.float32) + calibration_dataset = self.get_unet_calibration_dataset(pipeline) + finally: + self._set_pipeline_dtype(pipeline, self.model_loader.dtype) + + quantized_model = quantize_model( + model, + mode=QuantizationMode.INT8_TRANSFORMER, + calibration_dataset=calibration_dataset, + smooth_quant=True, + ) + else: + quantized_model = nncf.experimental.torch.fx.compress_pt2e( + model, quantizer=ov_quantizer + ) + return quantized_model + except Exception as e: + logger.error(f"Quantization failed for {sd_model_component}: {e}") + import traceback + + traceback.print_exc() return model - quantized_model = model - ov_quantizer = self.get_ov_quantizer(sd_model_type) - if self.should_quantize_model(sd_model_type): - # Quantize activations for the Unet Model. Other models are weights-only quantized. - calibration_dataset = self.get_unet_calibration_dataset() - quantized_model = quantize_model( - model, - mode=QuantizationMode.INT8_TRANSFORMER, - calibration_dataset=calibration_dataset, - smooth_quant=True, - ) - else: - quantized_model = nncf.experimental.torch.fx.compress_pt2e( - model, quantizer=ov_quantizer - ) - return quantized_model def _export_and_maybe_quantize( - self, model, dummy_inputs, sd_model_type, is_quantization_enabled - ): + self, + model: torch.nn.Module, + dummy_inputs, + sd_model_component: StableDiffusionComponent, + is_quantization_enabled: bool, + ) -> torch.export.ExportedProgram: + """Export model and optionally quantize before re-export.""" exported_program = export(model, dummy_inputs) exported_program_module = self.maybe_quantize_model( - exported_program.module(), sd_model_type, is_quantization_enabled + exported_program.module(), sd_model_component, is_quantization_enabled ) # Re-export the quantized torch.fx.GraphModule to ExportedProgram exported_program = 
export(exported_program_module, dummy_inputs) @@ -182,7 +222,7 @@ def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: try: logger.info("Exporting text encoder with OpenVINO backend...") - sd_model_type = "text_encoder" + sd_model_component = StableDiffusionComponent.TEXT_ENCODER # Get wrapped model and dummy inputs text_encoder_wrapper = self.model_loader.get_text_encoder_wrapper() @@ -191,8 +231,8 @@ def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: # Export to ATEN graph exported_program = self._export_and_maybe_quantize( text_encoder_wrapper, - dummy_inputs[sd_model_type], - sd_model_type, + dummy_inputs[sd_model_component], + sd_model_component, self.is_quantization_enabled, ) @@ -237,7 +277,7 @@ def export_unet(self, output_path: str, device: str = "CPU") -> bool: """Export UNet model to PTE file""" try: logger.info("Exporting UNet model with OpenVINO backend...") - sd_model_type = "unet" + sd_model_component = StableDiffusionComponent.UNET # Get wrapped model and dummy inputs unet_wrapper = self.model_loader.get_unet_wrapper() @@ -246,8 +286,8 @@ def export_unet(self, output_path: str, device: str = "CPU") -> bool: # Export to ATEN graph exported_program = self._export_and_maybe_quantize( unet_wrapper, - dummy_inputs[sd_model_type], - sd_model_type, + dummy_inputs[sd_model_component], + sd_model_component, self.is_quantization_enabled, ) @@ -283,7 +323,7 @@ def export_vae_decoder(self, output_path: str, device: str = "CPU") -> bool: """Export VAE decoder to PTE file""" try: logger.info("Exporting VAE decoder with OpenVINO backend...") - sd_model_type = "vae_decoder" + sd_model_component = StableDiffusionComponent.VAE_DECODER # Get wrapped model and dummy inputs vae_decoder = self.model_loader.get_vae_decoder() @@ -292,8 +332,8 @@ def export_vae_decoder(self, output_path: str, device: str = "CPU") -> bool: # Export to ATEN graph exported_program = self._export_and_maybe_quantize( vae_decoder, - 
dummy_inputs[sd_model_type], - sd_model_type, + dummy_inputs[sd_model_component], + sd_model_component, self.is_quantization_enabled, ) diff --git a/examples/openvino/stable_diffusion/requirements.txt b/examples/openvino/stable_diffusion/requirements.txt index 4057c5ace9f..a775c0a4b6b 100644 --- a/examples/openvino/stable_diffusion/requirements.txt +++ b/examples/openvino/stable_diffusion/requirements.txt @@ -1 +1,2 @@ diffusers>=0.29.0 +tqdm From 1b605aa4a5d25bb85ac1c06a6870ff67ae116f70 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Mar 2026 14:16:32 +0400 Subject: [PATCH 05/16] minor comment --- examples/models/stable_diffusion/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/models/stable_diffusion/model.py b/examples/models/stable_diffusion/model.py index 3590656d6d3..88c6ad79eac 100644 --- a/examples/models/stable_diffusion/model.py +++ b/examples/models/stable_diffusion/model.py @@ -27,7 +27,7 @@ logger = logging.getLogger(__name__) class StableDiffusionComponent(Enum): - """Maintain Stable Diffusion model components reliably""" + """Maintain Stable Diffusion model component names reliably""" TEXT_ENCODER = "text_encoder" UNET = "unet" From ab09c86cbda7243a99363dd8dccbb0db98192fbf Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 3 Mar 2026 14:30:15 +0400 Subject: [PATCH 06/16] lint --- examples/models/stable_diffusion/__init__.py | 16 ++++++++++++++-- examples/models/stable_diffusion/model.py | 4 +++- examples/openvino/stable_diffusion/export_lcm.py | 5 ++++- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/examples/models/stable_diffusion/__init__.py b/examples/models/stable_diffusion/__init__.py index eccf0e414af..af97caece9a 100644 --- a/examples/models/stable_diffusion/__init__.py +++ b/examples/models/stable_diffusion/__init__.py @@ -4,6 +4,18 @@ # except in compliance with the License. See the license file found in the # LICENSE file in the root directory of this source tree. 
-from .model import LCMModelLoader, StableDiffusionComponent, TextEncoderWrapper, UNetWrapper, VAEDecoder +from .model import ( + LCMModelLoader, + StableDiffusionComponent, + TextEncoderWrapper, + UNetWrapper, + VAEDecoder, +) -__all__ = ["LCMModelLoader", "StableDiffusionComponent", "TextEncoderWrapper", "UNetWrapper", "VAEDecoder"] +__all__ = [ + "LCMModelLoader", + "StableDiffusionComponent", + "TextEncoderWrapper", + "UNetWrapper", + "VAEDecoder", +] diff --git a/examples/models/stable_diffusion/model.py b/examples/models/stable_diffusion/model.py index 88c6ad79eac..606335778d1 100644 --- a/examples/models/stable_diffusion/model.py +++ b/examples/models/stable_diffusion/model.py @@ -11,8 +11,8 @@ (OpenVINO, XNNPACK, etc.) for exporting Latent Consistency Models. """ -from enum import Enum import logging +from enum import Enum from typing import Any, Optional import torch @@ -26,6 +26,7 @@ logger = logging.getLogger(__name__) + class StableDiffusionComponent(Enum): """Maintain Stable Diffusion model component names reliably""" @@ -33,6 +34,7 @@ class StableDiffusionComponent(Enum): UNET = "unet" VAE_DECODER = "vae_decoder" + class TextEncoderWrapper(torch.nn.Module): """Wrapper for CLIP text encoder that extracts last_hidden_state""" diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index e4ead860a16..6da6d0d3e44 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -61,7 +61,9 @@ def should_quantize_model(sd_model_component: StableDiffusionComponent) -> bool: """ return sd_model_component == StableDiffusionComponent.UNET - def get_ov_quantizer(self, sd_model_component: StableDiffusionComponent) -> Quantizer: + def get_ov_quantizer( + self, sd_model_component: StableDiffusionComponent + ) -> Quantizer: quantization_mode = QuantizationMode.INT8WO_ASYM if self.should_quantize_model(sd_model_component): # Only Unet model will have both 
weights and activations quantized. @@ -82,6 +84,7 @@ def get_unet_calibration_dataset( pipeline, calibration_dataset_size: int = 200, num_inference_steps: int = 4 ) -> list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: """Collect UNet calibration inputs from prompts.""" + class UNetWrapper(torch.nn.Module): def __init__(self, model: torch.nn.Module, config): super().__init__() From a291ed835ba1a5d110e80ecc42db5467d842eed3 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 4 Mar 2026 16:27:24 +0400 Subject: [PATCH 07/16] review changes --- .../openvino/stable_diffusion/export_lcm.py | 116 ++++++++++-------- .../openvino/stable_diffusion/openvino_lcm.py | 2 +- 2 files changed, 66 insertions(+), 52 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 6da6d0d3e44..741cbabc889 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -44,8 +44,14 @@ def __init__( model_id: str = "SimianLuo/LCM_Dreamshaper_v7", is_quantization_enabled: bool = False, dtype: torch.dtype = torch.float16, + calibration_dataset_name: str = "google-research-datasets/conceptual_captions", + calibration_dataset_column: str = "caption", ): + if is_quantization_enabled: + dtype = torch.float32 self.is_quantization_enabled = is_quantization_enabled + self.calibration_dataset_name = calibration_dataset_name + self.calibration_dataset_column = calibration_dataset_column self.model_loader = LCMModelLoader(model_id=model_id, dtype=dtype) def load_models(self) -> bool: @@ -72,16 +78,13 @@ def get_ov_quantizer( quantizer = OpenVINOQuantizer(mode=quantization_mode) return quantizer - @staticmethod - def _set_pipeline_dtype(pipeline, dtype: torch.dtype) -> None: - """Set core pipeline models to a target dtype.""" - pipeline.text_encoder.to(dtype=dtype) - pipeline.unet.to(dtype=dtype) - pipeline.vae.to(dtype=dtype) - @staticmethod def get_unet_calibration_dataset( - 
pipeline, calibration_dataset_size: int = 200, num_inference_steps: int = 4 + pipeline, + dataset_name: str, + dataset_column: str, + calibration_dataset_size: int = 200, + num_inference_steps: int = 4, ) -> list[tuple[torch.Tensor, torch.Tensor, torch.Tensor]]: """Collect UNet calibration inputs from prompts.""" @@ -129,7 +132,7 @@ def _process_inputs( def forward(self, *args, **kwargs): """ - obtain and pass each input individually to ensure the order is maintained + Obtain and pass each input individually to ensure the order is maintained and the right values are being passed according to the expected inputs by the OpenVINO LCM runner. """ @@ -138,18 +141,29 @@ def forward(self, *args, **kwargs): return self.model(*args, **kwargs) calibration_data = [] - dataset = datasets.load_dataset( - "google-research-datasets/conceptual_captions", - split="train", - trust_remote_code=True, - ).shuffle(seed=42) + try: + dataset = datasets.load_dataset( + dataset_name, + split="train", + trust_remote_code=True, + ).shuffle(seed=42) + except Exception as error: + raise RuntimeError( + f"Failed to load calibration dataset '{dataset_name}'" + ) from error original_unet = pipeline.unet wrapped_unet = UNetWrapper(pipeline.unet, pipeline.unet.config) pipeline.unet = wrapped_unet # Run inference for data collection pbar = tqdm(total=calibration_dataset_size) for batch in dataset: - prompt = batch["caption"] + if dataset_column not in batch: + raise RuntimeError( + f"Column '{dataset_column}' was not found in dataset '{dataset_name}'" + ) + prompt = batch[dataset_column] + if not isinstance(prompt, str): + prompt = str(prompt) if len(prompt.split()) > pipeline.tokenizer.model_max_length: continue # Run the pipeline @@ -171,20 +185,19 @@ def maybe_quantize_model( is_quantization_enabled: bool, ) -> torch.fx.GraphModule: """Apply model quantization when enabled.""" + if not is_quantization_enabled: + return model try: - if not is_quantization_enabled: - return model quantized_model = 
model ov_quantizer = self.get_ov_quantizer(sd_model_component) - if self.should_quantize_model(sd_model_component): + if sd_model_component == StableDiffusionComponent.UNET: # Quantize activations for the Unet Model. Other models are weights-only quantized. pipeline = self.model_loader.pipeline - try: - # We need the models in FP32 to run inference for calibration data collection - self._set_pipeline_dtype(pipeline, torch.float32) - calibration_dataset = self.get_unet_calibration_dataset(pipeline) - finally: - self._set_pipeline_dtype(pipeline, self.model_loader.dtype) + calibration_dataset = self.get_unet_calibration_dataset( + pipeline, + self.calibration_dataset_name, + self.calibration_dataset_column, + ) quantized_model = quantize_model( model, @@ -198,11 +211,10 @@ def maybe_quantize_model( ) return quantized_model except Exception as e: - logger.error(f"Quantization failed for {sd_model_component}: {e}") - import traceback - - traceback.print_exc() - return model + logger.error(f"Quantization failed for {sd_model_component.value}: {e}") + raise RuntimeError( + f"Quantization failed for {sd_model_component.value}: {e}" + ) from e def _export_and_maybe_quantize( self, @@ -248,15 +260,6 @@ def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: exported_program, partitioner=[partitioner] ) - # Configure OpenVINO compilation - compile_spec = [CompileSpec("device", device.encode())] - partitioner = OpenvinoPartitioner(compile_spec) - - # Lower to edge dialect and apply OpenVINO backend - edge_manager = to_edge_transform_and_lower( - exported_program, partitioner=[partitioner] - ) - # Convert to ExecuTorch program executorch_program = edge_manager.to_executorch( config=ExecutorchBackendConfig() @@ -414,12 +417,6 @@ def create_argument_parser(): help="HuggingFace model ID for LCM (default: SimianLuo/LCM_Dreamshaper_v7)", ) - parser.add_argument( - "--quantize", - action="store_true", - help="Whether the Models should be quantized.", - ) - 
parser.add_argument( "--output_dir", type=str, @@ -436,9 +433,23 @@ def create_argument_parser(): parser.add_argument( "--dtype", - choices=["fp16", "fp32"], + choices=["fp16", "fp32", "int8"], default="fp16", - help="Model data type (default: fp16)", + help="Model data type. Use int8 to enable PTQ quantization (default: fp16)", + ) + + parser.add_argument( + "--calibration_dataset_name", + type=str, + default="google-research-datasets/conceptual_captions", + help="HuggingFace dataset name used for UNet calibration when dtype=int8", + ) + + parser.add_argument( + "--calibration_dataset_column", + type=str, + default="caption", + help="Dataset column name used as prompt text for UNet calibration", ) parser.add_argument("--verbose", action="store_true", help="Enable verbose logging") @@ -458,18 +469,21 @@ def main() -> int: logger.info("=" * 60) logger.info("LCM Model Export") logger.info(f"Model: {args.model_id}") - logger.info( - f"Device: {args.device} | Dtype: {args.dtype} | Quantize: {args.quantize}" - ) + logger.info(f"Device: {args.device} | Dtype: {args.dtype}") logger.info("=" * 60) # Map dtype string to torch dtype - dtype_map = {"fp16": torch.float16, "fp32": torch.float32} + is_quantization_enabled = args.dtype == "int8" + dtype_map = {"fp16": torch.float16, "fp32": torch.float32, "int8": torch.float32} dtype = dtype_map[args.dtype] # Create exporter and load models exporter = LCMOpenVINOExporter( - args.model_id, is_quantization_enabled=args.quantize, dtype=dtype + args.model_id, + is_quantization_enabled=is_quantization_enabled, + dtype=dtype, + calibration_dataset_name=args.calibration_dataset_name, + calibration_dataset_column=args.calibration_dataset_column, ) if not exporter.load_models(): diff --git a/examples/openvino/stable_diffusion/openvino_lcm.py b/examples/openvino/stable_diffusion/openvino_lcm.py index f9d68a633a3..e85c66809a2 100644 --- a/examples/openvino/stable_diffusion/openvino_lcm.py +++ 
b/examples/openvino/stable_diffusion/openvino_lcm.py @@ -331,7 +331,7 @@ def create_argument_parser(): "--device", choices=["CPU", "GPU"], default="CPU", help="Target device" ) parser.add_argument( - "--dtype", choices=["fp16", "fp32"], default="fp16", help="Model dtype" + "--dtype", choices=["fp16", "fp32", "int8"], default="fp16", help="Model dtype" ) parser.add_argument( "--output_dir", type=str, default="./lcm_outputs", help="Output directory" From b27ef24850481338774e6cba97a28ee4fb6bbb24 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 4 Mar 2026 16:30:27 +0400 Subject: [PATCH 08/16] readme for quantize --- examples/openvino/stable_diffusion/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/openvino/stable_diffusion/README.md b/examples/openvino/stable_diffusion/README.md index 6afd0c8178b..2ab7f4bf9b7 100644 --- a/examples/openvino/stable_diffusion/README.md +++ b/examples/openvino/stable_diffusion/README.md @@ -26,14 +26,13 @@ python export_lcm.py \ --dtype fp16 ``` -To quantize the Unet and compress other models +To quantize the Unet with 8a8w and weights-only 16a8w quantize other models ```bash python export_lcm.py \ --model_id SimianLuo/LCM_Dreamshaper_v7 \ --output_dir ./lcm_models \ --device CPU \ - --dtype fp16 \ - --quantize + --dtype int8 ``` This will create three files in `./lcm_models/`: @@ -44,6 +43,7 @@ This will create three files in `./lcm_models/`: ### Generate Images Run inference with the exported model: +Note: For quantized models, pass `--dtype int8` ```bash python openvino_lcm.py \ From 5463abe37305e8cc22274f15ac43ebfa78937d0d Mon Sep 17 00:00:00 2001 From: anzr299 Date: Wed, 4 Mar 2026 16:33:46 +0400 Subject: [PATCH 09/16] minor fix --- .../openvino/stable_diffusion/export_lcm.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 741cbabc889..3042e850dc1 
100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -141,16 +141,11 @@ def forward(self, *args, **kwargs): return self.model(*args, **kwargs) calibration_data = [] - try: - dataset = datasets.load_dataset( - dataset_name, - split="train", - trust_remote_code=True, - ).shuffle(seed=42) - except Exception as error: - raise RuntimeError( - f"Failed to load calibration dataset '{dataset_name}'" - ) from error + dataset = datasets.load_dataset( + dataset_name, + split="train", + trust_remote_code=True, + ).shuffle(seed=42) original_unet = pipeline.unet wrapped_unet = UNetWrapper(pipeline.unet, pipeline.unet.config) pipeline.unet = wrapped_unet @@ -188,7 +183,6 @@ def maybe_quantize_model( if not is_quantization_enabled: return model try: - quantized_model = model ov_quantizer = self.get_ov_quantizer(sd_model_component) if sd_model_component == StableDiffusionComponent.UNET: # Quantize activations for the Unet Model. Other models are weights-only quantized. 
@@ -212,9 +206,10 @@ def maybe_quantize_model( return quantized_model except Exception as e: logger.error(f"Quantization failed for {sd_model_component.value}: {e}") - raise RuntimeError( - f"Quantization failed for {sd_model_component.value}: {e}" - ) from e + import traceback + + traceback.print_exc() + return model def _export_and_maybe_quantize( self, From d457d76221d4fae2494724678869846068c8b3e5 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Thu, 5 Mar 2026 16:29:23 +0400 Subject: [PATCH 10/16] remove excess --- .../openvino/stable_diffusion/export_lcm.py | 22 +------------------ 1 file changed, 1 insertion(+), 21 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 3042e850dc1..1dbb5bfb850 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -58,26 +58,6 @@ def load_models(self) -> bool: """Load the LCM pipeline and extract components""" return self.model_loader.load_models() - @staticmethod - def should_quantize_model(sd_model_component: StableDiffusionComponent) -> bool: - """ - If this is true, then we should quantize activations and weights. Otherwise, only compress the weights. - - :param sd_model_component: the type of model in the stable diffusion pipeline such as Unet, text encoder, VAE etc. - """ - return sd_model_component == StableDiffusionComponent.UNET - - def get_ov_quantizer( - self, sd_model_component: StableDiffusionComponent - ) -> Quantizer: - quantization_mode = QuantizationMode.INT8WO_ASYM - if self.should_quantize_model(sd_model_component): - # Only Unet model will have both weights and activations quantized. 
- quantization_mode = QuantizationMode.INT8_TRANSFORMER - - quantizer = OpenVINOQuantizer(mode=quantization_mode) - return quantizer - @staticmethod def get_unet_calibration_dataset( pipeline, @@ -183,7 +163,6 @@ def maybe_quantize_model( if not is_quantization_enabled: return model try: - ov_quantizer = self.get_ov_quantizer(sd_model_component) if sd_model_component == StableDiffusionComponent.UNET: # Quantize activations for the Unet Model. Other models are weights-only quantized. pipeline = self.model_loader.pipeline @@ -200,6 +179,7 @@ def maybe_quantize_model( smooth_quant=True, ) else: + ov_quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT8WO_ASYM) quantized_model = nncf.experimental.torch.fx.compress_pt2e( model, quantizer=ov_quantizer ) From e065b1b8c1fb42ba8b04d367467e7622323b98a5 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 10 Mar 2026 20:33:23 +0400 Subject: [PATCH 11/16] review changes --- .../openvino/stable_diffusion/export_lcm.py | 118 ++++++++---------- 1 file changed, 51 insertions(+), 67 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 1dbb5bfb850..f0105c72996 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -28,7 +28,6 @@ from executorch.exir import ExecutorchBackendConfig, to_edge_transform_and_lower from executorch.exir.backend.backend_details import CompileSpec from torch.export import export -from torchao.quantization.pt2e.quantizer.quantizer import Quantizer from tqdm import tqdm # type: ignore[import-untyped] # Configure logging @@ -153,59 +152,32 @@ def forward(self, *args, **kwargs): pipeline.unet = original_unet return calibration_data - def maybe_quantize_model( + def _quantize_unet_model( self, model: torch.fx.GraphModule, - sd_model_component: StableDiffusionComponent, - is_quantization_enabled: bool, ) -> torch.fx.GraphModule: - """Apply model quantization when 
enabled.""" - if not is_quantization_enabled: - return model - try: - if sd_model_component == StableDiffusionComponent.UNET: - # Quantize activations for the Unet Model. Other models are weights-only quantized. - pipeline = self.model_loader.pipeline - calibration_dataset = self.get_unet_calibration_dataset( - pipeline, - self.calibration_dataset_name, - self.calibration_dataset_column, - ) - - quantized_model = quantize_model( - model, - mode=QuantizationMode.INT8_TRANSFORMER, - calibration_dataset=calibration_dataset, - smooth_quant=True, - ) - else: - ov_quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT8WO_ASYM) - quantized_model = nncf.experimental.torch.fx.compress_pt2e( - model, quantizer=ov_quantizer - ) - return quantized_model - except Exception as e: - logger.error(f"Quantization failed for {sd_model_component.value}: {e}") - import traceback - - traceback.print_exc() - return model + """Quantize UNet using activation-aware PTQ.""" + pipeline = self.model_loader.pipeline + calibration_dataset = self.get_unet_calibration_dataset( + pipeline, + self.calibration_dataset_name, + self.calibration_dataset_column, + ) - def _export_and_maybe_quantize( - self, - model: torch.nn.Module, - dummy_inputs, - sd_model_component: StableDiffusionComponent, - is_quantization_enabled: bool, - ) -> torch.export.ExportedProgram: - """Export model and optionally quantize before re-export.""" - exported_program = export(model, dummy_inputs) - exported_program_module = self.maybe_quantize_model( - exported_program.module(), sd_model_component, is_quantization_enabled + return quantize_model( + model, + mode=QuantizationMode.INT8_TRANSFORMER, + calibration_dataset=calibration_dataset, + smooth_quant=True, ) - # Re-export the quantized torch.fx.GraphModule to ExportedProgram - exported_program = export(exported_program_module, dummy_inputs) - return exported_program + + @staticmethod + def _compress_non_unet_model( + model: torch.fx.GraphModule, + ) -> 
torch.fx.GraphModule: + """Apply weights-only compression for non-UNet components.""" + ov_quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT8WO_ASYM) + return nncf.experimental.torch.fx.compress_pt2e(model, quantizer=ov_quantizer) def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: """Export CLIP text encoder to PTE file""" @@ -219,12 +191,16 @@ def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: dummy_inputs = self.model_loader.get_dummy_inputs() # Export to ATEN graph - exported_program = self._export_and_maybe_quantize( - text_encoder_wrapper, - dummy_inputs[sd_model_component], - sd_model_component, - self.is_quantization_enabled, - ) + component_dummy_inputs = dummy_inputs[sd_model_component] + exported_program = export(text_encoder_wrapper, component_dummy_inputs) + + exported_program_module = exported_program.module() + if self.is_quantization_enabled: + exported_program_module = self._compress_non_unet_model( + exported_program_module + ) + # Re-export the transformed torch.fx.GraphModule to ExportedProgram + exported_program = export(exported_program_module, component_dummy_inputs) # Configure OpenVINO compilation compile_spec = [CompileSpec("device", device.encode())] @@ -265,12 +241,16 @@ def export_unet(self, output_path: str, device: str = "CPU") -> bool: dummy_inputs = self.model_loader.get_dummy_inputs() # Export to ATEN graph - exported_program = self._export_and_maybe_quantize( - unet_wrapper, - dummy_inputs[sd_model_component], - sd_model_component, - self.is_quantization_enabled, - ) + component_dummy_inputs = dummy_inputs[sd_model_component] + exported_program = export(unet_wrapper, component_dummy_inputs) + + exported_program_module = exported_program.module() + if self.is_quantization_enabled: + exported_program_module = self._quantize_unet_model( + exported_program_module + ) + # Re-export the transformed torch.fx.GraphModule to ExportedProgram + exported_program = 
export(exported_program_module, component_dummy_inputs) # Configure OpenVINO compilation compile_spec = [CompileSpec("device", device.encode())] @@ -311,12 +291,16 @@ def export_vae_decoder(self, output_path: str, device: str = "CPU") -> bool: dummy_inputs = self.model_loader.get_dummy_inputs() # Export to ATEN graph - exported_program = self._export_and_maybe_quantize( - vae_decoder, - dummy_inputs[sd_model_component], - sd_model_component, - self.is_quantization_enabled, - ) + component_dummy_inputs = dummy_inputs[sd_model_component] + exported_program = export(vae_decoder, component_dummy_inputs) + + exported_program_module = exported_program.module() + if self.is_quantization_enabled: + exported_program_module = self._compress_non_unet_model( + exported_program_module + ) + # Re-export the transformed torch.fx.GraphModule to ExportedProgram + exported_program = export(exported_program_module, component_dummy_inputs) # Configure OpenVINO compilation compile_spec = [CompileSpec("device", device.encode())] From d28c5cbc54840ad7515c769b36570f86ee1a1a23 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Tue, 10 Mar 2026 20:51:15 +0400 Subject: [PATCH 12/16] review changes --- .../openvino/stable_diffusion/export_lcm.py | 51 ++++++++++--------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index f0105c72996..da5853f5789 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -152,10 +152,11 @@ def forward(self, *args, **kwargs): pipeline.unet = original_unet return calibration_data - def _quantize_unet_model( + def quantize_unet_model( self, - model: torch.fx.GraphModule, - ) -> torch.fx.GraphModule: + model: torch.export.ExportedProgram, + dummy_inputs, + ) -> torch.export.ExportedProgram: """Quantize UNet using activation-aware PTQ.""" pipeline = self.model_loader.pipeline calibration_dataset 
= self.get_unet_calibration_dataset( @@ -163,21 +164,29 @@ def _quantize_unet_model( self.calibration_dataset_name, self.calibration_dataset_column, ) - - return quantize_model( + model = model.graph_module() + quantized_model = quantize_model( model, mode=QuantizationMode.INT8_TRANSFORMER, calibration_dataset=calibration_dataset, smooth_quant=True, ) + # Re-export the transformed torch.fx.GraphModule to ExportedProgram + quantized_exported_program = export(quantized_model, dummy_inputs) + return quantized_exported_program @staticmethod - def _compress_non_unet_model( - model: torch.fx.GraphModule, - ) -> torch.fx.GraphModule: + def compress_model( + model: torch.export.ExportedProgram, + dummy_inputs, + ) -> torch.export.ExportedProgram: """Apply weights-only compression for non-UNet components.""" + model = model.graph_module() ov_quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT8WO_ASYM) - return nncf.experimental.torch.fx.compress_pt2e(model, quantizer=ov_quantizer) + quantized_model = nncf.experimental.torch.fx.compress_pt2e(model, quantizer=ov_quantizer) + # Re-export the transformed torch.fx.GraphModule to ExportedProgram + quantized_exported_program = export(quantized_model, dummy_inputs) + return quantized_exported_program def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: """Export CLIP text encoder to PTE file""" @@ -194,13 +203,11 @@ def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: component_dummy_inputs = dummy_inputs[sd_model_component] exported_program = export(text_encoder_wrapper, component_dummy_inputs) - exported_program_module = exported_program.module() if self.is_quantization_enabled: - exported_program_module = self._compress_non_unet_model( - exported_program_module + exported_program = self.compress_model( + exported_program, + component_dummy_inputs ) - # Re-export the transformed torch.fx.GraphModule to ExportedProgram - exported_program = export(exported_program_module, 
component_dummy_inputs) # Configure OpenVINO compilation compile_spec = [CompileSpec("device", device.encode())] @@ -244,13 +251,11 @@ def export_unet(self, output_path: str, device: str = "CPU") -> bool: component_dummy_inputs = dummy_inputs[sd_model_component] exported_program = export(unet_wrapper, component_dummy_inputs) - exported_program_module = exported_program.module() if self.is_quantization_enabled: - exported_program_module = self._quantize_unet_model( - exported_program_module + exported_program = self.quantize_unet_model( + exported_program, + component_dummy_inputs ) - # Re-export the transformed torch.fx.GraphModule to ExportedProgram - exported_program = export(exported_program_module, component_dummy_inputs) # Configure OpenVINO compilation compile_spec = [CompileSpec("device", device.encode())] @@ -294,13 +299,11 @@ def export_vae_decoder(self, output_path: str, device: str = "CPU") -> bool: component_dummy_inputs = dummy_inputs[sd_model_component] exported_program = export(vae_decoder, component_dummy_inputs) - exported_program_module = exported_program.module() if self.is_quantization_enabled: - exported_program_module = self._compress_non_unet_model( - exported_program_module + exported_program = self.compress_model( + exported_program, + component_dummy_inputs ) - # Re-export the transformed torch.fx.GraphModule to ExportedProgram - exported_program = export(exported_program_module, component_dummy_inputs) # Configure OpenVINO compilation compile_spec = [CompileSpec("device", device.encode())] From 6b7eadab6dbe936b725125401cb43ee0c2a0a254 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 13 Mar 2026 16:40:03 +0400 Subject: [PATCH 13/16] review changes --- examples/openvino/stable_diffusion/export_lcm.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index da5853f5789..72c761dd804 100644 --- 
a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -136,9 +136,8 @@ def forward(self, *args, **kwargs): f"Column '{dataset_column}' was not found in dataset '{dataset_name}'" ) prompt = batch[dataset_column] - if not isinstance(prompt, str): - prompt = str(prompt) - if len(prompt.split()) > pipeline.tokenizer.model_max_length: + tokenized = pipeline.tokenizer.encode(prompt) + if len(tokenized["input_ids"]) > pipeline.tokenizer.model_max_length: continue # Run the pipeline pipeline( From 9d7b0d7799346fc68bb0aaf77b151aa76cff4d1a Mon Sep 17 00:00:00 2001 From: anzr299 Date: Fri, 13 Mar 2026 17:11:38 +0400 Subject: [PATCH 14/16] lint --- examples/openvino/stable_diffusion/export_lcm.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 72c761dd804..4802768dd31 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -182,7 +182,9 @@ def compress_model( """Apply weights-only compression for non-UNet components.""" model = model.graph_module() ov_quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT8WO_ASYM) - quantized_model = nncf.experimental.torch.fx.compress_pt2e(model, quantizer=ov_quantizer) + quantized_model = nncf.experimental.torch.fx.compress_pt2e( + model, quantizer=ov_quantizer + ) # Re-export the transformed torch.fx.GraphModule to ExportedProgram quantized_exported_program = export(quantized_model, dummy_inputs) return quantized_exported_program @@ -204,8 +206,7 @@ def export_text_encoder(self, output_path: str, device: str = "CPU") -> bool: if self.is_quantization_enabled: exported_program = self.compress_model( - exported_program, - component_dummy_inputs + exported_program, component_dummy_inputs ) # Configure OpenVINO compilation @@ -252,8 +253,7 @@ def export_unet(self, output_path: str, device: 
str = "CPU") -> bool: if self.is_quantization_enabled: exported_program = self.quantize_unet_model( - exported_program, - component_dummy_inputs + exported_program, component_dummy_inputs ) # Configure OpenVINO compilation @@ -300,8 +300,7 @@ def export_vae_decoder(self, output_path: str, device: str = "CPU") -> bool: if self.is_quantization_enabled: exported_program = self.compress_model( - exported_program, - component_dummy_inputs + exported_program, component_dummy_inputs ) # Configure OpenVINO compilation From d6b89335ff11cf6fa1216bd5716bd4d01cf6c6a7 Mon Sep 17 00:00:00 2001 From: Aamir Nazir Date: Sat, 14 Mar 2026 14:37:45 +0400 Subject: [PATCH 15/16] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- examples/openvino/stable_diffusion/README.md | 2 +- .../openvino/stable_diffusion/export_lcm.py | 41 ++++++++++--------- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/examples/openvino/stable_diffusion/README.md b/examples/openvino/stable_diffusion/README.md index 2ab7f4bf9b7..bd21fb9923d 100644 --- a/examples/openvino/stable_diffusion/README.md +++ b/examples/openvino/stable_diffusion/README.md @@ -26,7 +26,7 @@ python export_lcm.py \ --dtype fp16 ``` -To quantize the Unet with 8a8w and weights-only 16a8w quantize other models +To quantize the UNet with 8-bit activations and 8-bit weights (8a8w) and apply weights-only 16-bit quantization (16a8w) to the remaining components, run: ```bash python export_lcm.py \ --model_id SimianLuo/LCM_Dreamshaper_v7 \ diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 4802768dd31..2f313e3b317 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -123,32 +123,35 @@ def forward(self, *args, **kwargs): dataset = datasets.load_dataset( dataset_name, split="train", - trust_remote_code=True, + 
trust_remote_code=False, ).shuffle(seed=42) original_unet = pipeline.unet wrapped_unet = UNetWrapper(pipeline.unet, pipeline.unet.config) pipeline.unet = wrapped_unet # Run inference for data collection pbar = tqdm(total=calibration_dataset_size) - for batch in dataset: - if dataset_column not in batch: - raise RuntimeError( - f"Column '{dataset_column}' was not found in dataset '{dataset_name}'" + try: + for batch in dataset: + if dataset_column not in batch: + raise RuntimeError( + f"Column '{dataset_column}' was not found in dataset '{dataset_name}'" + ) + prompt = batch[dataset_column] + tokenized = pipeline.tokenizer.encode(prompt) + if len(tokenized["input_ids"]) > pipeline.tokenizer.model_max_length: + continue + # Run the pipeline + pipeline( + prompt, num_inference_steps=num_inference_steps, height=512, width=512 ) - prompt = batch[dataset_column] - tokenized = pipeline.tokenizer.encode(prompt) - if len(tokenized["input_ids"]) > pipeline.tokenizer.model_max_length: - continue - # Run the pipeline - pipeline( - prompt, num_inference_steps=num_inference_steps, height=512, width=512 - ) - calibration_data.extend(wrapped_unet.captured_args) - wrapped_unet.captured_args = [] - pbar.update(len(calibration_data) - pbar.n) - if pbar.n >= calibration_dataset_size: - break - pipeline.unet = original_unet + calibration_data.extend(wrapped_unet.captured_args) + wrapped_unet.captured_args = [] + pbar.update(len(calibration_data) - pbar.n) + if pbar.n >= calibration_dataset_size: + break + finally: + pipeline.unet = original_unet + pbar.close() return calibration_data def quantize_unet_model( From d6db584c1dafcdc81fb15cdae56fcb0ed065ce01 Mon Sep 17 00:00:00 2001 From: anzr299 Date: Sat, 14 Mar 2026 15:47:40 +0400 Subject: [PATCH 16/16] review changes --- examples/openvino/stable_diffusion/README.md | 2 +- .../openvino/stable_diffusion/export_lcm.py | 20 +++++++++++-------- .../stable_diffusion/requirements.txt | 2 ++ 3 files changed, 15 insertions(+), 9 deletions(-) 
diff --git a/examples/openvino/stable_diffusion/README.md b/examples/openvino/stable_diffusion/README.md index bd21fb9923d..3aab6a557cf 100644 --- a/examples/openvino/stable_diffusion/README.md +++ b/examples/openvino/stable_diffusion/README.md @@ -26,7 +26,7 @@ python export_lcm.py \ --dtype fp16 ``` -To quantize the UNet with 8-bit activations and 8-bit weights (8a8w) and apply weights-only 16-bit quantization (16a8w) to the remaining components, run: +To quantize the UNet with 8-bit activations and 8-bit weights (8a8w) and apply weights-only 8-bit quantization to the remaining components, run: ```bash python export_lcm.py \ --model_id SimianLuo/LCM_Dreamshaper_v7 \ diff --git a/examples/openvino/stable_diffusion/export_lcm.py b/examples/openvino/stable_diffusion/export_lcm.py index 2f313e3b317..8325cd8b2ee 100644 --- a/examples/openvino/stable_diffusion/export_lcm.py +++ b/examples/openvino/stable_diffusion/export_lcm.py @@ -99,7 +99,7 @@ def _process_inputs( ) timestep = ( timestep.unsqueeze(0) - if timestep.dim() == 0 and isinstance(timestep, torch.Tensor) + if isinstance(timestep, torch.Tensor) and timestep.dim() == 0 else timestep ) processed_args = ( @@ -123,7 +123,7 @@ def forward(self, *args, **kwargs): dataset = datasets.load_dataset( dataset_name, split="train", - trust_remote_code=False, + streaming=True, ).shuffle(seed=42) original_unet = pipeline.unet wrapped_unet = UNetWrapper(pipeline.unet, pipeline.unet.config) @@ -137,12 +137,16 @@ def forward(self, *args, **kwargs): f"Column '{dataset_column}' was not found in dataset '{dataset_name}'" ) prompt = batch[dataset_column] - tokenized = pipeline.tokenizer.encode(prompt) - if len(tokenized["input_ids"]) > pipeline.tokenizer.model_max_length: + tokenized_prompt = pipeline.tokenizer.encode(prompt) + if len(tokenized_prompt) > pipeline.tokenizer.model_max_length: continue # Run the pipeline pipeline( - prompt, num_inference_steps=num_inference_steps, height=512, width=512 + prompt,
num_inference_steps=num_inference_steps, + height=512, + width=512, + output_type="latent", ) calibration_data.extend(wrapped_unet.captured_args) wrapped_unet.captured_args = [] @@ -166,7 +170,7 @@ def quantize_unet_model( self.calibration_dataset_name, self.calibration_dataset_column, ) - model = model.graph_module() + model = model.module() quantized_model = quantize_model( model, mode=QuantizationMode.INT8_TRANSFORMER, @@ -183,7 +187,7 @@ def compress_model( dummy_inputs, ) -> torch.export.ExportedProgram: """Apply weights-only compression for non-UNet components.""" - model = model.graph_module() + model = model.module() ov_quantizer = OpenVINOQuantizer(mode=QuantizationMode.INT8WO_ASYM) quantized_model = nncf.experimental.torch.fx.compress_pt2e( model, quantizer=ov_quantizer @@ -405,7 +409,7 @@ def create_argument_parser(): "--calibration_dataset_name", type=str, default="google-research-datasets/conceptual_captions", - help="HuggingFace dataset name used for UNet calibration when dtype=int8", + help="HuggingFace dataset used for UNet calibration when INT8 quantization is enabled", ) parser.add_argument( diff --git a/examples/openvino/stable_diffusion/requirements.txt b/examples/openvino/stable_diffusion/requirements.txt index a775c0a4b6b..5654386611d 100644 --- a/examples/openvino/stable_diffusion/requirements.txt +++ b/examples/openvino/stable_diffusion/requirements.txt @@ -1,2 +1,4 @@ diffusers>=0.29.0 tqdm +nncf==3.0.0 +datasets==3.6.0