microsoft · liujij · Mar 20, 2026 · Mar 25, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/sd-legacy-stable-diffusion-v1-5/VitisAI/README.md b/sd-legacy-stable-diffusion-v1-5/VitisAI/README.md
@@ -0,0 +1,86 @@
+## Stable Diffusion Optimization with ONNX Runtime VitisAI EP
+
+This folder contains sample Olive configurations to optimize **Stable Diffusion v1.5** subgraphs for the **VitisAI Execution Provider** on AMD NPU.
+
+## Supported models and configs
+
+| Model ID (Hugging Face) | Config file |
+|:---------------------|:------------|
+| `sd-legacy/stable-diffusion-v1-5` | `config_unet.json` |
+| `sd-legacy/stable-diffusion-v1-5` | `config_vae_decoder.json` |
+| `sd-legacy/stable-diffusion-v1-5` | `config_vae_encoder.json` |
+| `sd-legacy/stable-diffusion-v1-5` | `config_text_encoder.json` |
+| `sd-legacy/stable-diffusion-v1-5` | `config_safety_checker.json` |
+
+## Run the VitisAI workflow
+
+### Create a conda environment and install Olive
+
+```bash
+conda create -n olive python=3.12
+conda activate olive
+```
+
+```bash
+git clone https://github.com/microsoft/Olive.git
+cd Olive
+pip install -e .
+```
+
+### Install VitisAI Stable Diffusion dependencies
+
+```bash
+git clone https://github.com/microsoft/olive-recipes.git
+cd olive-recipes/sd-legacy-stable-diffusion-v1-5/VitisAI
+pip install --force-reinstall -r requirements_vitisai_sd.txt
+```
+
+## Generate optimized subgraphs (optional)
+
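+Run Olive to generate the optimized submodels used by the pipeline (the UNet and VAE decoder are prepared for the NPU; the other submodels run on the CPU).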
+
+> **Note:** This step is optional. Run it on its own when you only need the optimized ONNX submodels and do not need the full end-to-end pipeline.
+
+```bash
+cd olive-recipes/sd-legacy-stable-diffusion-v1-5/VitisAI  # skip if you are already in this folder
+
+olive run --config config_unet.json
+olive run --config config_vae_decoder.json
+olive run --config config_vae_encoder.json
+olive run --config config_text_encoder.json
+olive run --config config_safety_checker.json
+```
+
+Optimized artifacts are written to the `output_dir` defined in each JSON (for example `footprints/unet`, `footprints/vae_decoder`, …).
+
+> **Note:** Exact paths depend on `output_dir` and `cache_dir` in each config file.
+
+### Execution provider and hardware placement
+
+| Component | Execution provider | Compute device |
+|-----------|-------------------|----------------|
+| UNet | VitisAI EP | NPU |
+| VAE decoder | VitisAI EP | NPU |
+| Text encoder | CPU EP | CPU |
+| VAE encoder | CPU EP | CPU |
+| Safety checker | CPU EP | CPU |
+
+The VitisAI Execution Provider is used only for the **UNet** and **VAE decoder**. All other subgraphs run with the **CPU Execution Provider** on the host CPU.
+
+### End-to-end image generation (inference)
+
+```bash
+cd olive-recipes/sd-legacy-stable-diffusion-v1-5/VitisAI  # skip if you are already in this folder
+
+python stable_diffusion.py --provider vitisai --model_id sd-legacy/stable-diffusion-v1-5 --seed 0 --guidance_scale 7.5 --num_inference_steps 20 --prompt "Photo of an ultra realistic sailing ship, dramatic light, pale sunrise, cinematic lighting, battered, low angle, trending on artstation, 4k, hyper realistic, focused, extreme details, unreal engine 5, cinematic, masterpiece, art by studio ghibli, intricate artwork by john william turner."
+```
+
+## Outputs (relative to `VitisAI/`)
+
+| Item | Location |
+|:-----|:---------|
+| Generated images | `result_0.png`, `result_1.png`, … in the **current working directory** (typically `VitisAI/` if you run the command from there) |
+| Full pipeline, unoptimized | `model/unoptimized/<model_id>/` |
+| Full pipeline, optimized (VitisAI) | `model/optimized-vitisai/<model_id>/` |
+
+Slashes in `model_id` become nested folders (e.g. `sd-legacy/stable-diffusion-v1-5` is written to `model/optimized-vitisai/sd-legacy/stable-diffusion-v1-5/`). Per-subgraph `olive run` outputs are written to each config’s `output_dir` / `cache_dir` (e.g. under `footprints/` and `vai_cache/`).
diff --git a/sd-legacy-stable-diffusion-v1-5/VitisAI/assets/result.png b/sd-legacy-stable-diffusion-v1-5/VitisAI/assets/result.png
diff --git a/sd-legacy-stable-diffusion-v1-5/VitisAI/config_safety_checker.json b/sd-legacy-stable-diffusion-v1-5/VitisAI/config_safety_checker.json
@@ -0,0 +1,55 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "sd-legacy/stable-diffusion-v1-5",
+        "model_loader": "safety_checker_load",
+        "script_dir": ".",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "clip_input", "images" ],
+            "output_names": [ "out_images", "has_nsfw_concepts" ],
+            "dynamic_axes": {
+                "clip_input": { "0": "batch", "1": "channels", "2": "height", "3": "width" },
+                "images": { "0": "batch", "1": "height", "2": "width", "3": "channels" }
+            }
+        },
+        "dummy_inputs_func": "safety_checker_conversion_inputs"
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 14 },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "unet",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        }
+    },
+    "log_severity_level": 0,
+    "cache_dir": "vai_cache",
+    "output_dir": "footprints/safety_checker"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/VitisAI/config_text_encoder.json b/sd-legacy-stable-diffusion-v1-5/VitisAI/config_text_encoder.json
@@ -0,0 +1,57 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "sd-legacy/stable-diffusion-v1-5",
+        "model_loader": "text_encoder_load",
+        "script_dir": ".",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "input_ids" ],
+            "output_names": [ "last_hidden_state", "pooler_output" ],
+            "dynamic_axes": { "input_ids": { "0": "batch", "1": "sequence" } }
+        },
+        "dummy_inputs_func": "text_encoder_conversion_inputs"
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 14 },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "clip",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [ "batch", "sequence" ],
+            "dim_value": [ 1, 77 ]
+        }
+    },
+    "log_severity_level": 0,
+    "cache_dir": "vai_cache",
+    "output_dir": "footprints/text_encoder"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/VitisAI/config_unet.json b/sd-legacy-stable-diffusion-v1-5/VitisAI/config_unet.json
@@ -0,0 +1,40 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "sd-legacy/stable-diffusion-v1-5",
+        "model_loader": "unet_load",
+        "script_dir": ".",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "sample", "timestep", "encoder_hidden_states", "return_dict" ],
+            "output_names": [ "out_sample" ],
+            "dynamic_axes": {
+                "sample": {
+                    "0": "batch",
+                    "1": "channels",
+                    "2": "height",
+                    "3": "width"
+                },
+                "encoder_hidden_states": { "0": "batch", "1": "sequence" }
+            }
+        },
+        "dummy_inputs_func": "unet_conversion_inputs"
+    },
+    "passes": {
+        "convert": {
+            "type": "OnnxConversion",
+            "target_opset": 17,
+            "save_as_external_data": true,
+            "all_tensors_to_one_file": true,
+            "external_data_name": "weights.pb"
+        },
+        "model_generation": {
+            "type": "VitisGenerateModelSD",
+            "model_type": "sd-unet",
+            "resolutions": ["512x512"]
+        }
+    },
+    "log_severity_level": 0,
+    "cache_dir": "vai_cache",
+    "output_dir": "footprints/unet"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/VitisAI/config_vae_decoder.json b/sd-legacy-stable-diffusion-v1-5/VitisAI/config_vae_decoder.json
@@ -0,0 +1,33 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "sd-legacy/stable-diffusion-v1-5",
+        "model_loader": "vae_decoder_load",
+        "script_dir": ".",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "latent_sample", "return_dict" ],
+            "output_names": [ "sample" ],
+            "dynamic_axes": {
+                "latent_sample": {
+                    "0": "batch",
+                    "1": "channels",
+                    "2": "height",
+                    "3": "width"
+                }
+            }
+        },
+        "dummy_inputs_func": "vae_decoder_conversion_inputs"
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
+        "model_generation": {
+            "type": "VitisGenerateModelSD",
+            "model_type": "sd15-vae-decoder",
+            "resolutions": ["512x512"]
+        }
+    },
+    "log_severity_level": 0,
+    "cache_dir": "vai_cache",
+    "output_dir": "footprints/vae_decoder"
+}
diff --git a/sd-legacy-stable-diffusion-v1-5/VitisAI/config_vae_encoder.json b/sd-legacy-stable-diffusion-v1-5/VitisAI/config_vae_encoder.json
@@ -0,0 +1,68 @@
+{
+    "input_model": {
+        "type": "PyTorchModel",
+        "model_path": "sd-legacy/stable-diffusion-v1-5",
+        "model_loader": "vae_encoder_load",
+        "script_dir": ".",
+        "model_script": "user_script.py",
+        "io_config": {
+            "input_names": [ "sample", "return_dict" ],
+            "output_names": [ "latent_sample" ],
+            "dynamic_axes": {
+                "sample": { "0": "encoder_batch", "1": "encoder_channels", "2": "encoder_height", "3": "encoder_width" }
+            }
+        },
+        "dummy_inputs_func": "vae_encoder_conversion_inputs"
+    },
+    "passes": {
+        "convert": { "type": "OnnxConversion", "target_opset": 17 },
+        "optimize": {
+            "type": "OrtTransformersOptimization",
+            "model_type": "vae",
+            "opt_level": 0,
+            "float16": true,
+            "use_gpu": true,
+            "keep_io_types": false,
+            "optimization_options": {
+                "enable_gelu": true,
+                "enable_layer_norm": true,
+                "enable_attention": true,
+                "use_multi_head_attention": true,
+                "enable_skip_layer_norm": false,
+                "enable_embed_layer_norm": true,
+                "enable_bias_skip_layer_norm": false,
+                "enable_bias_gelu": true,
+                "enable_gelu_approximation": false,
+                "enable_qordered_matmul": false,
+                "enable_shape_inference": true,
+                "enable_gemm_fast_gelu": false,
+                "enable_nhwc_conv": false,
+                "enable_group_norm": true,
+                "enable_bias_splitgelu": false,
+                "enable_packed_qkv": true,
+                "enable_packed_kv": true,
+                "enable_bias_add": false,
+                "group_norm_channels_last": false
+            },
+            "force_fp32_ops": [ "RandomNormalLike" ],
+            "force_fp16_inputs": { "GroupNorm": [ 0, 1, 2 ] }
+        },
+        "dynamic_shape_to_fixed": {
+            "type": "DynamicToFixedShape",
+            "dim_param": [
+                "encoder_batch",
+                "encoder_channels",
+                "encoder_height",
+                "encoder_width",
+                "Addlatent_sample_dim_0",
+                "Addlatent_sample_dim_1",
+                "Addlatent_sample_dim_2",
+                "Addlatent_sample_dim_3"
+            ],
+            "dim_value": [ 1, 3, 512, 512, 1, 4, 64, 64 ]
+        }
+    },
+    "log_severity_level": 0,
+    "cache_dir": "vai_cache",
+    "output_dir": "footprints/vae_encoder"
+}