diff --git a/mistralai-Ministral-3-3B-Instruct-2512/baseline/README.md b/mistralai-Ministral-3-3B-Instruct-2512/baseline/README.md new file mode 100644 index 00000000..34d1989c --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/baseline/README.md @@ -0,0 +1,37 @@ +# mistralai-Ministral-3-3B-Instruct-2512 - Baseline PyTorch Evaluation + +This folder contains the PyTorch baseline evaluation recipe for `mistralai/Ministral-3-3B-Instruct-2512-BF16` on AI2D. It evaluates the Hugging Face BF16 checkpoint directly in PyTorch, without ONNX conversion or quantization; the evaluator casts the weights to FP16 on CUDA and to FP32 on CPU. + +## Setup + +```bash +pip install -r requirements.txt +``` + +## Run baseline evaluation + +```bash +python mistralai-Ministral-3-3B-Instruct-2512_pytorch_with_eval.py --device cuda --num_samples 100 +``` + +The script defaults to `mistralai/Ministral-3-3B-Instruct-2512-BF16`, because the original FP8 checkpoint (`mistralai/Ministral-3-3B-Instruct-2512`) requires FP8 kernels that are not available in all PyTorch environments. To evaluate a local or alternate checkpoint, pass `--pytorch_model`. + +```bash +python mistralai-Ministral-3-3B-Instruct-2512_pytorch_with_eval.py --pytorch_model /path/to/checkpoint --device cuda --num_samples 100 +``` + +## Baseline results + +Evaluated on AI2D with the default BF16 Hugging Face checkpoint. + +| Model | Device | Precision | Samples | Accuracy | Latency (s/sample) | +|-------|--------|-----------|---------|----------|---------------------| +| `mistralai/Ministral-3-3B-Instruct-2512-BF16` | CUDA | FP16 | 500 | 74.20% (371/500) | 0.18 | + +Command: + +```bash +python mistralai-Ministral-3-3B-Instruct-2512_pytorch_with_eval.py --pytorch_model mistralai/Ministral-3-3B-Instruct-2512-BF16 --device cuda --num_samples 500 +``` + +Latency is per-sample end-to-end inference time and excludes model loading. 
"""PyTorch baseline evaluation for Ministral-3-3B on AI2D."""

from __future__ import annotations

import importlib.util
import sys
from pathlib import Path


DEFAULT_PYTORCH_MODEL = "mistralai/Ministral-3-3B-Instruct-2512-BF16"


def _has_arg(argv: list[str], name: str) -> bool:
    """Return True when option *name* is present in *argv*.

    Both spellings are recognised: the bare flag (``--name`` followed by a
    separate value) and the joined form ``--name=value``.
    """
    return any(arg == name or arg.startswith(f"{name}=") for arg in argv)


def _build_argv(argv: list[str]) -> list[str]:
    """Return a copy of *argv* with the baseline defaults filled in.

    ``--skip_onnx`` is prepended and ``--pytorch_model DEFAULT_PYTORCH_MODEL``
    is appended, each only when the caller did not pass that option.  The
    input list is never mutated.
    """
    forwarded = list(argv)
    if not _has_arg(forwarded, "--skip_onnx"):
        forwarded.insert(0, "--skip_onnx")
    if not _has_arg(forwarded, "--pytorch_model"):
        forwarded.extend(["--pytorch_model", DEFAULT_PYTORCH_MODEL])
    return forwarded


def _load_builtin_eval():
    """Load ``../builtin/eval.py`` (relative to this file) as a module.

    Returns:
        The executed module object, registered in ``sys.modules`` under the
        name ``ministral_builtin_eval``.

    Raises:
        FileNotFoundError: if the evaluator script does not exist.
        RuntimeError: if importlib cannot build a spec/loader for it.
    """
    eval_path = Path(__file__).resolve().parents[1] / "builtin" / "eval.py"
    if not eval_path.is_file():
        # Fail early with the resolved path instead of an opaque loader error.
        raise FileNotFoundError(f"Evaluator script not found: {eval_path}")
    spec = importlib.util.spec_from_file_location("ministral_builtin_eval", eval_path)
    if spec is None or spec.loader is None:
        raise RuntimeError(f"Unable to load evaluator from {eval_path}")
    module = importlib.util.module_from_spec(spec)
    # Register before executing, as the importlib recipe does: code that looks
    # its own module up in sys.modules while being defined needs the entry.
    sys.modules[spec.name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(spec.name, None)
        raise
    return module


def main() -> None:
    """Run the builtin evaluator in PyTorch-only mode.

    Rewrites ``sys.argv`` with the baseline defaults and then calls the
    ``main()`` function of the builtin ``eval.py``.
    """
    sys.argv = [sys.argv[0], *_build_argv(sys.argv[1:])]
    _load_builtin_eval().main()


if __name__ == "__main__":
    main()
a/mistralai-Ministral-3-3B-Instruct-2512/baseline/requirements.txt b/mistralai-Ministral-3-3B-Instruct-2512/baseline/requirements.txt new file mode 100644 index 00000000..e2491876 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/baseline/requirements.txt @@ -0,0 +1,7 @@ +accelerate +datasets +pillow +sentencepiece +torch>=2.10.0,<2.11.0 +torchvision +transformers>=4.57.0 diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore b/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore new file mode 100644 index 00000000..3b614474 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/.gitignore @@ -0,0 +1,9 @@ +# Generated model artifacts +models/ + +# Python bytecode +__pycache__/ +*.pyc + +# Olive cache +.olive-cache/ diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md new file mode 100644 index 00000000..d435a0b7 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/README.md @@ -0,0 +1,242 @@ +# Ministral-3-3B ONNX Runtime GenAI Example + +This example demonstrates how to convert the [Ministral-3-3B-Instruct-2512-BF16](https://huggingface.co/mistralai/Ministral-3-3B-Instruct-2512-BF16) vision-language model to ONNX format using Olive and run inference with ONNX Runtime GenAI. + +Ministral-3-3B is a multimodal vision-language model (VLM) combining a Pixtral vision encoder with a Mistral text decoder using YaRN RoPE for extended context. 
The pipeline exports three sub-models: +- **Vision encoder** and **embedding** via Olive/MobiusBuilder pass (`vision_embedding_export.json`); vision INT8-quantized via Olive +- **Text decoder** via Olive/ModelBuilder (GQA + k_quant_mixed INT4 quantization) + +## Exported Configurations + +| Component | CUDA | CPU | WebGPU | +|-----------|------|-----|--------| +| Text decoder | k_quant_mixed INT4 (`MatMulNBits`) | k_quant_mixed INT4 (`MatMulNBits`) | k_quant_mixed INT4 (`MatMulNBits`) | +| Vision encoder | INT8 RTN, asymmetric block 32 (`MatMulNBits`) | INT8 RTN, symmetric block 128 (`MatMulNBits`) | INT8 RTN, asymmetric block 32 (`MatMulNBits`) | +| Embedding | FP16 | FP32 | FP16 | + +- **CUDA**: k_quant_mixed INT4 text decoder + asymmetric block-32 INT8 vision + FP16 embedding. Optimized for throughput on NVIDIA GPUs. +- **CPU**: k_quant_mixed INT4 text decoder + INT8 vision + FP32 embedding. Uses FP32 for embedding (CPU EP promotes FP16 to FP32). +- **WebGPU**: k_quant_mixed INT4 text decoder + asymmetric block-32 INT8 vision + FP16 embedding. Uses WebGPU provider options in `genai_config.json`. + +## Benchmark Results + +Evaluated on [AI2D](https://allenai.org/data/diagrams) (science diagram multiple-choice QA, 4 options per question). + +| Configuration | Accuracy | Samples | Model Size | Latency (s/sample) | +|---------------|----------|---------|------------|---------------------| +| PyTorch FP16 (CUDA, BF16 checkpoint) | 74.20% (371/500) | 500 | N/A | 0.17 | +| ONNX CUDA INT4 text + INT8 vision (asym block 32, BF16 checkpoint) | 73.00% (365/500) | 500 | 3.86 GB | 0.11 | +| ONNX CPU INT4 text + INT8 vision (sym block 128, BF16 checkpoint) | 72.80% (364/500) | 500 | 4.92 GB | 8.05 | + +The current CUDA ONNX export is faster than PyTorch on this benchmark and is **1.20pp lower** in accuracy. 
A vision-quantization sweep found that asymmetric block-32 INT8 vision preserves the Mobius FP16 vision features best: on the feature probe it matched FP16 vision cosine similarity (`0.864774` vs `0.864780`), and on 500 AI2D samples it reached 365/500 versus 367/500 for the same package with unquantized FP16 vision. + +Export validation status: + +| Target | Package Size | Validation | +|--------|--------------|------------| +| CPU | 4.92 GB | Exported and evaluated; decoder, embedding, and vision ONNX graphs load; no hard-linked external data files | +| CUDA | 3.86 GB | Exported and evaluated; decoder and vision ONNX graphs contain `MatMulNBits`; no hard-linked external data files | +| WebGPU | 3.86 GB | Exported; decoder, embedding, and vision ONNX graphs load; no hard-linked external data files | + +> **Latency Measurement:** Per-sample end-to-end inference time (image in → text out). Includes image preprocessing, tokenization, vision encoding, text generation, and decoding. Answers are short (typically 1-2 tokens for multiple-choice). Excludes model loading (one-time cost). Measured with `time.perf_counter()` averaged over all samples. No warmup run. + +## Prerequisites + +```bash +pip install -r requirements.txt +``` + +Install ONNX Runtime GenAI: + +| Device | Install Command | +|--------|-----------------| +| CPU | `pip install onnxruntime-genai --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple` | +| GPU (CUDA) | `pip install onnxruntime-genai-cuda --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple` | + +## Steps + +### 1. 
Export & Optimize Models + +**CPU (k_quant_mixed INT4 text + INT8 vision + FP32 embedding):** + +```bash +python optimize.py --config-dir cpu_and_mobile --device cpu +``` + +**CUDA (k_quant_mixed INT4 text + INT8 vision + FP16 embedding):** + +```bash +python optimize.py --config-dir cuda --device gpu +``` + +**WebGPU (k_quant_mixed INT4 text + INT8 vision + FP16 embedding):** + +```bash +python optimize.py --config-dir webgpu --device webgpu +``` + +**With a local or alternate checkpoint:** + +```bash +python optimize.py --config-dir cpu_and_mobile --device cpu --model-path /path/to/Ministral-3-3B-dequantized +``` + +This runs: +- **Olive/ModelBuilder** for text decoder (GQA attention, YaRN RoPE, k_quant_mixed INT4) +- **Olive/MobiusBuilder** (`vision_embedding_export.json`) for vision encoder (Pixtral, dynamic H×W, 2D RoPE) and embedding (token + image fusion) +- **Olive INT8 quantization** (`vision.json`) on vision encoder (CPU, CUDA, and WebGPU) + +Then generates `genai_config.json` and `processor_config.json` for the ORT GenAI runtime. + +### 2. Output Structure + +``` +cpu_and_mobile/models/ # or cuda/ or webgpu/models/ +├── decoder/ +│ ├── model.onnx # Text decoder (Mistral + YaRN) +│ └── model.onnx.data +├── vision/ +│ ├── model.onnx # Pixtral vision encoder (INT8) +│ └── model.onnx.data +├── embedding/ +│ ├── model.onnx # Embedding fusion model (FP16/FP32) +│ └── model.onnx.data +├── genai_config.json # Runtime configuration +├── processor_config.json # Pixtral image preprocessing +├── tokenizer.json +└── tokenizer_config.json +``` + +### 3. Run Inference + +```bash +# Text-only +python inference.py --prompt "What is the capital of France?" 
+ +# Image + text +python inference.py --image photo.jpg --prompt "Describe this image" + +# Interactive mode +python inference.py --interactive + +# CUDA model +python inference.py --model_path cuda/models --prompt "Hello" +``` + +Alternatively, use the built-in GenAI multimodal demo: + +```bash +python -m onnxruntime_genai.models.model_mm -m cpu_and_mobile/models --max_length 4096 +``` + +### 4. Evaluate + +Run the AI2D science diagram QA benchmark (see [Benchmark Results](#benchmark-results) for expected accuracy): + +```bash +# ONNX only (CPU) +python eval.py --device cpu --model_path cpu_and_mobile/models + +# ONNX only (CUDA) +python eval.py --device cuda --model_path cuda/models + +# PyTorch baseline (BF16 variant avoids FP8 kernel requirement) +python eval.py --skip_onnx --pytorch_model mistralai/Ministral-3-3B-Instruct-2512-BF16 --device cpu --num_samples 100 + +# Compare ONNX vs PyTorch side-by-side +python eval.py --model_path cuda/models --pytorch_model mistralai/Ministral-3-3B-Instruct-2512-BF16 --num_samples 100 +``` + +> **Note:** This recipe uses the BF16 Hugging Face checkpoint by default. The FP8 checkpoint +> (`Ministral-3-3B-Instruct-2512`) can require CUDA kernels that are not available on all machines. 
+ +## Directory Structure + +``` +mistralai-Ministral-3-3B-Instruct-2512/builtin/ +├── cpu_and_mobile/ +│ ├── text.json # k_quant_mixed INT4 text decoder config (Olive/ModelBuilder) +│ ├── vision_embedding_export.json # Vision+embedding export (Olive/MobiusBuilder, FP32) +│ └── vision.json # INT8 vision quantization (Olive) +├── cuda/ +│ ├── text.json # k_quant_mixed INT4 text decoder config (Olive/ModelBuilder) +│ ├── vision_embedding_export.json # Vision+embedding export (Olive/MobiusBuilder, FP16) +│ └── vision.json # INT8 vision quantization (Olive) +├── webgpu/ +│ ├── text.json # k_quant_mixed INT4 text decoder config (Olive/ModelBuilder) +│ ├── vision_embedding_export.json # Vision+embedding export (Olive/MobiusBuilder, FP16) +│ └── vision.json # INT8 vision quantization (Olive) +├── optimize.py # Export orchestrator (all-Olive pipeline) +├── inference.py # ORT GenAI inference (text + VLM) +├── eval.py # AI2D benchmark evaluation +├── requirements.txt +├── info.yml +└── README.md +``` + +> **Note:** Unlike Qwen VLM recipes (which use Olive for all 3 sub-models end-to-end via PyTorch export), +> Ministral uses the **Olive MobiusBuilder pass** (`vision_embedding_export.json`) for vision and embedding +> ONNX export, then **Olive INT8 quantization** (`vision.json`) for vision. +> Embedding stays FP16 (gpu/webgpu) or FP32 (cpu_and_mobile). + +## Differences from Qwen VLM Recipes + +Qwen VLM recipes export all three sub-models through Olive using JSON configs +(`text.json`, `vision.json`). Each JSON defines a multi-pass +pipeline: PyTorch export → graph surgery → ORT fusion → quantization/FP16. 
+ +This recipe takes a different approach for **vision and embedding**: + +| Component | Qwen | Ministral | Why | +|-----------|------|-----------|-----| +| Text decoder | Olive/ModelBuilder (`text.json`) | Olive/ModelBuilder (`text.json`) | Same — ModelBuilder handles GQA + quantization | +| Vision encoder | Olive: PyTorch export + 5-6 passes | **Olive/MobiusBuilder** (`vision_embedding_export.json`) + Olive INT8 (`vision.json`) | Pixtral's dynamic image dims break `torch.onnx.export` | +| Embedding | Olive: PyTorch export + 5 passes | **Olive/MobiusBuilder** export (FP16/FP32, no quantization) | Olive's GatherBlockQuantized has data format bugs | + +**Why does Ministral use MobiusBuilder instead of standard Olive export?** The Olive +`MobiusBuilder` pass constructs the ONNX graph declaratively (via the +[mobius](https://github.com/onnxruntime/mobius) library internally) rather than +tracing through PyTorch. The resulting models already contain the graph optimizations +that Qwen's Olive passes spend 5-6 steps creating: + +- **Fused operators:** `MultiHeadAttention`, `SkipSimplifiedLayerNormalization`, + `RotaryEmbedding` — already present in MobiusBuilder output (Qwen achieves these via + `OrtTransformersOptimization`) +- **FP16 weights:** all 840M vision params exported as FP16 directly (Qwen + converts from FP32 via `OnnxFloatToFloat16`) +- **Clean graph:** 0 Gemm nodes, 0 redundant Cast chains (Qwen cleans these + via `GemmToMatMulAdd` and `OnnxPeepholeOptimizer`) +- **No PyTorch export artifacts:** no `PackedAttentionToLoopMHA` surgery needed + since MobiusBuilder doesn't go through dynamo + +**What Olive still handles:** `vision.json` applies +`OnnxBlockWiseRtnQuantization` (INT8) to the MobiusBuilder-exported FP16 vision model +for all targets (cuda, webgpu, cpu_and_mobile). 
+ +**Why optimize.py has more lines (~400) than Qwen (~170):** + +| Code section | Lines | Why it can't be JSON-driven | +|---|---|---| +| `export_vision_and_embedding()` | ~55 | Runs Olive/MobiusBuilder then reorganizes flat output into subdirectory layout expected by quantization pass | +| `update_genai_config()` | ~150 | Olive generates decoder config only; VLM 3-model config + transforms-based processor_config has no Olive pass | +| `quantize_vision_and_embedding()` | ~25 | Post-export INT8 on pre-built ONNX (Olive JSON-driven, but needs orchestration + cleanup) | +| `fix_tokenizer()` | ~15 | No Olive tokenizer patching pass | + +The text decoder export (`text.json`) and INT8 quantization (`vision.json`) ARE Olive JSON-driven — identical to Qwen. + +## Known Limitations + +- **CPU vision: language drift on some images.** The quantized vision encoder occasionally produces embeddings that cause the text decoder to respond in the wrong language (e.g., Chinese instead of English). This has been observed on specific test images and is a known artifact of vision quantization. INT8 significantly reduces this compared to INT4. +- **CUDA vision quantization is parameter-sensitive.** Symmetric block-128 INT8 vision caused a large quality drop (56.40% on 500 AI2D samples). The CUDA recipe uses asymmetric block-32 INT8 vision, which recovered the result to 73.00% and closely tracks the unquantized Mobius FP16 vision package. +- **FP8 checkpoint requires special kernels.** This recipe defaults to the `-BF16` checkpoint. The FP8 checkpoint can require CUDA kernels that are not available on all machines. + +## Notes + +- **Multi-image supported.** The runtime supports variable-count multi-image inputs via PixtralImageSizes metadata. Requires onnxruntime-extensions ≥ PR #1050 and models exported with PixtralImageSizes in `processor_config.json`. + +- **CPU pipeline**: MobiusBuilder exports FP16 as an intermediate format. Olive then quantizes vision to INT8. 
For CPU deployment, the cpu_and_mobile JSON configs set `precision: fp32` so embedding outputs float32 natively (CPU EP promotes FP16 to FP32, which causes genai dtype mismatches). The `--dtype` flag is accepted for backward compatibility but does not control export precision — precision is set in the JSON config files. +- **CUDA/WebGPU pipeline**: MobiusBuilder exports FP16 directly for vision/embedding. Olive quantizes vision to asymmetric block-32 INT8. Text decoder uses k_quant_mixed INT4 via ModelBuilder. +- The FP8 Hugging Face checkpoint uses quantized weights. Use the default `-BF16` checkpoint unless you specifically need to test FP8 export behavior. +- The tokenizer uses `TokenizersBackend` class which genai doesn't support. The optimize script fixes this to `LlamaTokenizer`. +- Pixtral vision supports dynamic image sizes (multiples of 28, up to 1540×1540). +- The text decoder includes `llama_4_attn_scale` for long-context attention (>16K tokens). diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json new file mode 100644 index 00000000..3674fc7d --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/text.json @@ -0,0 +1,18 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512-BF16" + }, + "passes": { + "convert": { + "type": "ModelBuilder", + "precision": "int4", + "int4_algo_config": "k_quant_mixed", + "extra_options": { + "filename": "model.onnx" + } + } + }, + "no_artifacts": true, + "output_dir": "cpu_and_mobile/models/decoder" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json new file mode 100644 index 00000000..d0738655 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision.json @@ -0,0 +1,19 @@ +{ + "input_model": { + 
"type": "ONNXModel", + "model_path": "cpu_and_mobile/models/vision_encoder/model.onnx" + }, + "passes": { + "int8": { + "type": "OnnxBlockWiseRtnQuantization", + "bits": 8, + "block_size": 128, + "is_symmetric": true, + "accuracy_level": 4, + "save_as_external_data": true, + "external_data_name": "model.onnx.data" + } + }, + "no_artifacts": true, + "output_dir": "cpu_and_mobile/models/vision" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision_embedding_export.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision_embedding_export.json new file mode 100644 index 00000000..d6b8b3a3 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cpu_and_mobile/vision_embedding_export.json @@ -0,0 +1,16 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512-BF16" + }, + "passes": { + "export": { + "type": "MobiusBuilder", + "precision": "fp32", + "runtime": "none", + "components_to_export": ["vision_encoder", "embedding"] + } + }, + "no_artifacts": true, + "output_dir": "cpu_and_mobile/models" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json new file mode 100644 index 00000000..3c2dda3e --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/text.json @@ -0,0 +1,31 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512-BF16" + }, + "passes": { + "convert": { + "type": "ModelBuilder", + "precision": "int4", + "int4_algo_config": "k_quant_mixed", + "extra_options": { + "filename": "model.onnx" + } + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "cuda/models/decoder" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/vision.json 
b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/vision.json new file mode 100644 index 00000000..8fce7365 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/vision.json @@ -0,0 +1,31 @@ +{ + "input_model": { + "type": "ONNXModel", + "model_path": "cuda/models/vision_encoder/model.onnx" + }, + "passes": { + "int8": { + "type": "OnnxBlockWiseRtnQuantization", + "bits": 8, + "block_size": 32, + "is_symmetric": false, + "save_as_external_data": true, + "external_data_name": "model.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "cuda/models/vision" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/vision_embedding_export.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/vision_embedding_export.json new file mode 100644 index 00000000..d3d48cd6 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/cuda/vision_embedding_export.json @@ -0,0 +1,29 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512-BF16" + }, + "passes": { + "export": { + "type": "MobiusBuilder", + "precision": "fp16", + "runtime": "none", + "components_to_export": ["vision_encoder", "embedding"] + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "CUDAExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "cuda/models" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py new file mode 100644 index 00000000..0b1b5dca --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/eval.py @@ -0,0 +1,648 @@ +"""Evaluate Ministral-3-3B VLM (ONNX) vs PyTorch on AI2D (diagram understanding). 
+ +AI2D is a multiple-choice visual QA benchmark on scientific diagrams. +Each sample has an image, a question, four answer options, and a ground-truth answer. +Accuracy is the fraction of questions answered with the correct option letter. + +Expected precision gaps (ONNX vs PyTorch reference): + CPU + FP32 → expect ~0 pp gap (exact parity) + CUDA + FP16 → expect <2 pp gap (FP16 precision loss) + CPU + INT4 → expect <5 pp gap (quantization loss) + +Usage: + # CPU INT4 model (default) + python eval.py --device cpu --model_path cpu_and_mobile/models + + # CUDA FP16 model + python eval.py --device cuda --model_path cuda/models + + # Compare ONNX vs PyTorch reference + python eval.py --pytorch_model mistralai/Ministral-3-3B-Instruct-2512-BF16 + + # Larger sample + python eval.py --num_samples 200 +""" + +from __future__ import annotations + +import argparse +import io +import json +import os +import re +import tempfile +import time + +NUMBERS = ["1", "2", "3", "4"] + +# Expected accuracy gap thresholds (percentage points) by precision. +# These help users quickly assess whether a model export is healthy. +EXPECTED_GAP_PP = { + "fp32": 0.0, + "fp16": 2.0, + "int4": 5.0, +} + +DEFAULT_SYSTEM_PROMPT = ( + "You are a concise multiple-choice answering assistant. " + "When given a question with numbered options, respond with ONLY a single digit (1, 2, 3, or 4). " + "Do not include any explanation, reasoning, or other text — just the digit." 
def _load_ort_genai():
    """Import ``onnxruntime_genai`` lazily and return the module.

    The import lives inside the function, so the package is only needed when
    a caller actually invokes this helper.
    """
    import onnxruntime_genai as og

    return og


def _patch_ministral3_text_config_mapping():
    """Register the Ministral text config alias for Transformers releases that need it.

    ``AutoConfig.register("ministral3", ...)`` is called with
    ``Ministral3Config`` when Transformers provides it and with
    ``MistralConfig`` otherwise.  A ``ValueError`` whose message contains
    "already" is ignored.  Any other failure is printed as a warning rather
    than raised, so this helper is best-effort.
    """
    try:
        from transformers import AutoConfig

        try:
            from transformers.models.ministral3.configuration_ministral3 import Ministral3Config

            config_cls = Ministral3Config
        except ImportError:
            from transformers.models.mistral.configuration_mistral import MistralConfig

            config_cls = MistralConfig

        try:
            AutoConfig.register("ministral3", config_cls)
        except ValueError as e:
            # Only the "already registered" case is benign.
            if "already" not in str(e).lower():
                raise
    except Exception as e:
        # Deliberately broad: report and continue instead of aborting the run.
        print(f" [WARN] could not register ministral3 config alias: {e}")


def _set_missing_sliding_window(config) -> None:
    """Default ``config.sliding_window`` to ``max_position_embeddings``.

    Nothing happens when *config* is ``None``, when ``sliding_window`` is
    already set to a non-``None`` value, or when the config has no
    ``max_position_embeddings``.
    """
    if config is None:
        return
    if getattr(config, "sliding_window", None) is None:
        max_position_embeddings = getattr(config, "max_position_embeddings", None)
        if max_position_embeddings is not None:
            config.sliding_window = max_position_embeddings


# ---------------------------------------------------------------------------
# Prompt helpers
# ---------------------------------------------------------------------------


def build_messages(question: str, options: list[str], system_prompt: str = "") -> str:
    """Return a JSON-encoded chat messages list (for apply_chat_template).

    Uses string content with [IMG] prefix instead of structured content
    because ORT GenAI's Jinja does not support the sort() filter needed
    by Mistral3's structured-content template path.

    Args:
        question: The question text.
        options: Answer options; they are numbered with ``NUMBERS``, so any
            options beyond ``len(NUMBERS)`` are dropped by ``zip``.
        system_prompt: Optional system message; omitted when empty.

    Returns:
        ``json.dumps`` of a list with an optional system message followed by
        one user message.
    """
    option_text = "\n".join(f"{N}. {o}" for N, o in zip(NUMBERS, options))
    content = (
        f"[IMG]Look at the diagram and answer the multiple-choice question.\n\n"
        f"Question: {question}\n\n"
        f"Options:\n{option_text}\n\n"
        f"Reply with the number only (1, 2, 3, or 4)."
    )
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": content})
    return json.dumps(messages)


def parse_answer(text: str | None) -> str | None:
    """Extract the first 1/2/3/4 digit from a model response.

    A stand-alone digit (word boundaries on both sides) is preferred.  If
    none exists, the first character that is one of ``NUMBERS`` is used.

    Returns:
        The digit as a string, or ``None`` when *text* is ``None``, empty, or
        contains no option digit.
    """
    if not text:
        # None or empty output: nothing to parse.
        return None
    text = text.strip()
    m = re.search(r"\b([1-4])\b", text)
    if m:
        return m.group(1)
    for ch in text:
        if ch in NUMBERS:
            return ch
    return None


def ground_truth_number(sample: dict) -> str | None:
    """Normalise the dataset's answer field to a 1-based number string.

    AI2D stores answer as a 0-based integer index into the options list.
    We map: index 0 → '1', 1 → '2', 2 → '3', 3 → '4'.

    Returns:
        The matching entry of ``NUMBERS``, or ``None`` when the answer is
        missing, not convertible to an integer, or out of range.
    """
    answer = sample.get("answer")
    try:
        idx = int(answer)
    except (ValueError, TypeError, OverflowError):
        # Missing, non-numeric, or infinite float answers carry no label.
        return None
    if 0 <= idx < len(NUMBERS):
        return NUMBERS[idx]
    return None
# ---------------------------------------------------------------------------
# Precision detection
# ---------------------------------------------------------------------------


def detect_onnx_precision(model_path: str) -> str:
    """Infer ONNX model precision from the model directory or genai_config.

    Heuristics (first match wins):
      1. ``decoder/model.onnx`` exists and contains a MatMulNBits node → 'int4'
      2. ``genai_config.json`` exists and its ``model.decoder`` section
         mentions 'int4' → 'int4'
      3. The path contains 'int4' → 'int4'
      4. The path contains 'cpu_and_mobile' → 'int4' (default for CPU target)
      5. Anything else, including 'cuda' / 'fp16' paths → 'fp16'

    Both file probes are best-effort: an unreadable graph or config never
    raises, it only falls through to the next heuristic.
    """
    path_lower = model_path.lower()

    decoder_path = os.path.join(model_path, "decoder", "model.onnx")
    if os.path.exists(decoder_path):
        try:
            import onnx

            # Only the graph structure is inspected, so skip external weights.
            model = onnx.load(decoder_path, load_external_data=False)
            if any(node.op_type == "MatMulNBits" for node in model.graph.node):
                return "int4"
        except Exception:
            pass  # graph unavailable — fall through to metadata/path heuristics

    # Check genai_config for precision hints
    config_path = os.path.join(model_path, "genai_config.json")
    if os.path.exists(config_path):
        try:
            # Explicit UTF-8: do not depend on the platform's locale encoding.
            with open(config_path, encoding="utf-8") as f:
                config = json.load(f)
            decoder_cfg = config.get("model", {}).get("decoder", {})
            if "int4" in json.dumps(decoder_cfg).lower():
                return "int4"
        except (ValueError, OSError, AttributeError):
            # ValueError covers JSONDecodeError and UnicodeDecodeError;
            # AttributeError covers a JSON root/section that is not an object.
            pass  # config unreadable — fall through to path-based heuristic

    if "int4" in path_lower or "cpu_and_mobile" in path_lower:
        return "int4"
    return "fp16"


# ---------------------------------------------------------------------------
# Dataset helpers
# ---------------------------------------------------------------------------


def pil_from_sample(sample: dict) -> Image.Image | None:
    """Return PIL image from a dataset sample regardless of field format.

    Supported ``sample["image"]`` values: a PIL image, raw encoded bytes, or
    a dict carrying ``"bytes"`` and/or ``"path"``.  The result is always
    converted to RGB.  ``None`` is returned when the field is missing or has
    an unsupported shape.
    """
    from PIL import Image

    img = sample.get("image")
    if img is None:
        return None
    if isinstance(img, Image.Image):
        return img.convert("RGB")
    if isinstance(img, (bytes, bytearray)):
        return Image.open(io.BytesIO(img)).convert("RGB")
    if isinstance(img, dict):
        payload = img.get("bytes")
        if payload is not None:
            return Image.open(io.BytesIO(payload)).convert("RGB")
        # Undecoded image dicts may carry only a file path.
        path = img.get("path")
        if path:
            return Image.open(path).convert("RGB")
    return None


def load_ai2d(num_samples: int):
    """Load a deterministic subset of AI2D test samples.

    Takes the first ``num_samples`` rows (capped at the split size) of the
    ``test`` split of ``lmms-lab/ai2d``.
    """
    from datasets import load_dataset

    print(f"Loading AI2D dataset ({num_samples} samples)…")
    ds = load_dataset("lmms-lab/ai2d", split="test")
    ds = ds.select(range(min(num_samples, len(ds))))
    print(f" Loaded {len(ds)} samples.")
    return ds


# ---------------------------------------------------------------------------
# ONNX inference
# ---------------------------------------------------------------------------


def build_onnx_runner(model_path: str):
    """Load ONNX model with ORT GenAI.

    Returns:
        ``(model, processor, tokenizer)`` for the model directory.
    """
    og = _load_ort_genai()
    print(f"\nLoading ONNX model from: {model_path}")
    model = og.Model(model_path)
    processor = model.create_multimodal_processor()
    tokenizer = og.Tokenizer(model)
    print(" ONNX model loaded.")
    return model, processor, tokenizer


def run_onnx(
    model, processor, tokenizer, pil_image: Image.Image, messages_json: str
) -> tuple[str, float]:
    """Run a single inference with the ONNX GenAI model.

    Returns (decoded_text, ttft) where ttft is the time in seconds for
    the first generate_next_token() call (time to first token).

    The image is handed to ORT GenAI through a temporary PNG file, which is
    removed again even when saving or generation fails.
    """
    og = _load_ort_genai()
    # Reserve a path and close the handle, so the save below happens inside
    # the try block and a failed save cannot leak the file.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as f:
        tmp_path = f.name

    try:
        pil_image.save(tmp_path, format="PNG")
        images = og.Images.open(tmp_path)
        prompt = tokenizer.apply_chat_template(
            messages_json, add_generation_prompt=True
        )
        inputs = processor(prompt, images=images)

        params = og.GeneratorParams(model)
        params.set_search_options(max_length=8192, do_sample=False)

        generator = og.Generator(model, params)
        generator.set_inputs(inputs)

        tokens = []
        ttft = 0.0
        while not generator.is_done():
            t_tok = time.perf_counter()
            generator.generate_next_token()
            if not tokens:
                # Only the first decoding step counts as time to first token.
                ttft = time.perf_counter() - t_tok
            tokens.append(generator.get_next_tokens()[0])
        del generator

        return tokenizer.decode(tokens), ttft
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            pass  # cleanup must not mask the real error
str = "auto"): + """Load HuggingFace PyTorch model for comparison. + + Args: + model_id: HuggingFace model ID or local path. + device: 'cpu', 'cuda', or 'auto' (auto-detect). + """ + print(f"\nLoading PyTorch model: {model_id}") + import torch + from transformers import AutoProcessor, Mistral3ForConditionalGeneration + + _patch_ministral3_text_config_mapping() + + if device == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + dtype = torch.float16 if device == "cuda" else torch.float32 + precision_label = "fp16" if device == "cuda" else "fp32" + print(f" Device: {device}, dtype: {dtype} ({precision_label})") + + # Load as bfloat16 first — FP8 weights stay as-is for manual dequant. + pt_model = Mistral3ForConditionalGeneration.from_pretrained( + model_id, torch_dtype=torch.bfloat16, trust_remote_code=True + ) + + # Dequantize FP8 weights if present (Ministral-3-3B ships FP8-only). + # The HF finegrained-fp8 Triton kernel may not be available, so we + # manually dequantize: weight_bf16 * scale_inv_bf16. 
+ fp8_count = 0 + for name, module in pt_model.named_modules(): + if isinstance(module, torch.nn.Linear) and module.weight.dtype == torch.float8_e4m3fn: + scale_inv = getattr(module, "weight_scale_inv", None) + if scale_inv is not None: + dequantized = module.weight.to(torch.bfloat16) + module.weight = torch.nn.Parameter( + dequantized * scale_inv.to(torch.bfloat16).reshape(-1, 1), + requires_grad=False, + ) + fp8_count += 1 + if fp8_count > 0: + print(f" Dequantized {fp8_count} FP8 linear layers to bfloat16") + + _set_missing_sliding_window(getattr(pt_model.config, "text_config", None)) + _set_missing_sliding_window(getattr(pt_model.config, "vision_config", None)) + language_model = getattr(getattr(pt_model, "model", None), "language_model", None) + _set_missing_sliding_window(getattr(language_model, "config", None)) + + # Cast entire model to target dtype and move to device + pt_model = pt_model.to(dtype=dtype, device=device) + pt_proc = AutoProcessor.from_pretrained(model_id, trust_remote_code=True) + print(" PyTorch model loaded.") + return pt_model, pt_proc, device, precision_label + + +def run_pytorch( + pt_model, + pt_proc, + pil_image: Image.Image, + question: str, + options: list[str], + device: str, + system_prompt: str = "", +) -> tuple[str, None]: + """Run a single inference with the HuggingFace PyTorch model. + + Returns (decoded_text, None). TTFT is not measured for PyTorch. + """ + import torch + + option_text = "\n".join(f"{N}. {o}" for N, o in zip(NUMBERS, options)) + content = ( + f"Look at the diagram and answer the multiple-choice question.\n\n" + f"Question: {question}\n\n" + f"Options:\n{option_text}\n\n" + f"Reply with the number only (1, 2, 3, or 4)." 
def calculate_model_size(model_path: str) -> int:
    """Return the total on-disk size, in bytes, of a model file or directory.

    Args:
        model_path: Path to a directory (walked recursively) or to a single
            file.

    Returns:
        Sum of the sizes of all regular files found. A path that does not
        exist yields ``0``. Entries whose size cannot be read (for example
        dangling symlinks or files removed while walking) are skipped rather
        than aborting the whole measurement.
    """
    # A single-file model: os.walk() would yield nothing and report 0 bytes.
    if os.path.isfile(model_path):
        return os.path.getsize(model_path)

    total = 0
    for dirpath, _dirnames, filenames in os.walk(model_path):
        for filename in filenames:
            try:
                total += os.path.getsize(os.path.join(dirpath, filename))
            except OSError:
                # Dangling symlink or file that vanished mid-walk; the size
                # report is informational, so skip instead of crashing.
                continue
    return total
+ + runner_fn must return (text, ttft_or_none) where ttft is seconds for + the first token (None when not measured, e.g. PyTorch). + """ + correct = 0 + skipped = 0 + total = len(dataset) + latencies = [] + ttfts = [] + + print(f"\n{'=' * 60}") + print(f" Evaluating: {label} ({total} samples)") + print(f"{'=' * 60}") + + for i, sample in enumerate(dataset): + gt = ground_truth_number(sample) + if gt is None: + skipped += 1 + continue + + pil_image = pil_from_sample(sample) + if pil_image is None: + skipped += 1 + continue + + question = sample.get("question", "") + options = sample.get("options", []) + if len(options) < 2: + skipped += 1 + continue + + try: + t0 = time.perf_counter() + raw, ttft = runner_fn(pil_image, question, options) + elapsed = time.perf_counter() - t0 + latencies.append(elapsed) + if ttft is not None: + ttfts.append(ttft) + except Exception as e: + print(f" [WARN] sample {i}: {e}") + skipped += 1 + continue + + pred = parse_answer(raw) + hit = pred == gt + + if (i + 1) % 10 == 0 or i == 0: + print( + f" [{i + 1:4d}/{total}] gt={gt} pred={pred} raw={raw.strip()!r:20} " + f"{'✓' if hit else '✗'} running_acc={(correct + (1 if hit else 0)) / (i + 1 - skipped + 1e-9):.3f}" + ) + + if hit: + correct += 1 + + evaluated = total - skipped + accuracy = correct / evaluated if evaluated > 0 else 0.0 + avg_lat = sum(latencies) / len(latencies) if latencies else 0.0 + avg_ttft = sum(ttfts) / len(ttfts) if ttfts else None + + print( + f"\n {label}: {correct}/{evaluated} correct | " + f"accuracy = {accuracy:.4f} ({accuracy * 100:.2f}%)" + ) + print(f" avg latency per sample: {avg_lat:.2f}s | skipped: {skipped}") + if avg_ttft is not None: + print(f" avg TTFT: {avg_ttft * 1000:.1f}ms") + return { + "label": label, + "accuracy": accuracy, + "correct": correct, + "evaluated": evaluated, + "avg_latency_s": avg_lat, + "avg_ttft_ms": avg_ttft * 1000 if avg_ttft is not None else None, + "skipped": skipped, + } + + +# 
def main():
    """CLI entry point: evaluate the ONNX and/or PyTorch model on AI2D.

    The ONNX model is evaluated unless ``--skip_onnx`` is given; the PyTorch
    model is evaluated only when ``--pytorch_model`` is given. When both run,
    an accuracy delta, a latency ratio and a precision-gap assessment are
    printed. ``--device`` is forwarded to the PyTorch runner only.

    Exits with an argparse usage error when neither backend is selected, so
    the dataset is not loaded for a run that would evaluate nothing.
    """
    parser = argparse.ArgumentParser(
        description="Eval ONNX vs PyTorch Ministral-3-3B VLM on AI2D"
    )
    parser.add_argument(
        "--model_path",
        default="cpu_and_mobile/models",
        help="Path to ONNX model dir (default: cpu_and_mobile/models/)",
    )
    parser.add_argument(
        "--pytorch_model",
        default=None,
        help="HuggingFace model ID for PyTorch comparison",
    )
    parser.add_argument(
        "--num_samples",
        type=int,
        default=100,
        help="Number of AI2D test samples to evaluate (default: 100)",
    )
    parser.add_argument(
        "--device",
        choices=["cpu", "cuda", "auto"],
        default="auto",
        help="Device for inference: cpu, cuda, or auto-detect (default: auto)",
    )
    parser.add_argument(
        "--skip_onnx",
        action="store_true",
        help="Skip ONNX evaluation",
    )
    parser.add_argument(
        "--system_prompt",
        default=DEFAULT_SYSTEM_PROMPT,
        help="System prompt to suppress chain-of-thought. Pass empty string to disable.",
    )
    args = parser.parse_args()

    # Fail fast: without this the script would load the dataset and then
    # print an empty summary.
    if args.skip_onnx and not args.pytorch_model:
        parser.error(
            "nothing to evaluate: --skip_onnx was given without --pytorch_model"
        )

    ds = load_ai2d(args.num_samples)
    results = []

    sys_prompt = args.system_prompt
    if sys_prompt:
        print(f"\nSystem prompt: {sys_prompt!r}")
    else:
        print("\nSystem prompt: (none)")

    onnx_precision = None
    model_size_str = "N/A"
    if not args.skip_onnx:
        # Detect ONNX precision from model path
        onnx_precision = detect_onnx_precision(args.model_path)

        # Calculate model size
        model_size_bytes = calculate_model_size(args.model_path)
        model_size_str = format_model_size(model_size_bytes)
        print(f"\nModel size: {model_size_str}")
    else:
        print("\nONNX evaluation: skipped")

    # The rest of this function already tolerates an undetected precision
    # (``onnx_precision or "int4"``, ``if onnx_precision:``); use a label that
    # cannot raise AttributeError on None.
    onnx_tag = (onnx_precision or "unknown").upper()

    # ---- ONNX ----
    if not args.skip_onnx:
        onnx_model, onnx_proc, onnx_tok = build_onnx_runner(args.model_path)

        def onnx_runner(pil_image, question, options):
            msgs = build_messages(question, options, sys_prompt)
            return run_onnx(onnx_model, onnx_proc, onnx_tok, pil_image, msgs)

        onnx_label = f"ONNX ({onnx_tag}) @ {args.model_path}"
        results.append(evaluate(ds, onnx_runner, onnx_label))

    # ---- PyTorch (optional) ----
    pt_precision = None
    if args.pytorch_model:
        pt_model, pt_proc, pt_device, pt_precision = build_pytorch_runner(
            args.pytorch_model, device=args.device
        )

        def pt_runner(pil_image, question, options):
            return run_pytorch(
                pt_model, pt_proc, pil_image, question, options, pt_device, sys_prompt
            )

        pt_label = f"PyTorch ({pt_precision.upper()}) @ {args.pytorch_model}"
        results.append(evaluate(ds, pt_runner, pt_label))

    # ---- Summary ----
    print(f"\n{'=' * 60}")
    print(" EVALUATION SUMMARY")
    print(f"{'=' * 60}")
    print(" Model : Ministral-3-3B-Instruct-2512 (VLM)")
    print(" Dataset : AI2D (science diagram QA, multiple choice)")
    # Report what was actually loaded; load_ai2d caps the request at the
    # dataset size, so this can be smaller than --num_samples.
    print(f" Samples : {len(ds)}")
    if not args.skip_onnx:
        print(f" Model size : {model_size_str}")
    if onnx_precision:
        print(f" ONNX prec : {onnx_precision.upper()}")
    if pt_precision:
        print(f" PyTorch prec: {pt_precision.upper()}")
    print(
        f" System prompt: "
        f"{'(none)' if not sys_prompt else sys_prompt[:80] + ('...' if len(sys_prompt) > 80 else '')}"
    )
    print()
    for r in results:
        print(f" {r['label']}")
        print(
            f" Accuracy : {r['accuracy'] * 100:.2f}% ({r['correct']}/{r['evaluated']})"
        )
        print(f" Avg lat : {r['avg_latency_s']:.2f}s/sample")
        if r.get("avg_ttft_ms") is not None:
            print(f" Avg TTFT : {r['avg_ttft_ms']:.1f}ms")
        print()

    # Two results means both backends ran: results[0] is ONNX, results[1] is
    # PyTorch (they are appended in that order above).
    if len(results) == 2:
        delta = results[0]["accuracy"] - results[1]["accuracy"]
        abs_delta = abs(delta) * 100
        print(f" Accuracy delta (ONNX - PyTorch): {delta * 100:+.2f} pp")
        print(
            f" Speedup (PyTorch lat / ONNX lat): "
            f"{results[1]['avg_latency_s'] / max(results[0]['avg_latency_s'], 1e-9):.2f}x"
        )

        # Precision gap assessment
        expected_gap = EXPECTED_GAP_PP.get(onnx_precision or "int4", 5.0)
        print()
        print(f" Expected gap for {onnx_tag}: <{expected_gap:.0f} pp")
        if abs_delta <= expected_gap:
            print(f" ✓ PASS — {abs_delta:.2f} pp gap is within expected range")
        else:
            print(
                f" ✗ WARN — {abs_delta:.2f} pp gap exceeds expected {expected_gap:.0f} pp for {onnx_tag}"
            )
            print(
                " This may indicate a quality regression in the export pipeline."
            )
def main():
    """CLI entry point for single-shot or interactive ONNX GenAI inference.

    Arguments are validated before the model is loaded, so calling the script
    without ``--prompt`` or ``--interactive`` prints the usage hint
    immediately instead of first loading the model.
    """
    parser = argparse.ArgumentParser(
        description="ONNX Runtime GenAI inference for Ministral-3-3B"
    )
    parser.add_argument(
        "--model_path",
        type=str,
        default="cpu_and_mobile/models",
        help="Path to model directory containing genai_config.json and ONNX models",
    )
    parser.add_argument("--image", type=str, default=None, help="Path to image file")
    parser.add_argument("--prompt", type=str, default=None, help="Text prompt")
    parser.add_argument(
        "--max_length", type=int, default=4096, help="Maximum total tokens"
    )
    parser.add_argument(
        "--interactive", action="store_true", help="Run in interactive mode"
    )
    args = parser.parse_args()

    # Nothing to do: report it before paying for the model load.
    if not args.interactive and not args.prompt:
        print("Please provide --prompt or --interactive")
        parser.print_help()
        return

    print(f"Loading model from: {args.model_path}")
    model = og.Model(args.model_path)
    processor = model.create_multimodal_processor()
    tokenizer = og.Tokenizer(model)
    tokenizer_stream = processor.create_stream()

    # --interactive takes precedence over --prompt, as before.
    if args.interactive:
        interactive_mode(model, processor, tokenizer, tokenizer_stream, args)
    else:
        generate_response(
            model,
            processor,
            tokenizer,
            tokenizer_stream,
            args.prompt,
            args.image,
            args.max_length,
        )
def interactive_mode(model, processor, tokenizer, tokenizer_stream, args):
    """Run a read-generate loop until the user quits.

    Each input line is one prompt. A line starting with ``image:`` is parsed
    as ``image:<path> <prompt>``; when no prompt follows the path, the default
    prompt "Describe this image" is used. The loop ends on ``quit``/``exit``
    (case-insensitive), on end-of-input, or on Ctrl-C at the prompt. Ctrl-C
    during generation aborts that response and returns to the prompt.

    Args:
        model, processor, tokenizer, tokenizer_stream: Forwarded unchanged to
            ``generate_response``.
        args: Parsed CLI namespace; only ``args.max_length`` is read.
    """
    print("\n" + "=" * 50)
    print("Interactive Mode - Enter 'quit' to stop")
    print("To include an image: image:/path/to/image.jpg your prompt")
    print("=" * 50 + "\n")

    while True:
        try:
            user_input = input("You: ").strip()
        except EOFError:
            break
        except KeyboardInterrupt:
            # Ctrl-C at the prompt is a normal way to leave a REPL; finish the
            # interrupted line and exit cleanly instead of dumping a traceback.
            print()
            break

        if user_input.lower() in ("quit", "exit"):
            break
        if not user_input:
            continue

        image_path = None
        prompt = user_input
        if user_input.startswith("image:"):
            # Split on any run of whitespace so extra spaces between the path
            # and the prompt do not end up inside the prompt text.
            parts = user_input.split(maxsplit=1)
            image_path = parts[0][len("image:"):] or None
            prompt = parts[1] if len(parts) > 1 else "Describe this image"

        try:
            generate_response(
                model,
                processor,
                tokenizer,
                tokenizer_stream,
                prompt,
                image_path,
                args.max_length,
            )
        except KeyboardInterrupt:
            # Abort only the current response and keep the session alive.
            print("\n[Generation interrupted]")
        except Exception as e:
            # Best-effort REPL: report the failure and keep accepting input.
            print(f"Error: {e}")

        print("-" * 50 + "\n")

    print("Goodbye!")
+ +Usage: + python optimize.py --config-dir cuda --device gpu + python optimize.py --config-dir cpu_and_mobile --device cpu + python optimize.py --config-dir webgpu --device webgpu + python optimize.py --config-dir cuda --device gpu --skip-export + python optimize.py --config-dir cpu_and_mobile --device cpu --model-path /local/dequantized/checkpoint +""" + +import argparse +import json +import logging +import os +import shutil +from pathlib import Path + +logging.getLogger("onnxscript").setLevel(logging.WARNING) +logging.getLogger("onnx_ir").setLevel(logging.WARNING) + +MODELS_DIR = "models" +MODEL_NAME = "mistralai/Ministral-3-3B-Instruct-2512-BF16" + +# Lazy-loaded HuggingFace config (avoids import-time network access) +_HF_CONFIG = None + + +def _get_hf_config(): + """Load and cache the HuggingFace model config. + + Always loads from MODEL_NAME rather than + --model-path, because the config values (image_token_id, patch_size, etc.) + are architecture constants that don't change between checkpoints. 
def _break_hardlink(path: Path):
    """Give ``path`` its own inode if it currently shares one via hard links.

    Files that do not exist, or that have a link count of one or less, are
    left untouched. Otherwise the content is copied (with metadata) to a
    hidden sibling named ``.<name>.copying`` and atomically swapped into
    place, so other links to the old inode keep their data.
    """
    shares_inode = path.exists() and path.stat().st_nlink > 1
    if shares_inode:
        staging = path.with_name(f".{path.name}.copying")
        # A leftover staging file from an interrupted run must not survive.
        if staging.exists():
            staging.unlink()
        shutil.copy2(path, staging)
        os.replace(staging, path)
        print(f" [Cleanup] Broke hard link for {path}")
def export_vision_and_embedding(config_dir: str, models_dir: str, model_path: str = MODEL_NAME):
    """Export vision encoder and embedding using the Olive MobiusBuilder pass.

    Runs ``vision_embedding_export.json`` from ``config_dir`` with its
    ``output_dir`` overridden to ``models_dir`` (and its input model path
    overridden when ``model_path`` differs from ``MODEL_NAME``), then
    reorganizes the flat output into:

        vision_encoder/model.onnx — input for the later vision quantization step
        embedding/model.onnx      — used as exported, not quantized

    Precision is controlled by the JSON config, not by this function.

    Args:
        config_dir: Directory containing ``vision_embedding_export.json``.
        models_dir: Directory that receives the exported components.
        model_path: HuggingFace model ID or local checkpoint path.

    Raises:
        FileNotFoundError: If the export config does not exist.
        RuntimeError: If, after the run, either component's ``model.onnx`` is
            missing. This mirrors the check in ``export_text_decoder``; Olive
            can return without raising when a pass fails.
    """
    try:
        from olive import run
    except ImportError:
        from olive.workflows import run

    config_path = Path(config_dir) / "vision_embedding_export.json"
    if not config_path.exists():
        raise FileNotFoundError(f"Vision/embedding export config not found: {config_path}")

    with open(config_path) as f:
        config = json.load(f)

    # Olive writes output model to output_dir. Override to models_dir so it lands
    # alongside decoder/ in the same parent directory.
    config["output_dir"] = models_dir
    if model_path != MODEL_NAME:
        config["input_model"]["model_path"] = model_path

    print(f" [Olive] Exporting vision encoder and embedding from {config_path}...")
    run(config)

    # Olive's CompositeModelHandler writes flat ONNX files to output_dir:
    #   models_dir/vision_encoder.onnx + models_dir/vision_encoder.onnx.data
    #   models_dir/embedding.onnx + models_dir/embedding.onnx.data
    #
    # Reorganize into the subdirectory layout expected by quantize_vision_and_embedding:
    #   models_dir/vision_encoder/model.onnx + {component}.onnx.data
    #   models_dir/embedding/model.onnx + {component}.onnx.data
    #
    # The data file keeps its original name (e.g. vision_encoder.onnx.data) so that
    # the relative external_data reference baked into model.onnx remains valid.
    missing = []
    for component in ("vision_encoder", "embedding"):
        dst_dir = Path(models_dir) / component
        src_onnx = Path(models_dir) / f"{component}.onnx"
        if src_onnx.exists():
            dst_dir.mkdir(exist_ok=True)
            shutil.move(str(src_onnx), str(dst_dir / "model.onnx"))
            src_data = Path(models_dir) / f"{component}.onnx.data"
            if src_data.exists():
                # Keep original filename — model.onnx references it by this relative path
                shutil.move(str(src_data), str(dst_dir / f"{component}.onnx.data"))
        if not (dst_dir / "model.onnx").exists():
            missing.append(str(dst_dir / "model.onnx"))

    # Surface a silent export failure here rather than as a confusing error in
    # a later step. Raise before cleanup so leftovers remain for debugging.
    if missing:
        raise RuntimeError(
            "Vision/embedding export did not produce expected model(s): "
            + ", ".join(missing)
        )

    _remove_unused_root_artifacts(models_dir)
    _break_external_data_hardlinks(models_dir)
+ + Loads vision.json as a dict and overrides model_path and output_dir. + Vision encoder is sourced from vision_encoder/ (MobiusBuilder output) + and quantized output is written to vision/ (the name ort-genai expects). + + Embedding stays FP16 (cuda/webgpu) or FP32 (cpu_and_mobile) — no quantization needed, no embedding.json. + """ + try: + from olive import run + except ImportError: + from olive.workflows import run + + for component in ("vision", "embedding"): + config_path = Path(config_dir) / f"{component}.json" + if not config_path.exists(): + continue + + # MobiusBuilder outputs vision_encoder/ but ort-genai expects vision/. + # Source from vision_encoder/ and write quantized output to vision/. + source_dir = "vision_encoder" if component == "vision" else component + component_onnx = os.path.join(models_dir, source_dir, "model.onnx") + if not os.path.exists(component_onnx): + print( + f" [WARN] {component_onnx} not found, skipping {component} quantization" + ) + continue + + # Load config as dict and override paths to target models_dir directly + with open(config_path) as f: + config = json.load(f) + config["input_model"]["model_path"] = component_onnx + config["output_dir"] = os.path.join(models_dir, component) + + print(f" [Olive] Quantizing {component} from {config_path}...") + run(config) + + # Olive catches pass failures internally and returns without raising. + # Guard _strip_unused_initializers so a silent quantization failure + # doesn't propagate as a confusing FileNotFoundError. + output_onnx = os.path.join(models_dir, component, "model.onnx") + if not os.path.exists(output_onnx): + print(f" [WARN] Olive produced no output for {component} — quantization failed") + continue + _strip_unused_initializers(output_onnx) + + # Clean up intermediate vision_encoder/ only if vision quantization succeeded. + # If quantization failed, preserve the intermediate for debugging. 
def _strip_unused_initializers(onnx_path: str):
    """Remove unused initializers and re-save to shrink the external data file.

    An initializer is kept when its name is referenced by any node input
    (including nodes inside subgraphs carried by ``If``/``Loop``/``Scan``
    attributes), or is a graph input or graph output. Everything else is
    dropped and the model is re-saved with a single external data file named
    ``model.onnx.data`` next to ``onnx_path``.

    Does nothing when ``onnx_path`` does not exist or when no initializer is
    unused.

    Args:
        onnx_path: Path to the ``.onnx`` file to rewrite in place.
    """
    if not os.path.exists(onnx_path):
        return

    import onnx
    from onnx.external_data_helper import convert_model_to_external_data

    model = onnx.load(onnx_path)

    def _collect_referenced(graph, names):
        """Add every value name that ``graph`` (or a nested subgraph) references."""
        for node in graph.node:
            names.update(node.input)
            for attr in node.attribute:
                # Control-flow bodies may read top-level initializers from the
                # outer scope, so they must be scanned as well.
                if attr.type == onnx.AttributeProto.GRAPH:
                    _collect_referenced(attr.g, names)
                elif attr.type == onnx.AttributeProto.GRAPHS:
                    for subgraph in attr.graphs:
                        _collect_referenced(subgraph, names)
        names.update(value.name for value in graph.input)
        # An initializer can be wired straight to a graph output without
        # passing through any node.
        names.update(value.name for value in graph.output)

    used_names = set()
    _collect_referenced(model.graph, used_names)

    before = len(model.graph.initializer)
    new_inits = [init for init in model.graph.initializer if init.name in used_names]
    removed = before - len(new_inits)

    if removed == 0:
        return

    del model.graph.initializer[:]
    model.graph.initializer.extend(new_inits)

    # Clear stale external-data references so the tensors are re-externalized
    # from scratch by convert_model_to_external_data below.
    for init in model.graph.initializer:
        del init.external_data[:]

    data_name = "model.onnx.data"
    data_path = os.path.join(os.path.dirname(onnx_path), data_name)
    # Remove any existing data file first so the re-save starts from a clean file.
    if os.path.exists(data_path):
        os.remove(data_path)

    convert_model_to_external_data(
        model, all_tensors_to_one_file=True, location=data_name, size_threshold=1024
    )
    onnx.save(model, onnx_path)

    data_mb = os.path.getsize(data_path) / 1e6 if os.path.exists(data_path) else 0
    print(f" [Cleanup] Stripped {removed} unused initializers → {data_mb:.0f} MB")
def update_genai_config(output_dir: str = MODELS_DIR, device: str = "cpu") -> None:
    """Patch genai_config.json with embedding/vision sections and processor_config.

    Derives model-specific values from the HuggingFace config (lazily loaded)
    to avoid hardcoded constants drifting from the actual checkpoint.

    Args:
        output_dir: Directory holding ``genai_config.json``; also receives the
            generated ``processor_config.json``.
        device: ``"gpu"`` selects CUDA provider options, ``"webgpu"`` selects
            WebGPU provider options, anything else leaves the provider list
            empty.

    Side effects:
        Rewrites ``genai_config.json`` in place, writes
        ``processor_config.json``, and calls ``_remove_unused_root_artifacts``
        on ``output_dir``.
    """
    config_path = Path(output_dir) / "genai_config.json"

    with open(config_path) as f:
        config = json.load(f)

    if device == "gpu":
        # CUDA graph capture is unsupported for VLMs with dynamic image sizes.
        # Disable for all models (matches Qwen VLM recipe convention).
        provider_options = [
            {
                "cuda": {
                    "enable_cuda_graph": "0",
                    "enable_skip_layer_norm_strict_mode": "1",
                }
            }
        ]
        # Same list object is reused for the vision/embedding sessions.
        vision_provider_options = provider_options
    elif device == "webgpu":
        provider_options = [{"webgpu": {}}]
        vision_provider_options = provider_options
    else:
        provider_options = []
        vision_provider_options = []

    session_options = {
        "log_id": "onnxruntime-genai",
        "provider_options": provider_options,
    }
    vision_session_options = {
        "log_id": "onnxruntime-genai",
        "provider_options": vision_provider_options,
    }

    # Point the decoder section at the decoder/ subdirectory layout.
    config["model"]["decoder"]["session_options"] = session_options
    config["model"]["decoder"]["filename"] = "decoder/model.onnx"

    # Sync position_ids with what the decoder ONNX model actually supports
    decoder_onnx = Path(output_dir) / "decoder" / "model.onnx"
    if decoder_onnx.exists():
        import onnx

        # Only the graph signature is needed, so skip loading external weights.
        decoder_model = onnx.load(str(decoder_onnx), load_external_data=False)
        onnx_input_names = {inp.name for inp in decoder_model.graph.input}
        if "position_ids" in onnx_input_names:
            config["model"]["decoder"].setdefault("inputs", {})["position_ids"] = (
                "position_ids"
            )
        else:
            config["model"]["decoder"].get("inputs", {}).pop("position_ids", None)

    # The embedding session shares the vision session options.
    config["model"]["embedding"] = {
        "filename": "embedding/model.onnx",
        "inputs": {"input_ids": "input_ids", "image_features": "image_features"},
        "outputs": {"inputs_embeds": "inputs_embeds"},
        "session_options": vision_session_options,
    }

    # Vision config — values derived from HF config to stay in sync with checkpoint
    # NOTE(review): _get_hf_config() always loads MODEL_NAME, not a --model-path override.
    hf_config = _get_hf_config()
    config["model"]["vision"] = {
        "filename": "vision/model.onnx",
        "config_filename": "processor_config.json",
        "spatial_merge_size": hf_config.spatial_merge_size,
        "patch_size": hf_config.vision_config.patch_size,
        "inputs": {"pixel_values": "pixel_values"},
        "outputs": {"image_features": "image_features"},
        "session_options": vision_session_options,
    }

    # Add VLM-specific fields not generated by ModelBuilder.
    # Don't override context_length or max_length — PR #2077's ModelBuilder
    # sets these correctly (context_length=262144, max_length=32768).
    config["model"]["image_token_id"] = hf_config.image_token_index

    # Override search defaults for VLM: greedy decoding, no KV sharing
    # Assumes the loaded config already has a "search" section — TODO confirm.
    config["search"]["top_k"] = 1
    config["search"]["past_present_share_buffer"] = False

    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)
    print(f" Updated {config_path}")

    # Transforms-based processor config (matches ORT GenAI's image preprocessor format)
    # NOTE(review): the resize bounds and normalization statistics below are
    # hard-coded here rather than read from hf_config — confirm they match the
    # checkpoint's preprocessor settings.
    processor_config = {
        "processor": {
            "name": "pixtral_image_processor",
            "transforms": [
                {
                    "operation": {
                        "name": "decode_image",
                        "type": "DecodeImage",
                        "attrs": {"color_space": "RGB"},
                    }
                },
                {
                    "operation": {
                        "name": "convert_to_rgb",
                        "type": "ConvertRGB",
                    }
                },
                {
                    "operation": {
                        "name": "resize",
                        "type": "Resize",
                        "attrs": {
                            "height": 1540,
                            "width": 1540,
                            "smart_resize": 1,
                            "min_pixels": 784,
                            "max_pixels": 2371600,
                            "patch_size": hf_config.vision_config.patch_size,
                            "merge_size": hf_config.spatial_merge_size,
                        },
                    }
                },
                {
                    "operation": {
                        "name": "rescale",
                        "type": "Rescale",
                        "attrs": {"rescale_factor": 0.00392156862745098},
                    }
                },
                {
                    "operation": {
                        "name": "normalize",
                        "type": "Normalize",
                        "attrs": {
                            "mean": [0.48145466, 0.4578275, 0.40821073],
                            "std": [0.26862954, 0.26130258, 0.27577711],
                        },
                    }
                },
                {
                    "operation": {
                        "name": "permute",
                        "type": "Permute3D",
                        "attrs": {"dims": [2, 0, 1]},
                    }
                },
                {
                    "operation": {
                        "name": "pixtral_image_sizes",
                        "type": "PixtralImageSizes",
                    }
                },
            ],
        }
    }

    processor_path = Path(output_dir) / "processor_config.json"
    with open(processor_path, "w") as f:
        json.dump(processor_config, f, indent=2)
    print(f" Created {processor_path}")
    # Drop stray root-level export leftovers (decoder.onnx, decoder.onnx.data, model_config.json).
    _remove_unused_root_artifacts(output_dir)
def main():
    """CLI entry point: export the sub-models, then generate runtime configs.

    Unless ``--skip-export`` is given, runs ``export_models``. It then always
    patches ``genai_config.json`` / writes ``processor_config.json`` and fixes
    the tokenizer config in the models directory. The models directory
    defaults to ``<config-dir>/models`` when ``--models-dir`` is not supplied.
    """
    parser = argparse.ArgumentParser(description="Optimize Ministral-3-3B ONNX models")
    parser.add_argument("--device", choices=["gpu", "cpu", "webgpu"], default="cpu")
    parser.add_argument("--config-dir", default="cpu_and_mobile")
    parser.add_argument("--skip-export", action="store_true")
    parser.add_argument("--models-dir", default=None)
    parser.add_argument(
        "--model-path",
        default=MODEL_NAME,
        help="HuggingFace model ID or local path to dequantized checkpoint",
    )
    parser.add_argument(
        "--dtype",
        default="f16",
        choices=["f16", "f32", "bf16"],
        # The previous help text claimed this selected the text decoder's
        # precision, but export_models() treats it as a deprecated no-op.
        help="Deprecated; accepted for backward compatibility and has no effect. "
        "Export precision is set in the JSON configs in the config directory "
        "(vision_embedding_export.json: FP16 for cuda/webgpu, FP32 for "
        "cpu_and_mobile). (default: f16)",
    )
    args = parser.parse_args()

    models_dir = args.models_dir or str(Path(args.config_dir) / MODELS_DIR)

    if not args.skip_export:
        export_models(args.config_dir, args.model_path, args.dtype, models_dir)

    print("=== Generating configs ===")
    update_genai_config(output_dir=models_dir, device=args.device)
    fix_tokenizer(output_dir=models_dir)
    print()
    print("Done.")
@@ -0,0 +1,31 @@ +{ + "input_model": { + "type": "ONNXModel", + "model_path": "webgpu/models/vision_encoder/model.onnx" + }, + "passes": { + "int8": { + "type": "OnnxBlockWiseRtnQuantization", + "bits": 8, + "block_size": 32, + "is_symmetric": false, + "save_as_external_data": true, + "external_data_name": "model.onnx.data" + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models/vision" +} diff --git a/mistralai-Ministral-3-3B-Instruct-2512/builtin/webgpu/vision_embedding_export.json b/mistralai-Ministral-3-3B-Instruct-2512/builtin/webgpu/vision_embedding_export.json new file mode 100644 index 00000000..d4e7e993 --- /dev/null +++ b/mistralai-Ministral-3-3B-Instruct-2512/builtin/webgpu/vision_embedding_export.json @@ -0,0 +1,29 @@ +{ + "input_model": { + "type": "HfModel", + "model_path": "mistralai/Ministral-3-3B-Instruct-2512-BF16" + }, + "passes": { + "export": { + "type": "MobiusBuilder", + "precision": "fp16", + "runtime": "none", + "components_to_export": ["vision_encoder", "embedding"] + } + }, + "engine": { + "target": { + "type": "LocalSystem", + "accelerators": [ + { + "device": "gpu", + "execution_providers": [ + "WebGpuExecutionProvider" + ] + } + ] + } + }, + "no_artifacts": true, + "output_dir": "webgpu/models" +}