From eb00c18f5bd79b86c62d9b4fbf526e3f1036bc74 Mon Sep 17 00:00:00 2001
From: Sean Budd <sean@nvaccess.org>
Date: Fri, 16 Jan 2026 17:27:43 +1100
Subject: [PATCH 1/3] Revert "Revert AI image description work (#19425)"

This reverts commit 9f3aecbb095b9721189f443c300fce1db6f8accf.
---
 .github/workflows/testAndPublish.yml          |   1 +
 pyproject.toml                                |   4 +
 source/NVDAState.py                           |   4 +
 source/_localCaptioner/__init__.py            |  44 +
 source/_localCaptioner/captioner/__init__.py  |  53 ++
 source/_localCaptioner/captioner/base.py      |  24 +
 source/_localCaptioner/captioner/vitGpt2.py   | 382 +++++++++
 source/_localCaptioner/imageDescriber.py      | 214 +++++
 source/_localCaptioner/modelConfig.py         | 277 ++++++
 source/_localCaptioner/modelDownloader.py     | 760 +++++++++++++++++
 source/config/__init__.py                     |   1 +
 source/config/configSpec.py                   |  24 +-
 source/core.py                                |   7 +
 source/globalCommands.py                      |  37 +
 source/gui/__init__.py                        |   5 +
 source/gui/_localCaptioner/__init__.py        |   0
 source/gui/_localCaptioner/messageDialogs.py  | 198 +++++
 source/gui/blockAction.py                     |  13 +
 source/gui/settingsDialogs.py                 |  49 ++
 source/setup.py                               |   7 +-
 .../libraries/SystemTestSpy/configManager.py  |  26 +
 .../libraries/SystemTestSpy/mockModels.py     | 793 ++++++++++++++++++
 .../standard-doLoadMockModel.ini              |  20 +
 .../robot/automatedImageDescriptions.py       |  43 +
 .../robot/automatedImageDescriptions.robot    |  26 +
 tests/unit/test_localCaptioner/__init__.py    |   0
 .../test_localCaptioner/test_captioner.py     | 345 ++++++++
 .../test_localCaptioner/test_downloader.py    | 108 +++
 user_docs/en/changes.md                       |   8 +
 user_docs/en/userGuide.md                     |  41 +-
 uv.lock                                       | 235 ++++--
 31 files changed, 3684 insertions(+), 65 deletions(-)
 create mode 100644 source/_localCaptioner/__init__.py
 create mode 100644 source/_localCaptioner/captioner/__init__.py
 create mode 100644 source/_localCaptioner/captioner/base.py
 create mode 100644 source/_localCaptioner/captioner/vitGpt2.py
 create mode 100644 source/_localCaptioner/imageDescriber.py
 create mode 100644 source/_localCaptioner/modelConfig.py
 create mode 100644 source/_localCaptioner/modelDownloader.py
 create mode 100644 source/gui/_localCaptioner/__init__.py
 create mode 100644 source/gui/_localCaptioner/messageDialogs.py
 create mode 100644 tests/system/libraries/SystemTestSpy/mockModels.py
 create mode 100644 tests/system/nvdaSettingsFiles/standard-doLoadMockModel.ini
 create mode 100644 tests/system/robot/automatedImageDescriptions.py
 create mode 100644 tests/system/robot/automatedImageDescriptions.robot
 create mode 100644 tests/unit/test_localCaptioner/__init__.py
 create mode 100644 tests/unit/test_localCaptioner/test_captioner.py
 create mode 100644 tests/unit/test_localCaptioner/test_downloader.py

diff --git a/.github/workflows/testAndPublish.yml b/.github/workflows/testAndPublish.yml
index f4990055655..928d75875f6 100644
--- a/.github/workflows/testAndPublish.yml
+++ b/.github/workflows/testAndPublish.yml
@@ -401,6 +401,7 @@ jobs:
           - startupShutdown
           - symbols
           - vscode
+          - imageDescriptions
           - chrome_annotations
           - chrome_list
           - chrome_table
diff --git a/pyproject.toml b/pyproject.toml
index 0e41fd3bc15..3fc86063d16 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,6 +48,9 @@ dependencies = [
 	"l2m4m==1.0.4",
 	"pyyaml==6.0.3",
 	"pymdown-extensions==10.17.1",
+	# local image caption
+	"onnxruntime==1.23.2",
+	"numpy==2.3.5",
 ]
 
 [project.urls]
@@ -335,6 +338,7 @@ system-tests = [
 	"robotframework==7.3.2",
 	"robotremoteserver==1.1.1",
 	"robotframework-screencaplibrary==1.6.0",
+	"onnx==1.19.1",
 ]
 unit-tests = [
 	# Creating XML unit test reports
diff --git a/source/NVDAState.py b/source/NVDAState.py
index 6f6b079aab1..f08f227dd83 100644
--- a/source/NVDAState.py
+++ b/source/NVDAState.py
@@ -67,6 +67,10 @@ def voiceDictsBackupDir(self) -> str:
 	def updatesDir(self) -> str:
 		return os.path.join(self.configDir, "updates")
 
+	@property
+	def modelsDir(self) -> str:
+		return os.path.join(self.configDir, "models")
+
 	@property
 	def nvdaConfigFile(self) -> str:
 		return os.path.join(self.configDir, "nvda.ini")
diff --git a/source/_localCaptioner/__init__.py b/source/_localCaptioner/__init__.py
new file mode 100644
index 00000000000..3d55b5c486a
--- /dev/null
+++ b/source/_localCaptioner/__init__.py
@@ -0,0 +1,44 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+from logHandler import log
+
+from .imageDescriber import ImageDescriber
+from . import modelConfig
+
+_localCaptioner: ImageDescriber | None = None
+
+
+def initialize():
+	"""Initialise the local captioner."""
+	global _localCaptioner
+	log.debug("Initializing local captioner")
+	modelConfig.initialize()
+	_localCaptioner = ImageDescriber()
+
+
+def terminate():
+	"""Terminate the local captioner."""
+	global _localCaptioner
+	if _localCaptioner is None:
+		log.error("local captioner not running")
+		return
+	log.debug("Terminating local captioner")
+	_localCaptioner.terminate()
+	_localCaptioner = None
+
+
+def isModelLoaded() -> bool:
+	"""Report whether the local captioner exists and has its model loaded."""
+	captioner = _localCaptioner
+	if captioner is None:
+		return False
+	return captioner.isModelLoaded
+
+
+def toggleImageCaptioning() -> None:
+	"""do load/unload the model from memory."""
+	if _localCaptioner is not None:
+		_localCaptioner.toggleSwitch()
diff --git a/source/_localCaptioner/captioner/__init__.py b/source/_localCaptioner/captioner/__init__.py
new file mode 100644
index 00000000000..a1f16590a0c
--- /dev/null
+++ b/source/_localCaptioner/captioner/__init__.py
@@ -0,0 +1,53 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+import json
+
+from logHandler import log
+from .base import ImageCaptioner
+
+
+def imageCaptionerFactory(
+	configPath: str,
+	encoderPath: str | None = None,
+	decoderPath: str | None = None,
+	monomericModelPath: str | None = None,
+) -> ImageCaptioner:
+	"""Create the image captioner matching the architecture named in the model config.
+
+	:param monomericModelPath: Path to a single merged model file.
+	:param encoderPath: Path to the encoder model file.
+	:param decoderPath: Path to the decoder model file.
+	:param configPath: Path to the configuration file.
+	:raises ValueError: If neither a single model nor both encoder and decoder are provided.
+	:raises FileNotFoundError: If the config file is not found.
+	:raises NotImplementedError: If the model architecture is unsupported.
+	:raises Exception: If the config file exists but fails to load (it is logged and re-raised).
+	:return: instance of ImageCaptioner
+	"""
+	if not monomericModelPath and not (encoderPath and decoderPath):
+		raise ValueError(
+			"You must provide either 'monomericModelPath' or both 'encoderPath' and 'decoderPath'.",
+		)
+
+	try:
+		with open(configPath, "r", encoding="utf-8") as f:
+			config = json.load(f)
+	except FileNotFoundError:
+		raise FileNotFoundError(
+			f"Caption model config file {configPath} not found, "
+			"please download models and config file first!",
+		)
+	except Exception:
+		log.exception(f"Failed to load caption model config file {configPath}")
+		raise
+
+	modelArchitecture = config["architectures"][0]
+	if modelArchitecture == "VisionEncoderDecoderModel":
+		from .vitGpt2 import VitGpt2ImageCaptioner
+
+		return VitGpt2ImageCaptioner(encoderPath, decoderPath, configPath)
+	else:
+		raise NotImplementedError("Unsupported model architectures")
diff --git a/source/_localCaptioner/captioner/base.py b/source/_localCaptioner/captioner/base.py
new file mode 100644
index 00000000000..ba7ea116f79
--- /dev/null
+++ b/source/_localCaptioner/captioner/base.py
@@ -0,0 +1,24 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+from abc import ABC, abstractmethod
+
+
+class ImageCaptioner(ABC):
+	"""Abstract interface for image caption generation.
+
+	Supports generate caption for image
+	"""
+
+	@abstractmethod
+	def generateCaption(self, image: str | bytes, maxLength: int | None = None) -> str:
+		"""
+		Generate a caption for the given image.
+
+		:param image: Image file path or binary data.
+		:param maxLength: Optional maximum length for the generated caption.
+		:return: The generated image caption as a string.
+		"""
+		pass
diff --git a/source/_localCaptioner/captioner/vitGpt2.py b/source/_localCaptioner/captioner/vitGpt2.py
new file mode 100644
index 00000000000..47af56c9266
--- /dev/null
+++ b/source/_localCaptioner/captioner/vitGpt2.py
@@ -0,0 +1,382 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+import os
+import json
+import re
+import io
+from functools import lru_cache
+
+import numpy as np
+from PIL import Image
+
+from logHandler import log
+
+from .base import ImageCaptioner
+from ..modelConfig import (
+	_EncoderConfig,
+	_DecoderConfig,
+	_GenerationConfig,
+	_ModelConfig,
+	_PreprocessorConfig,
+	_createConfigFromDict,
+)
+from .. import modelConfig
+
+
+class VitGpt2ImageCaptioner(ImageCaptioner):
+	"""Lightweight ONNX Runtime image captioning model.
+
+	This class provides image captioning functionality using ONNX models
+	without PyTorch dependencies. It uses a Vision Transformer encoder
+	and GPT-2 decoder for generating captions.
+	"""
+
+	def __init__(
+		self,
+		encoderPath: str,
+		decoderPath: str,
+		configPath: str,
+		enableProfiling: bool = False,
+	) -> None:
+		"""Initialize the lightweight ONNX image captioning model.
+
+		:param encoderPath: Path to the ViT encoder ONNX model.
+		:param decoderPath: Path to the GPT-2 decoder ONNX model.
+		:param configPath: Path to the configuration file (required).
+		:param enableProfiling: Whether to enable ONNX Runtime profiling.
+		:raises FileNotFoundError: If config file is not found.
+		:raises Exception: If model initialization fails.
+		"""
+		# Import late to avoid importing numpy at initialization
+		import onnxruntime as ort
+
+		# Load configuration file
+		try:
+			with open(configPath, "r", encoding="utf-8") as f:
+				self.config = json.load(f)
+		except FileNotFoundError:
+			raise FileNotFoundError(
+				f"Caption model config file {configPath} not found, "
+				"please download models and config file first!",
+			)
+		except Exception:
+			raise
+
+		# Load vocabulary from vocab.json in the same directory as config
+		configDir = os.path.dirname(configPath)
+		vocabPath = os.path.join(configDir, "vocab.json")
+		self.vocab = self._loadVocab(vocabPath)
+		self.vocabSize = len(self.vocab)
+
+		preprocessorPath = os.path.join(configDir, "preprocessor_config.json")
+		self.preprocessorConfig = self._loadPreprocessorConfig(preprocessorPath)
+
+		# Load all model parameters from configuration
+		self._loadModelParams()
+
+		# Configure ONNX Runtime session
+		sessionOptions = ort.SessionOptions()
+		if enableProfiling:
+			sessionOptions.enable_profiling = True
+
+		# Load ONNX models
+		try:
+			self.encoderSession = ort.InferenceSession(encoderPath, sess_options=sessionOptions)
+			self.decoderSession = ort.InferenceSession(decoderPath, sess_options=sessionOptions)
+		except (
+			ort.capi.onnxruntime_pybind11_state.InvalidProtobuf,
+			ort.capi.onnxruntime_pybind11_state.NoSuchFile,
+		) as e:
+			raise FileNotFoundError(
+				"model file incomplete"
+				f" Please check whether the file is complete or re-download. Original error: {e}",
+			) from e
+
+		log.debug(
+			f"Loaded ONNX models - Encoder: {os.path.basename(encoderPath)}, Decoder: {os.path.basename(decoderPath)}",
+		)
+		log.debug(f"Loaded config : {os.path.basename(configPath)}")
+		log.debug(f"Loaded vocabulary : {os.path.basename(vocabPath)}")
+		log.debug(
+			f"Model config - Image size: {self.encoderConfig.image_size}, Max length: {self.decoderConfig.max_length}",
+		)
+
+	def _loadModelParams(self) -> None:
+		"""Load all model parameters from configuration file."""
+		# Load encoder configuration
+		encoder_dict = self.config.get("encoder", {})
+		self.encoderConfig = _createConfigFromDict(
+			_EncoderConfig,
+			encoder_dict,
+			modelConfig._DEFAULT_ENCODER_CONFIG,
+		)
+
+		# Load decoder configuration
+		decoder_dict = self.config.get("decoder", {})
+		self.decoderConfig = _createConfigFromDict(
+			_DecoderConfig,
+			decoder_dict,
+			modelConfig._DEFAULT_DECODER_CONFIG,
+		)
+
+		# Load generation configuration
+		generation_dict = self.config.get("generation", {})
+		self.generationConfig = _createConfigFromDict(
+			_GenerationConfig,
+			generation_dict,
+			modelConfig._DEFAULT_GENERATION_CONFIG,
+		)
+
+		# Load main model configuration
+		self.modelConfig = _createConfigFromDict(_ModelConfig, self.config, modelConfig._DEFAULT_MODEL_CONFIG)
+
+	def _loadVocab(self, vocabPath: str) -> dict[int, str]:
+		"""Load vocabulary file.
+
+		:param vocabPath: Path to vocab.json file.
+		:return: Dictionary mapping token IDs to tokens.
+		"""
+		try:
+			with open(vocabPath, "r", encoding="utf-8") as f:
+				vocabData = json.load(f)
+
+			# Convert to id -> token format
+			vocab = {v: k for k, v in vocabData.items()}
+			log.debug(f"Successfully loaded vocabulary with {len(vocab)} tokens")
+			return vocab
+
+		except FileNotFoundError:
+			log.exception(f"vocab.json not found at {vocabPath}")
+			raise
+		except Exception:
+			log.exception(f"Could not load vocabulary from {vocabPath}")
+			raise
+
+	def _loadPreprocessorConfig(self, preprocessorPath: str) -> _PreprocessorConfig:
+		"""Load preprocessor configuration from preprocessor_config.json."""
+		try:
+			with open(preprocessorPath, "r", encoding="utf-8") as f:
+				preprocessor_dict = json.load(f)
+		except FileNotFoundError:
+			log.warning("Preprocessor config not found, using defaults")
+			return modelConfig._DEFAULT_PREPROCESSOR_CONFIG
+		else:
+			return _createConfigFromDict(
+				_PreprocessorConfig,
+				preprocessor_dict,
+				modelConfig._DEFAULT_PREPROCESSOR_CONFIG,
+			)
+
+	def _preprocessImage(self, image: str | bytes) -> np.ndarray:
+		"""Turn an image into the float32 array the encoder expects, driven by the preprocessor config.
+
+		:param image: Image file path (str) or encoded image data (bytes).
+		:return: Preprocessed image array ready for model input.
+		"""
+		# A str is always opened as a path, so a missing file raises FileNotFoundError instead of a TypeError from BytesIO
+		if isinstance(image, str):
+			img = Image.open(image).convert("RGB")
+		else:
+			img = Image.open(io.BytesIO(image)).convert("RGB")
+
+		# Resize image if configured
+		if self.preprocessorConfig.do_resize:
+			target_size = (
+				self.preprocessorConfig.size["width"],
+				self.preprocessorConfig.size["height"],
+			)
+			# Map resample integer to PIL constant
+			resample_map = {
+				0: Image.NEAREST,
+				1: Image.LANCZOS,
+				2: Image.BILINEAR,
+				3: Image.BICUBIC,
+				4: Image.BOX,
+				5: Image.HAMMING,
+			}
+			resample_method = resample_map.get(self.preprocessorConfig.resample, Image.LANCZOS)
+			img = img.resize(target_size, resample_method)
+
+		# Convert to numpy array
+		imgArray = np.array(img).astype(np.float32)
+
+		# Rescale if configured (typically from [0, 255] to [0, 1])
+		if self.preprocessorConfig.do_rescale:
+			imgArray = imgArray * self.preprocessorConfig.rescale_factor
+
+		# Normalize if configured
+		if self.preprocessorConfig.do_normalize:
+			mean = np.array(self.preprocessorConfig.image_mean, dtype=np.float32)
+			std = np.array(self.preprocessorConfig.image_std, dtype=np.float32)
+			imgArray = (imgArray - mean) / std
+
+		# Adjust dimensions: (H, W, C) -> (1, C, H, W)
+		imgArray = np.transpose(imgArray, (2, 0, 1))
+		imgArray = np.expand_dims(imgArray, axis=0)
+
+		return imgArray
+
+	def _encodeImage(self, imageArray: np.ndarray) -> np.ndarray:
+		"""Encode image using ViT encoder.
+
+		:param imageArray: Preprocessed image array.
+		:return: Encoder hidden states.
+		"""
+		# Get encoder input name
+		inputName = self.encoderSession.get_inputs()[0].name
+
+		# Run encoder inference
+		imageArray = imageArray.astype(np.float32)
+		encoderOutputs = self.encoderSession.run(None, {inputName: imageArray})
+
+		# Return last hidden state
+		return encoderOutputs[0]
+
+	def _decodeTokens(self, tokenIds: list[int]) -> str:
+		"""Decode token IDs to text, skipping IDs missing from the vocabulary and the end-of-text/pad tokens.
+
+		:param tokenIds: List of token IDs.
+		:return: Decoded text string.
+		"""
+		tokens = []
+		for tokenId in tokenIds:
+			if tokenId in self.vocab:
+				token = self.vocab[tokenId]
+				if token not in ["<|endoftext|>", "<|pad|>"]:
+					tokens.append(token)
+
+		# NOTE(review): joining with " " also separates sub-word pieces that carry no Ġ marker — confirm this is intended
+		# Ġ (Unicode U+0120) is used by GPT-2 and RoBERTa to indicate space at the beginning of a word in their vocabulary
+		text = " ".join(tokens).replace("Ġ", " ")
+
+		# Basic text cleaning
+		text = re.sub(r"\s+", " ", text)  # Merge multiple spaces
+		text = text.strip()
+
+		return text
+
+	def _getDecoderInputNames(self) -> list[str]:
+		"""Get decoder input names for debugging.
+
+		:returns: List of decoder input names.
+		"""
+		return [inp.name for inp in self.decoderSession.get_inputs()]
+
+	def _getDecoderOutputNames(self) -> list[str]:
+		"""Get decoder output names for debugging.
+
+		:return: List of decoder output names.
+		"""
+		return [out.name for out in self.decoderSession.get_outputs()]
+
+	def _initializePastKeyValues(self, batchSize: int = 1) -> dict[str, np.ndarray]:
+		"""Initialize past_key_values for decoder.
+
+		:param batchSize: Batch size for inference.
+		:return: Dictionary of initialized past key values.
+		"""
+		pastKeyValues = {}
+
+		# Create key and value for each layer
+		for layerIdx in range(self.decoderConfig.n_layer):
+			# Key and value shape: (batch_size, num_heads, 0, head_dim)
+			# Initial sequence length is 0
+			headDim = self.decoderConfig.n_embd // self.decoderConfig.n_head
+
+			keyShape = (batchSize, self.decoderConfig.n_head, 0, headDim)
+			valueShape = (batchSize, self.decoderConfig.n_head, 0, headDim)
+
+			pastKeyValues[f"past_key_values.{layerIdx}.key"] = np.zeros(keyShape, dtype=np.float32)
+			pastKeyValues[f"past_key_values.{layerIdx}.value"] = np.zeros(valueShape, dtype=np.float32)
+
+		return pastKeyValues
+
+	def _generateWithGreedy(
+		self,
+		encoderHiddenStates: np.ndarray,
+		maxLength: int | None = None,
+	) -> str:
+		"""Generate text using greedy search.
+
+
+		:param encoderHiddenStates: Encoder hidden states.
+		:param maxLength: Maximum generation length.
+		:return: Generated text string.
+		"""
+		if maxLength is None:
+			maxLength = self.decoderConfig.max_length
+
+		# Initialize input sequence
+		inputIds = np.array([[self.modelConfig.bos_token_id]], dtype=np.int64)
+		generatedTokens = []
+
+		# Initialize past_key_values
+		pastKeyValues = self._initializePastKeyValues(batchSize=1)
+
+		for step in range(maxLength):
+			# Prepare decoder inputs
+			decoderInputs = {
+				"input_ids": inputIds if step == 0 else np.array([[generatedTokens[-1]]], dtype=np.int64),
+				"encoder_hidden_states": encoderHiddenStates,
+				"use_cache_branch": np.array([1], dtype=np.bool_),
+			}
+
+			# Add past_key_values to inputs
+			decoderInputs.update(pastKeyValues)
+
+			# Run decoder
+			decoderOutputs = self.decoderSession.run(None, decoderInputs)
+			logits = decoderOutputs[0]  # Shape: (batch_size, seq_len, vocab_size)
+
+			# Greedy selection of next token
+			nextTokenLogits = logits[0, -1, :]  # Logits for last position
+			nextTokenId = int(np.argmax(nextTokenLogits))
+
+			# Check if generation should end
+			if nextTokenId == self.modelConfig.eos_token_id:
+				break
+
+			generatedTokens.append(nextTokenId)
+
+			# Update past_key_values from outputs
+			if len(decoderOutputs) > 1:
+				for layerIdx in range(self.decoderConfig.n_layer):
+					if len(decoderOutputs) > 1 + layerIdx * 2 + 1:
+						# [3] -> layer1 key, [4] -> layer1 value
+						keyIndex = 1 + layerIdx * 2
+						valueIndex = keyIndex + 1
+						pastKeyValues[f"past_key_values.{layerIdx}.key"] = decoderOutputs[keyIndex]
+						pastKeyValues[f"past_key_values.{layerIdx}.value"] = decoderOutputs[valueIndex]
+
+			# Avoid sequences that are too long
+			if len(generatedTokens) >= self.decoderConfig.n_ctx - 1:
+				break
+
+		# Decode generated text
+		return self._decodeTokens(generatedTokens)
+
+	@lru_cache()
+	def generateCaption(
+		self,
+		image: str | bytes,
+		maxLength: int | None = None,
+	) -> str:
+		"""Generate an image caption; repeated calls with the same arguments are answered from the lru_cache.
+
+		:param image: Image file path or binary data.
+		:param maxLength: Maximum generation length.
+		:return: Generated image caption.
+		"""
+		# NOTE(review): lru_cache on a method keeps self and each cached image referenced — confirm the model can still be released
+		imageArray = self._preprocessImage(image)
+
+		# Encode image
+		encoderHiddenStates = self._encodeImage(imageArray)
+
+		# Generate text
+		caption = self._generateWithGreedy(encoderHiddenStates, maxLength)
+
+		return caption
diff --git a/source/_localCaptioner/imageDescriber.py b/source/_localCaptioner/imageDescriber.py
new file mode 100644
index 00000000000..1e193789ebf
--- /dev/null
+++ b/source/_localCaptioner/imageDescriber.py
@@ -0,0 +1,214 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+"""ImageDescriber module for NVDA.
+
+This module provides local image captioning functionality using ONNX models.
+It allows users to capture screen regions and generate captions using local AI models.
+"""
+
+import io
+import threading
+from threading import Thread
+import os
+
+import wx
+import config
+from logHandler import log
+import ui
+import api
+from keyboardHandler import KeyboardInputGesture
+from NVDAState import WritePaths
+import core
+
+from .captioner import ImageCaptioner
+from .captioner import imageCaptionerFactory
+
+
+# Module-level configuration
+_localCaptioner = None
+
+
+def _screenshotNavigator() -> bytes:
+	"""Capture a screenshot of the current navigator object.
+
+	:return: The captured image data as bytes in JPEG format.
+	"""
+	# Get the current navigator object
+	obj = api.getNavigatorObject()
+
+	# Position and size of the object; NOTE(review): unpacking raises if location is None — confirm it is always set
+	x, y, width, height = obj.location
+
+	# Create a bitmap with the same size as the object
+	bmp = wx.Bitmap(width, height)
+
+	# Create a memory device context for drawing operations on the bitmap
+	mem = wx.MemoryDC(bmp)
+
+	# Copy the specified screen region to the memory bitmap
+	mem.Blit(0, 0, width, height, wx.ScreenDC(), x, y)
+
+	# Convert the bitmap to an image object for more flexible operations
+	image = bmp.ConvertToImage()
+
+	# Create a byte stream object to save image data as binary data
+	body = io.BytesIO()
+
+	# Save the image to the byte stream in JPEG format
+	image.SaveFile(body, wx.BITMAP_TYPE_JPEG)
+
+	# Read the binary image data from the byte stream
+	imageData = body.getvalue()
+	return imageData
+
+
+def _messageCaption(captioner: ImageCaptioner, imageData: bytes) -> None:
+	"""Generate a caption for the given image data.
+
+	:param captioner: The captioner instance to use for generation.
+	:param imageData: The image data to caption.
+	"""
+	try:
+		description = captioner.generateCaption(image=imageData)
+	except Exception:
+		# Translators: error message when an image description cannot be generated
+		wx.CallAfter(ui.message, pgettext("imageDesc", "Failed to generate description"))
+		log.exception("Failed to generate caption")
+	else:
+		wx.CallAfter(
+			ui.message,
+			# Translators: Presented when an AI image description has been generated.
+			# {description} will be replaced with the generated image description.
+			pgettext("imageDesc", "Could be: {description}").format(description=description),
+		)
+
+
+class ImageDescriber:
+	"""module for local image caption functionality.
+
+	This module provides image captioning using local ONNX models.
+	It can capture screen regions and generate descriptive captions.
+	"""
+
+	def __init__(self) -> None:
+		self.isModelLoaded = False
+		self.captioner: ImageCaptioner | None = None
+		self.captionThread: Thread | None = None
+		self.loadModelThread: Thread | None = None
+
+		enable = config.conf["automatedImageDescriptions"]["enable"]
+		# Load model when initializing (may cause high memory usage)
+		if enable:
+			core.postNvdaStartup.register(self.loadModelInBackground)
+
+	def terminate(self):
+		for t in [self.captionThread, self.loadModelThread]:
+			if t is not None and t.is_alive():
+				t.join()
+
+		self.captioner = None
+
+	def runCaption(self, gesture: KeyboardInputGesture) -> None:
+		"""Script to run image captioning on the current navigator object.
+
+		:param gesture: The input gesture that triggered this script.
+		"""
+		self._doCaption()
+
+	def _doCaption(self) -> None:
+		"""Caption the current navigator object, capturing the screen only once a caption will actually run."""
+		if not self.isModelLoaded:
+			from gui._localCaptioner.messageDialogs import openEnableOnceDialog
+
+			# Ask to enable image desc only in this session, No configuration modifications
+			wx.CallAfter(openEnableOnceDialog)
+			return
+
+		if self.captionThread is not None and self.captionThread.is_alive():
+			return
+
+		imageData = _screenshotNavigator()
+
+		self.captionThread = threading.Thread(
+			target=_messageCaption,
+			args=(self.captioner, imageData),
+			name="RunCaptionThread",
+		)
+		# Translators: Message when starting image recognition
+		ui.message(pgettext("imageDesc", "getting image description..."))
+		self.captionThread.start()
+
+	def _loadModel(self, localModelDirPath: str | None = None) -> None:
+		"""Load the ONNX model for image captioning.
+
+		:param localModelDirPath: path of model directory
+		"""
+
+		if not localModelDirPath:
+			baseModelsDir = WritePaths.modelsDir
+			localModelDirPath = os.path.join(
+				baseModelsDir,
+				config.conf["automatedImageDescriptions"]["defaultModel"],
+			)
+		encoderPath = f"{localModelDirPath}/onnx/encoder_model_quantized.onnx"
+		decoderPath = f"{localModelDirPath}/onnx/decoder_model_merged_quantized.onnx"
+		configPath = f"{localModelDirPath}/config.json"
+
+		try:
+			self.captioner = imageCaptionerFactory(
+				encoderPath=encoderPath,
+				decoderPath=decoderPath,
+				configPath=configPath,
+			)
+		except FileNotFoundError:
+			self.isModelLoaded = False
+			from gui._localCaptioner.messageDialogs import ImageDescDownloader
+
+			descDownloader = ImageDescDownloader()
+			wx.CallAfter(descDownloader.openDownloadDialog)
+		except Exception:
+			self.isModelLoaded = False
+			# Translators: error message when fail to load model
+			wx.CallAfter(ui.message, pgettext("imageDesc", "failed to load image captioner"))
+			log.exception("Failed to load image captioner model")
+		else:
+			self.isModelLoaded = True
+			# Translators: Message when successfully load the model
+			wx.CallAfter(ui.message, pgettext("imageDesc", "image captioning on"))
+
+	def loadModelInBackground(self, localModelDirPath: str | None = None) -> None:
+		"""load model in child thread
+
+		:param localModelDirPath: path of model directory
+		"""
+		self.loadModelThread = threading.Thread(
+			target=self._loadModel,
+			args=(localModelDirPath,),
+			name="LoadModelThread",
+		)
+		self.loadModelThread.start()
+
+	def _doReleaseModel(self) -> None:
+		if hasattr(self, "captioner") and self.captioner:
+			del self.captioner
+			self.captioner = None
+			# Translators: Message when image captioning terminates
+			ui.message(pgettext("imageDesc", "image captioning off"))
+			self.isModelLoaded = False
+
+	def toggleSwitch(self) -> None:
+		"""do load/unload the model from memory."""
+		if self.isModelLoaded:
+			self._doReleaseModel()
+		else:
+			self.loadModelInBackground()
+
+	def toggleImageCaptioning(self, gesture: KeyboardInputGesture) -> None:
+		"""do load/unload the model from memory.
+
+		:param gesture: gesture to toggle this function
+		"""
+		self.toggleSwitch()
diff --git a/source/_localCaptioner/modelConfig.py b/source/_localCaptioner/modelConfig.py
new file mode 100644
index 00000000000..f5b705e482a
--- /dev/null
+++ b/source/_localCaptioner/modelConfig.py
@@ -0,0 +1,277 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+from dataclasses import dataclass, fields, replace
+from typing import Type
+
+
+@dataclass(frozen=True)
+class _EncoderConfig:
+	"""Configuration for Vision Transformer encoder.
+
+	Based on the Vision Transformer (ViT) specification:
+	https://arxiv.org/abs/2010.11929
+
+	HuggingFace ViT configuration:
+	https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTConfig
+
+	Note: Variable names follow the original specification and HuggingFace conventions
+	rather than lowerCamelCase to maintain compatibility with pretrained models.
+	"""
+
+	image_size: int = 224
+	num_channels: int = 3
+	patch_size: int = 16
+	hidden_size: int = 768
+	num_hidden_layers: int = 12
+	num_attention_heads: int = 12
+	intermediate_size: int = 3072
+	hidden_act: str = "gelu"
+	hidden_dropout_prob: float = 0.0
+	attention_probs_dropout_prob: float = 0.0
+	initializer_range: float = 0.02
+	layer_norm_eps: float = 1e-12
+	encoder_stride: int = 16
+	qkv_bias: bool = True
+	model_type: str = "vit"
+	# Additional fields from HuggingFace config
+	add_cross_attention: bool = False
+	is_decoder: bool = False
+	is_encoder_decoder: bool = False
+	chunk_size_feed_forward: int = 0
+	cross_attention_hidden_size: int | None = None
+	finetuning_task: str | None = None
+	output_attentions: bool = False
+	output_hidden_states: bool = False
+	return_dict: bool = True
+	pruned_heads: dict[str, list[int]] | None = None
+	tie_word_embeddings: bool = True
+	torch_dtype: str | None = None
+	torchscript: bool = False
+	use_bfloat16: bool = False
+
+
+@dataclass(frozen=True)
+class _DecoderConfig:
+	"""Configuration for GPT-2 decoder.
+
+	Based on the GPT-2 specification:
+	https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf
+
+	HuggingFace GPT-2 configuration:
+	https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Config
+
+	Note: Variable names follow the original GPT-2 and HuggingFace conventions
+	rather than lowerCamelCase to maintain compatibility with pretrained models.
+	"""
+
+	vocab_size: int = 50257
+	n_embd: int = 768
+	n_layer: int = 12
+	n_head: int = 12
+	n_ctx: int = 1024
+	n_positions: int = 1024
+	n_inner: int | None = None
+	activation_function: str = "gelu_new"
+	resid_pdrop: float = 0.1
+	embd_pdrop: float = 0.1
+	attn_pdrop: float = 0.1
+	layer_norm_epsilon: float = 1e-05
+	initializer_range: float = 0.02
+	model_type: str = "gpt2"
+	# Generation parameters
+	max_length: int = 20
+	min_length: int = 0
+	do_sample: bool = False
+	early_stopping: bool = False
+	num_beams: int = 1
+	num_beam_groups: int = 1
+	diversity_penalty: float = 0.0
+	temperature: float = 1.0
+	top_k: int = 50
+	top_p: float = 1.0
+	typical_p: float = 1.0
+	repetition_penalty: float = 1.0
+	length_penalty: float = 1.0
+	no_repeat_ngram_size: int = 0
+	encoder_no_repeat_ngram_size: int = 0
+	num_return_sequences: int = 1
+	# Cross attention
+	add_cross_attention: bool = True
+	is_decoder: bool = True
+	is_encoder_decoder: bool = False
+	# Token IDs
+	bos_token_id: int = 50256
+	eos_token_id: int = 50256
+	pad_token_id: int = 50256
+	decoder_start_token_id: int = 50256
+	# Additional configuration
+	chunk_size_feed_forward: int = 0
+	cross_attention_hidden_size: int | None = None
+	bad_words_ids: list[int] | None = None
+	begin_suppress_tokens: list[int] | None = None
+	forced_bos_token_id: int | None = None
+	forced_eos_token_id: int | None = None
+	suppress_tokens: list[int] | None = None
+	exponential_decay_length_penalty: float | None = None
+	remove_invalid_values: bool = False
+	return_dict_in_generate: bool = False
+	output_attentions: bool = False
+	output_hidden_states: bool = False
+	output_scores: bool = False
+	use_cache: bool = True
+	# Labels
+	id2label: dict[str, str] | None = None
+	label2id: dict[str, int] | None = None
+	# Scaling and attention
+	reorder_and_upcast_attn: bool = False
+	scale_attn_by_inverse_layer_idx: bool = False
+	scale_attn_weights: bool = True
+	# Summary configuration
+	summary_activation: str | None = None
+	summary_first_dropout: float = 0.1
+	summary_proj_to_labels: bool = True
+	summary_type: str = "cls_index"
+	summary_use_proj: bool = True
+	# Task specific parameters (``object`` rather than the builtin function ``any``, which is not a type)
+	task_specific_params: dict[str, object] | None = None
+	# Other configurations
+	finetuning_task: str | None = None
+	prefix: str | None = None
+	problem_type: str | None = None
+	pruned_heads: dict[str, list[int]] | None = None
+	sep_token_id: int | None = None
+	tf_legacy_loss: bool = False
+	tie_encoder_decoder: bool = False
+	tie_word_embeddings: bool = True
+	tokenizer_class: str | None = None
+	torch_dtype: str | None = None
+	torchscript: bool = False
+	use_bfloat16: bool = False
+
+
+@dataclass(frozen=True)
+class _GenerationConfig:
+	"""Configuration for text generation parameters.
+
+	Based on HuggingFace GenerationConfig:
+	https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig
+
+	Note: Variable names follow HuggingFace conventions rather than lowerCamelCase
+	to maintain compatibility with the transformers library.
+	"""
+
+	do_sample: bool = False
+	num_beams: int = 1
+	temperature: float = 1.0
+	top_k: int = 50
+	top_p: float = 1.0
+	repetition_penalty: float = 1.0
+	length_penalty: float = 1.0
+	max_length: int = 20
+	min_length: int = 0
+	early_stopping: bool = False
+	diversity_penalty: float = 0.0
+	num_beam_groups: int = 1
+	no_repeat_ngram_size: int = 0
+	num_return_sequences: int = 1
+
+
+@dataclass(frozen=True)
+class _ModelConfig:
+	"""Main model configuration.
+
+	Based on HuggingFace VisionEncoderDecoderConfig:
+	https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder#transformers.VisionEncoderDecoderConfig
+
+	Note: Variable names follow HuggingFace conventions rather than lowerCamelCase
+	to maintain compatibility with pretrained models.
+	"""
+
+	model_type: str = "vision-encoder-decoder"
+	is_encoder_decoder: bool = True
+	tie_word_embeddings: bool = False
+	bos_token_id: int = 50256
+	eos_token_id: int = 50256
+	pad_token_id: int = 50256
+	decoder_start_token_id: int = 50256
+	transformers_version: str = "4.33.0.dev0"
+	architectures: list[str] | None = None
+
+
+@dataclass(frozen=True)
+class _PreprocessorConfig:
+	"""Configuration for image preprocessing.
+
+	Based on HuggingFace ViTFeatureExtractor/ViTImageProcessor:
+	https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTFeatureExtractor
+	https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTImageProcessor
+
+	Note: Variable names follow HuggingFace conventions rather than lowerCamelCase
+	to maintain compatibility with the transformers library.
+	"""
+
+	do_normalize: bool = True
+	do_rescale: bool = True
+	do_resize: bool = True
+	feature_extractor_type: str = "ViTFeatureExtractor"
+	image_processor_type: str = "ViTFeatureExtractor"
+	image_mean: list[float] | None = None
+	image_std: list[float] | None = None
+	resample: int = 2  # PIL.Image.BILINEAR (LANCZOS is 1)
+	rescale_factor: float = 0.00392156862745098  # 1/255
+	size: dict[str, int] | None = None
+
+	def __post_init__(self):
+		"""Fill in defaults for mutable fields; ``object.__setattr__`` is used as the class is frozen."""
+		if self.image_mean is None:
+			object.__setattr__(self, "image_mean", [0.5, 0.5, 0.5])
+		if self.image_std is None:
+			object.__setattr__(self, "image_std", [0.5, 0.5, 0.5])
+		if self.size is None:
+			object.__setattr__(self, "size", {"height": 224, "width": 224})
+
+
+# Default configuration instances
+_DEFAULT_ENCODER_CONFIG: _EncoderConfig | None = None
+_DEFAULT_DECODER_CONFIG: _DecoderConfig | None = None
+_DEFAULT_GENERATION_CONFIG: _GenerationConfig | None = None
+_DEFAULT_MODEL_CONFIG: _ModelConfig | None = None
+_DEFAULT_PREPROCESSOR_CONFIG: _PreprocessorConfig | None = None
+
+
+def initialize():
+	global \
+		_DEFAULT_ENCODER_CONFIG, \
+		_DEFAULT_DECODER_CONFIG, \
+		_DEFAULT_GENERATION_CONFIG, \
+		_DEFAULT_MODEL_CONFIG, \
+		_DEFAULT_PREPROCESSOR_CONFIG
+	_DEFAULT_ENCODER_CONFIG = _EncoderConfig()
+	_DEFAULT_DECODER_CONFIG = _DecoderConfig()
+	_DEFAULT_GENERATION_CONFIG = _GenerationConfig()
+	_DEFAULT_MODEL_CONFIG = _ModelConfig()
+	_DEFAULT_PREPROCESSOR_CONFIG = _PreprocessorConfig()
+
+
+def _createConfigFromDict[T](
+	configClass: Type[T],
+	configdict: dict[str, str | int | float | bool | list | dict | None],
+	defaultConfig: T,
+) -> T:
+	"""Create a dataclass instance from a dictionary with automatic field mapping.
+
+	:param configClass: The dataclass type to create
+	:param configdict: dictionary containing configuration values
+	:param defaultConfig: Default configuration instance
+	:return: New dataclass instance with values from configdict or defaults
+	"""
+	# Get all field names from the dataclass
+	fieldNames = {f.name for f in fields(configClass)}
+
+	# Filter configdict to only include valid field names
+	validUpdates = {fieldName: value for fieldName, value in configdict.items() if fieldName in fieldNames}
+
+	return replace(defaultConfig, **validUpdates)
diff --git a/source/_localCaptioner/modelDownloader.py b/source/_localCaptioner/modelDownloader.py
new file mode 100644
index 00000000000..476b91dd926
--- /dev/null
+++ b/source/_localCaptioner/modelDownloader.py
@@ -0,0 +1,760 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+"""
+Multi‑threaded model downloader
+
+Download ONNX / tokenizer assets from *Hugging Face* (or any HTTP host)
+with progress callbacks. Refactored to use requests library.
+"""
+
+import os
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from collections.abc import Callable
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.exceptions import RequestException
+from requests.models import Response
+from urllib3.util.retry import Retry
+
+from logHandler import log
+import config
+from NVDAState import WritePaths
+
+# Type definitions
+ProgressCallback = Callable[[str, int, int, float], None]
+
+# Constants
+CHUNK_SIZE: int = 8_192
+MAX_RETRIES: int = 3
+BACKOFF_BASE: int = 2  # Base delay (in seconds) for exponential backoff strategy
+
+
+class ModelDownloader:
+	"""Multi-threaded model downloader with progress tracking and retry logic."""
+
+	def __init__(
+		self,
+		remoteHost: str = "huggingface.co",
+		maxWorkers: int = 4,
+		maxRetries: int = MAX_RETRIES,
+	):
+		"""
+		Initialize the ModelDownloader.
+
+		:param remoteHost: Remote host URL (default: huggingface.co).
+		:param maxWorkers: Maximum number of worker threads.
+		:param maxRetries: Maximum retry attempts per file.
+		"""
+		self.remoteHost = remoteHost
+		self.maxWorkers = maxWorkers
+		self.maxRetries = maxRetries
+
+		# Thread control
+		self.cancelRequested = False
+		self.downloadLock = threading.Lock()
+		self.activeFutures = set()
+
+		# Configure requests session with retry strategy and automatic redirects
+		self.session = requests.Session()
+
+		retryStrategy = Retry(
+			# Maximum number of retries before giving up
+			total=maxRetries,
+			# Base factor for calculating delay between retries
+			backoff_factor=BACKOFF_BASE,
+			# HTTP status codes that trigger a retry
+			status_forcelist=[429, 500, 502, 503, 504],
+			# HTTP methods allowed to retry
+			allowed_methods=["HEAD", "GET", "OPTIONS"],
+		)
+		# constructDownloadUrl also accepts explicit http:// hosts, so give both schemes the retries
+		adapter = HTTPAdapter(max_retries=retryStrategy)
+		for scheme in ("https://", "http://"):
+			self.session.mount(scheme, adapter)
+
+	def requestCancel(self) -> None:
+		"""Request cancellation of all active downloads."""
+		log.debug("Cancellation requested")
+		self.cancelRequested = True
+
+		# Cancel all active futures
+		with self.downloadLock:
+			for future in self.activeFutures:
+				if not future.done():
+					future.cancel()
+			self.activeFutures.clear()
+
+	def resetCancellation(self) -> None:
+		"""Reset cancellation state for new download session."""
+		with self.downloadLock:
+			self.cancelRequested = False
+			self.activeFutures.clear()
+
+	def ensureModelsDirectory(self) -> str:
+		"""
+		Ensure the directory given by the ``automatedImageDescriptions`` / ``defaultModel`` config value exists.
+
+		:return: Absolute path of that directory.
+		:raises OSError: When the directory cannot be created.
+		"""
+		modelsDir = os.path.abspath(config.conf["automatedImageDescriptions"]["defaultModel"])
+
+		try:
+			Path(modelsDir).mkdir(parents=True, exist_ok=True)
+		except OSError as err:
+			raise OSError(f"Failed to create models directory {modelsDir}: {err}") from err
+		else:
+			log.debug(f"Models directory ensured: {modelsDir}")
+			return modelsDir
+
+	def constructDownloadUrl(
+		self,
+		modelName: str,
+		filePath: str,
+		resolvePath: str = "/resolve/main",
+	) -> str:
+		"""
+		Construct a full download URL for *Hugging Face‑style* repositories.
+
+		:param modelName: Model repository name, e.g. ``Xenova/vit-gpt2-image-captioning``.
+		:param filePath: Path inside the repo.
+		:param resolvePath: The branch / ref path, default ``/resolve/main``.
+		:return: Complete download URL.
+		"""
+		remoteHost = self.remoteHost
+		if not remoteHost.startswith(("http://", "https://")):
+			remoteHost = f"https://{remoteHost}"
+
+		base = remoteHost.rstrip("/")
+		model = modelName.strip("/")
+		ref = resolvePath.strip("/")
+		filePath = filePath.lstrip("/")
+
+		return f"{base}/{model}/{ref}/{filePath}"
+
+	def _getRemoteFileSize(self, url: str) -> int:
+		"""
+		Get remote file size via HEAD, falling back to a one-byte ranged GET.
+
+		:param url: Remote URL.
+		:return: File size in bytes, 0 if unable to determine.
+		"""
+		if self.cancelRequested:
+			return 0
+
+		try:
+			# Use HEAD request with automatic redirect following.
+			# Header parsing stays inside the try so a malformed value falls through to GET.
+			response = self.session.head(url, timeout=10, allow_redirects=True)
+			response.raise_for_status()
+			contentLength = response.headers.get("Content-Length")
+			if contentLength:
+				return int(contentLength)
+		except Exception as e:
+			if not self.cancelRequested:
+				log.warning(f"Failed to get remote file size (HEAD) for {url}: {e}")
+
+		try:
+			# If HEAD doesn't work, try GET with range header to get just 1 byte.
+			# stream=True so a server that ignores Range cannot make us buffer the whole file.
+			with self.session.get(url, headers={"Range": "bytes=0-0"}, stream=True, timeout=10) as response:
+				if response.status_code == 206:  # Partial content
+					contentRange = response.headers.get("Content-Range", "")
+					if contentRange and "/" in contentRange:
+						return int(contentRange.split("/")[-1])
+		except Exception as e:
+			if not self.cancelRequested:
+				log.warning(f"Failed to get remote file size (GET) for {url}: {e}")
+
+		return 0
+
+	def _reportProgress(
+		self,
+		callback: ProgressCallback | None,
+		fileName: str,
+		downloaded: int,
+		total: int,
+		lastReported: int,
+	) -> int:
+		"""
+		Report download progress if conditions are met.
+
+		:param callback: Progress callback function.
+		:param fileName: Name of the file being downloaded.
+		:param downloaded: Bytes downloaded so far.
+		:param total: Total file size in bytes.
+		:param lastReported: Last reported download amount.
+		:return: New lastReported value.
+		"""
+		if not callback or total == 0 or self.cancelRequested:
+			return lastReported
+
+		percent = downloaded / total * 100
+
+		# Report progress every 1 MiB or 1% or when complete
+		if (
+			downloaded - lastReported >= 1_048_576  # 1 MiB
+			or abs(percent - lastReported / total * 100) >= 1.0
+			or downloaded == total
+		):
+			callback(fileName, downloaded, total, percent)
+			return downloaded
+
+		return lastReported
+
+	def downloadSingleFile(
+		self,
+		url: str,
+		localPath: str,
+		progressCallback: ProgressCallback | None = None,
+	) -> tuple[bool, str]:
+		"""
+		Download a single file with resume support and automatic redirect handling.
+
+		:param url: Remote URL to download from.
+		:param localPath: Local file path to save the downloaded file.
+		:param progressCallback: Optional callback function for progress reporting.
+		:return: Tuple of (success_flag, status_message).
+		Directory-creation and network failures are caught by the helpers called here and
+		reported through the returned tuple rather than raised.
+		:raises OSError: When inspecting an already existing local file fails.
+		"""
+		if self.cancelRequested:
+			return False, "Download cancelled"
+
+		threadId = threading.current_thread().ident or 0
+		fileName = os.path.basename(localPath)
+
+		# Create destination directory
+		success, message = self._createDestinationDirectory(localPath)
+		if not success:
+			return False, message
+
+		# Get remote file size with redirect handling
+		remoteSize = self._getRemoteFileSize(url)
+
+		if self.cancelRequested:
+			return False, "Download cancelled"
+
+		# Check if file already exists and is complete
+		success, message = self._checkExistingFile(
+			localPath,
+			remoteSize,
+			fileName,
+			progressCallback,
+			threadId,
+		)
+		if success is not None:
+			return success, message
+
+		# Attempt download with retries
+		return self._downloadWithRetries(url, localPath, fileName, threadId, progressCallback)
+
+	def _createDestinationDirectory(self, localPath: str) -> tuple[bool, str]:
+		"""
+		Create the parent directory of ``localPath`` if it doesn't exist.
+
+		:param localPath: Local file path to create directory for.
+		:return: Tuple of (success_flag, error_message); ``OSError`` is caught and reported in the message.
+		"""
+		destDir = os.path.dirname(localPath)
+		try:
+			Path(destDir).mkdir(parents=True, exist_ok=True)
+			return True, ""
+		except OSError as err:
+			return False, f"Failed to create directory {destDir}: {err}"
+
+	def _checkExistingFile(
+		self,
+		localPath: str,
+		remoteSize: int,
+		fileName: str,
+		progressCallback: ProgressCallback | None,
+		threadId: int,
+	) -> tuple[bool | None, str]:
+		"""
+		Check if file already exists and is complete.
+
+		:param localPath: Local file path to check.
+		:param remoteSize: Size of remote file in bytes.
+		:param fileName: Base name of the file for progress reporting.
+		:param progressCallback: Optional callback function for progress reporting.
+		:param threadId: Current thread identifier for logging.
+		:return: Tuple of (completion_status, status_message). None status means download should continue.
+		:raises OSError: When file operations fail.
+		"""
+		if not os.path.exists(localPath):
+			return None, ""
+
+		localSize = os.path.getsize(localPath)
+		log.debug(f"localSize: {localSize}, remoteSize: {remoteSize}")
+
+		if remoteSize > 0:
+			if localSize == remoteSize:
+				if progressCallback and not self.cancelRequested:
+					progressCallback(fileName, localSize, localSize, 100.0)
+				log.debug(f"File already complete: {localPath}")
+				return True, f"File already complete: {localPath}"
+			elif localSize > remoteSize:
+				# Local file is larger than remote, may be corrupted
+				log.warning(f"Local file larger than remote, removing: {localPath}")
+				try:
+					os.remove(localPath)
+				except OSError:
+					pass
+		else:
+			# Cannot get remote size, assume file exists if non-empty
+			if localSize > 0:
+				if progressCallback and not self.cancelRequested:
+					progressCallback(fileName, localSize, localSize, 100.0)
+				log.debug(f"File already exists (size unknown): {localPath}")
+				return True, f"File already exists: {localPath}"
+
+		return None, ""
+
+	def _downloadWithRetries(
+		self,
+		url: str,
+		localPath: str,
+		fileName: str,
+		threadId: int,
+		progressCallback: ProgressCallback | None,
+	) -> tuple[bool, str]:
+		"""
+		Attempt download with retry logic and exponential backoff.
+
+		:param url: Remote URL to download from.
+		:param localPath: Local file path to save the downloaded file.
+		:param fileName: Base name of the file for progress reporting.
+		:param threadId: Current thread identifier for logging.
+		:param progressCallback: Optional callback function for progress reporting.
+		:return: Tuple of (success_flag, status_message).
+		HTTP, request and unexpected errors raised by a download attempt are caught here,
+		turned into the status message and retried; they are not propagated to the caller.
+		The last failure message is returned once ``maxRetries`` attempts are used up.
+		"""
+		for attempt in range(self.maxRetries):
+			if self.cancelRequested:
+				return False, "Download cancelled"
+
+			log.debug(f"Downloading (attempt {attempt + 1}/{self.maxRetries}): {url}")
+
+			try:
+				success, message = self._performSingleDownload(
+					url,
+					localPath,
+					fileName,
+					threadId,
+					progressCallback,
+				)
+
+			except requests.exceptions.HTTPError as e:
+				message = self._handleHttpError(e, localPath, fileName, progressCallback, threadId)
+				if message.startswith("Download completed"):
+					return True, message
+
+			except RequestException as e:
+				if self.cancelRequested:
+					return False, "Download cancelled"
+				message = f"Request error: {str(e)}"
+
+			except Exception as e:
+				if self.cancelRequested:
+					return False, "Download cancelled"
+				message = f"Unexpected error: {str(e)}"
+				log.error(message)
+
+			else:
+				if success:
+					return True, message
+
+			if not self.cancelRequested:
+				log.debug(f"{message} – {url}")
+				if attempt < self.maxRetries - 1:
+					success = self._waitForRetry(attempt, threadId)
+					if not success:
+						return False, "Download cancelled"
+				else:
+					return False, message
+
+		return False, "Maximum retries exceeded"
+
+	def _performSingleDownload(
+		self,
+		url: str,
+		localPath: str,
+		fileName: str,
+		threadId: int,
+		progressCallback: ProgressCallback | None,
+	) -> tuple[bool, str]:
+		"""
+		Perform a single download attempt with resume support.
+
+		:param url: Remote URL to download from.
+		:param localPath: Local file path to save the downloaded file.
+		:param fileName: Base name of the file for progress reporting.
+		:param threadId: Current thread identifier for logging.
+		:param progressCallback: Optional callback function for progress reporting.
+		:return: Tuple of (success_flag, status_message).
+		:raises requests.exceptions.HTTPError: When HTTP request fails.
+		:raises requests.exceptions.RequestException: When network request fails.
+		:raises Exception: When file operations or unexpected errors occur.
+		"""
+		# Check for existing partial file
+		resumePos = self._getResumePosition(localPath, threadId)
+
+		# Get response with resume support
+		response = self._getDownloadResponse(url, resumePos, localPath, threadId)
+
+		if self.cancelRequested:
+			return False, "Download cancelled"
+
+		try:
+			# Determine total file size
+			total = self._calculateTotalSize(response, resumePos)
+
+			if total > 0:
+				log.debug(f"Total file size: {total:,} bytes")
+
+			# Download file content
+			success, message = self._downloadFileContent(
+				response,
+				localPath,
+				fileName,
+				resumePos,
+				total,
+				progressCallback,
+			)
+
+			if not success:
+				return False, message
+
+			# Verify download integrity
+			return self._verifyDownloadIntegrity(localPath, fileName, total, progressCallback, threadId)
+
+		finally:
+			response.close()
+
+	def _getResumePosition(self, localPath: str, threadId: int) -> int:
+		"""
+		Get resume position for partial download.
+
+		:param localPath: Local file path to check.
+		:param threadId: Current thread identifier for logging.
+		:return: Byte position to resume from.
+		:raises OSError: When file operations fail.
+		"""
+		resumePos = 0
+		if os.path.exists(localPath):
+			resumePos = os.path.getsize(localPath)
+			log.debug(f"Resuming from byte {resumePos}")
+		return resumePos
+
+	def _getDownloadResponse(self, url: str, resumePos: int, localPath: str, threadId: int) -> Response:
+		"""
+		Get download response with resume support and redirect handling.
+
+		:param url: Remote URL to download from.
+		:param resumePos: Byte position to resume from.
+		:param localPath: Local file path for cleanup if needed.
+		:param threadId: Current thread identifier (not used in this method).
+		:return: HTTP response object.
+		:raises requests.exceptions.HTTPError: When HTTP request fails.
+		:raises requests.exceptions.RequestException: When network request fails.
+		:raises Exception: When download is cancelled.
+		"""
+		# Set up headers for resume
+		headers = {}
+		if resumePos > 0:
+			headers["Range"] = f"bytes={resumePos}-"
+
+		# Make request with automatic redirect handling
+		response = self.session.get(
+			url,
+			headers=headers,
+			stream=True,
+			timeout=10,
+			allow_redirects=True,
+		)
+
+		# Any non-206 reply to a ranged request (even an error status) drops the partial file and restarts
+		if resumePos > 0 and response.status_code != 206:
+			log.debug("Server doesn't support resume, starting from beginning")
+			if os.path.exists(localPath):
+				try:
+					os.remove(localPath)
+				except OSError:
+					pass
+
+			if self.cancelRequested:
+				response.close()
+				raise Exception("Download cancelled")
+
+			# Make new request without range header
+			response.close()
+			response = self.session.get(url, stream=True, timeout=10, allow_redirects=True)
+
+		response.raise_for_status()
+		return response
+
+	def _calculateTotalSize(self, response: Response, resumePos: int) -> int:
+		"""
+		Calculate total file size from HTTP response headers.
+
+		:param response: HTTP response object.
+		:param resumePos: Byte position resumed from.
+		:return: Total file size in bytes.
+		:raises ValueError: When Content-Range header is malformed.
+		"""
+		if response.status_code == 206:
+			# Partial content response
+			contentRange = response.headers.get("Content-Range", "")
+			if contentRange and "/" in contentRange:
+				return int(contentRange.split("/")[-1])
+			else:
+				return int(response.headers.get("Content-Length", "0")) + resumePos
+		else:
+			return int(response.headers.get("Content-Length", "0"))
+
+	def _downloadFileContent(
+		self,
+		response,
+		localPath: str,
+		fileName: str,
+		resumePos: int,
+		total: int,
+		progressCallback: ProgressCallback | None,
+	) -> tuple[bool, str]:
+		"""
+		Download file content with progress reporting and cancellation support.
+
+		:param response: HTTP response object to read from.
+		:param localPath: Local file path to write to.
+		:param fileName: Base name of the file for progress reporting.
+		:param resumePos: Byte position resumed from.
+		:param total: Total file size in bytes.
+		:param progressCallback: Optional callback function for progress reporting.
+		:return: Tuple of (success_flag, error_message).
+		Any exception while reading the response or writing the file is caught and returned
+		as a ``"Failed to write file: ..."`` message; cancellation returns ``"Download cancelled"``.
+		"""
+		downloaded = resumePos
+		lastReported = downloaded
+		mode = "ab" if resumePos > 0 else "wb"
+
+		try:
+			with open(localPath, mode) as fh:
+				for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
+					if self.cancelRequested:
+						return False, "Download cancelled"
+
+					if chunk:  # filter out keep-alive chunks
+						fh.write(chunk)
+						downloaded += len(chunk)
+
+						if total > 0:
+							lastReported = self._reportProgress(
+								progressCallback,
+								fileName,
+								downloaded,
+								total,
+								lastReported,
+							)
+		except Exception as e:
+			return False, f"Failed to write file: {str(e)}"
+
+		return True, ""
+
+	def _verifyDownloadIntegrity(
+		self,
+		localPath: str,
+		fileName: str,
+		total: int,
+		progressCallback: ProgressCallback | None,
+		threadId: int,
+	) -> tuple[bool, str]:
+		"""
+		Verify download integrity and report final progress.
+
+		:param localPath: Local file path to verify.
+		:param fileName: Base name of the file for progress reporting.
+		:param total: Expected total file size in bytes.
+		:param progressCallback: Optional callback function for progress reporting.
+		:param threadId: Current thread identifier for logging.
+		:return: Tuple of (success_flag, status_message).
+		:raises OSError: When file operations fail.
+		"""
+		if self.cancelRequested:
+			return False, "Download cancelled"
+
+		actualSize = os.path.getsize(localPath)
+
+		if actualSize == 0:
+			return False, "Downloaded file is empty"
+
+		if total > 0 and actualSize != total:
+			return False, f"File incomplete: {actualSize}/{total} bytes downloaded"
+
+		# Final progress callback
+		if progressCallback and not self.cancelRequested:
+			progressCallback(fileName, actualSize, max(total, actualSize), 100.0)
+
+		log.debug(f"Successfully downloaded: {localPath}")
+		return True, "Download completed"
+
+	def _handleHttpError(
+		self,
+		error: requests.exceptions.HTTPError,
+		localPath: str,
+		fileName: str,
+		progressCallback: ProgressCallback | None,
+		threadId: int,
+	) -> str:
+		"""
+		Handle HTTP errors with special handling for range not satisfiable.
+
+		:param error: HTTP error exception.
+		:param localPath: Local file path to check for completion.
+		:param fileName: Base name of the file for progress reporting.
+		:param progressCallback: Optional callback function for progress reporting.
+		:param threadId: Current thread identifier for logging.
+		:return: Error message or completion status.
+		:raises OSError: When file operations fail.
+		"""
+		if error.response is not None and error.response.status_code == 416:  # Range Not Satisfiable
+			if os.path.exists(localPath):
+				actualSize = os.path.getsize(localPath)
+				if actualSize > 0:
+					log.debug(f"File appears to be complete: {localPath}")
+					if progressCallback and not self.cancelRequested:
+						progressCallback(fileName, actualSize, actualSize, 100.0)
+					return "Download completed"
+		# A requests Response is falsy for 4xx/5xx statuses, so compare against None explicitly
+		return f"HTTP {error.response.status_code if error.response is not None else 'Error'}: {str(error)}"
+
+	def _waitForRetry(self, attempt: int, threadId: int) -> bool:
+		"""
+		Wait for retry with exponential backoff and cancellation support.
+
+		:param attempt: Current retry attempt number.
+		:param threadId: Current thread identifier for logging.
+		:return: True if wait completed, False if cancelled.
+		"""
+		wait = BACKOFF_BASE**attempt
+		log.debug(f"Waiting {wait}s before retry...")
+
+		for _ in range(wait):
+			if self.cancelRequested:
+				return False
+			time.sleep(1)
+
+		return True
+
+	def downloadModelsMultithreaded(
+		self,
+		modelsDir: str = WritePaths.modelsDir,
+		modelName: str = "Xenova/vit-gpt2-image-captioning",
+		filesToDownload: list[str] | None = None,
+		resolvePath: str = "/resolve/main",
+		progressCallback: ProgressCallback | None = None,
+	) -> tuple[list[str], list[str]]:
+		"""
+		Download multiple model assets concurrently.
+
+		:param modelsDir: Base *models* directory.
+		:param modelName: Repository name; ``ValueError`` is raised if it or ``remoteHost`` is empty.
+		:param filesToDownload: Explicit file list; None or an empty list uses the built-in defaults.
+		:param resolvePath: Branch / ref path.
+		:param progressCallback: Optional progress callback.
+		:return: (successful_paths, failed_paths) tuple of the repo-relative paths that were requested.
+		"""
+		if not self.remoteHost or not modelName:
+			raise ValueError("remoteHost and modelName cannot be empty")
+
+		filesToDownload = filesToDownload or [
+			"onnx/encoder_model_quantized.onnx",
+			"onnx/decoder_model_merged_quantized.onnx",
+			"config.json",
+			"vocab.json",
+			"preprocessor_config.json",
+		]
+
+		if not filesToDownload:  # unreachable: the fallback list above is never empty
+			raise ValueError("filesToDownload cannot be empty")
+
+		log.debug(
+			f"Starting download of {len(filesToDownload)} files for model: {modelName}\n"
+			f"Remote host: {self.remoteHost}\nMax workers: {self.maxWorkers}",
+		)
+
+		localModelDir = os.path.join(modelsDir, modelName)
+		successful: list[str] = []
+		failed: list[str] = []
+
+		with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor:
+			futures = []
+
+			for path in filesToDownload:
+				if self.cancelRequested:
+					break
+
+				future = executor.submit(
+					self.downloadSingleFile,
+					self.constructDownloadUrl(modelName, path, resolvePath),
+					os.path.join(localModelDir, path),
+					progressCallback,
+				)
+				futures.append((future, path))
+
+				# Track active futures for cancellation
+				with self.downloadLock:
+					self.activeFutures.add(future)
+
+			# Process completed futures
+			for future, filePath in futures:
+				if self.cancelRequested:
+					# Cancel futures that have not started; leaving the ``with`` block still waits for running ones
+					with self.downloadLock:
+						for f, _ in futures:
+							if not f.done():
+								f.cancel()
+					break
+
+				# Remove from active futures tracking
+				with self.downloadLock:
+					self.activeFutures.discard(future)
+
+				try:
+					ok, msg = future.result()
+					if ok:
+						successful.append(filePath)
+						log.debug(f"successful {filePath=}")
+					else:
+						failed.append(filePath)
+						log.debug(f"failed: {filePath} - {msg}")
+				except Exception as err:
+					failed.append(filePath)
+					log.debug(f"failed: {filePath} – {err}")
+
+		# Summary
+		if not self.cancelRequested:
+			log.debug(f"Total: {len(filesToDownload)}")
+			log.debug(f"Successful: {len(successful)}")
+			log.debug(f"Failed: {len(failed)}")
+			log.debug(f"\nLocal model directory: {localModelDir}")
+		else:
+			log.debug("Download cancelled by user")
+
+		return successful, failed
+
+	def __del__(self):
+		"""Clean up the session when the downloader is destroyed."""
+		if hasattr(self, "session"):
+			self.session.close()
diff --git a/source/config/__init__.py b/source/config/__init__.py
index 93c66f28c61..0fbd3cdb98e 100644
--- a/source/config/__init__.py
+++ b/source/config/__init__.py
@@ -416,6 +416,7 @@ class ConfigManager(object):
 		"development",
 		"addonStore",
 		"remote",
+		"automatedImageDescriptions",
 		"math",
 		"screenCurtain",
 	}
diff --git a/source/config/configSpec.py b/source/config/configSpec.py
index da4c436b58d..3696b6fc169 100644
--- a/source/config/configSpec.py
+++ b/source/config/configSpec.py
@@ -97,7 +97,7 @@
 	reportLiveRegions = featureFlag(optionsEnum="BoolFlag", behaviorOfDefault="enabled")
 	fontFormattingDisplay = featureFlag(optionsEnum="FontFormattingBrailleModeFlag", behaviorOfDefault="LIBLOUIS")
 	[[auto]]
-		excludedDisplays = string_list(default=list("dotPad"))
+		excludedDisplays = string_list(default=list("dotPad"))
 
 	# Braille display driver settings
 	[[__many__]]
@@ -373,23 +373,23 @@
 [math]
 	[[speech]]
 		# LearningDisability, Blindness, LowVision
-		impairment = string(default="Blindness")
+		impairment = string(default="Blindness")
 		# any known language code and sub-code -- could be en-uk, etc
-		language = string(default="Auto")
+		language = string(default="Auto")
 		# Any known speech style (falls back to ClearSpeak)
-		speechStyle = string(default="ClearSpeak")
+		speechStyle = string(default="ClearSpeak")
 		# Terse, Medium, Verbose
-		verbosity = string(default="Medium")
+		verbosity = string(default="Medium")
 		# Change from text speech rate (%)
-		mathRate = integer(default=100)
+		mathRate = integer(default=100)
 		# Change from normal pause length (%)
-		pauseFactor = integer(default=100)
+		pauseFactor = integer(default=100)
 		# make a sound when starting/ending math speech -- None, Beep
-		speechSound = string(default="None")
+		speechSound = string(default="None")
 		# NOTE: not currently working in MathCAT
-		subjectArea = string(default="General")
+		subjectArea = string(default="General")
 		# SpellOut (H 2 0), AsCompound (Water) -- not implemented, Off (H sub 2 O)
-		chemistry = string(default="SpellOut")
+		chemistry = string(default="SpellOut")
 		# Verbose, Brief, SuperBrief
 		mathSpeak = string(default="Verbose")
 
@@ -529,6 +529,10 @@
 		# Auto, '.', ',', Custom
 		decimalSeparator = string(default="Auto")
 
+[automatedImageDescriptions]
+	enable = boolean(default=false)
+	defaultModel = string(default="Xenova/vit-gpt2-image-captioning")
+
 [screenCurtain]
 	enabled = boolean(default=false)
 	warnOnLoad = boolean(default=true)
diff --git a/source/core.py b/source/core.py
index a6ff433c74f..113b88ac5bd 100644
--- a/source/core.py
+++ b/source/core.py
@@ -555,6 +555,7 @@ def _handleNVDAModuleCleanupBeforeGUIExit():
 	import globalPluginHandler
 	import watchdog
 	import _remoteClient
+	import _localCaptioner
 
 	try:
 		import updateCheck
@@ -573,6 +574,8 @@ def _handleNVDAModuleCleanupBeforeGUIExit():
 	# Terminating remoteClient causes it to clean up its menus, so do it here while they still exist
 	_terminate(_remoteClient)
 
+	_terminate(_localCaptioner)
+
 
 def _initializeObjectCaches():
 	"""
@@ -916,6 +919,10 @@ def main():
 
 	_remoteClient.initialize()
 
+	import _localCaptioner
+
+	_localCaptioner.initialize()
+
 	if globalVars.appArgs.install or globalVars.appArgs.installSilent:
 		import gui.installerGui
 
diff --git a/source/globalCommands.py b/source/globalCommands.py
index 92d009eb518..25984a1ba35 100755
--- a/source/globalCommands.py
+++ b/source/globalCommands.py
@@ -72,6 +72,7 @@
 import synthDriverHandler
 from utils.displayString import DisplayStringEnum
 import _remoteClient
+import _localCaptioner
 
 #: Script category for text review commands.
 # Translators: The name of a category of NVDA commands.
@@ -124,6 +125,9 @@
 #: Script category for Remote Access commands.
 # Translators: The name of a category of NVDA commands.
 SCRCAT_REMOTE = pgettext("remote", "Remote Access")
+#: Script category for image description commands.
+# Translators: The name of a category of NVDA commands.
+SCRCAT_IMAGE_DESC = pgettext("imageDesc", "Image Descriptions")
 
 # Translators: Reported when there are no settings to configure in synth settings ring
 # (example: when there is no setting for language).
@@ -3517,6 +3521,15 @@ def script_activateDocumentFormattingDialog(self, gesture):
 	def script_activateRemoteAccessSettings(self, gesture: "inputCore.InputGesture"):
 		wx.CallAfter(gui.mainFrame.onRemoteAccessSettingsCommand, None)
 
+	@script(
+		# Translators: Input help mode message for the command that opens the AI image descriptions settings.
+		description=pgettext("imageDesc", "Shows the AI image descriptions settings"),
+		category=SCRCAT_CONFIG,
+	)
+	@gui.blockAction.when(gui.blockAction.Context.MODAL_DIALOG_OPEN)
+	def script_activateLocalCaptionerSettings(self, gesture: "inputCore.InputGesture") -> None:
+		wx.CallAfter(gui.mainFrame.onLocalCaptionerSettingsCommand, None)
+
 	@script(
 		# Translators: Input help mode message for go to Add-on Store settings command.
 		description=_("Shows NVDA's Add-on Store settings"),
@@ -5143,6 +5156,30 @@ def script_repeatLastSpokenInformation(self, gesture: "inputCore.InputGesture")
 			title = _("Last spoken information")
 			ui.browseableMessage(lastSpeechText, title, copyButton=True, closeButton=True)
 
+	@script(
+		description=pgettext(
+			"imageDesc",
+			# Translators: Input help mode message for the command that describes the navigator object with AI.
+			"Get an AI-generated image description of the navigator object.",
+		),
+		category=SCRCAT_IMAGE_DESC,
+		gesture="kb:NVDA+g",
+	)
+	@gui.blockAction.when(gui.blockAction.Context.SCREEN_CURTAIN)
+	def script_runCaption(self, gesture: "inputCore.InputGesture") -> None:
+		_localCaptioner._localCaptioner.runCaption(gesture)
+
+	@script(
+		description=pgettext(
+			"imageDesc",
+			# Translators: Input help mode message for the command that loads or unloads the image captioner.
+			"Load or unload the image captioner",
+		),
+		category=SCRCAT_IMAGE_DESC,
+	)
+	def script_toggleImageCaptioning(self, gesture: "inputCore.InputGesture") -> None:
+		_localCaptioner._localCaptioner.toggleImageCaptioning(gesture)
+
 
 #: The single global commands instance.
 #: @type: L{GlobalCommands}
diff --git a/source/gui/__init__.py b/source/gui/__init__.py
index 023e787e177..5391ccce5d3 100644
--- a/source/gui/__init__.py
+++ b/source/gui/__init__.py
@@ -56,6 +56,7 @@
 	GeneralSettingsPanel,
 	InputCompositionPanel,
 	KeyboardSettingsPanel,
+	LocalCaptionerSettingsPanel,
 	MouseSettingsPanel,
 	MultiCategorySettingsDialog,
 	NVDASettingsDialog,
@@ -387,6 +388,10 @@ def onUwpOcrCommand(self, evt):
 	def onRemoteAccessSettingsCommand(self, evt):
 		self.popupSettingsDialog(NVDASettingsDialog, RemoteSettingsPanel)
 
+	@blockAction.when(blockAction.Context.SECURE_MODE)
+	def onLocalCaptionerSettingsCommand(self, evt: wx.CommandEvent | None) -> None:
+		self.popupSettingsDialog(NVDASettingsDialog, LocalCaptionerSettingsPanel)
+
 	@blockAction.when(blockAction.Context.SECURE_MODE)
 	def onAdvancedSettingsCommand(self, evt: wx.CommandEvent):
 		self.popupSettingsDialog(NVDASettingsDialog, AdvancedPanel)
diff --git a/source/gui/_localCaptioner/__init__.py b/source/gui/_localCaptioner/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/source/gui/_localCaptioner/messageDialogs.py b/source/gui/_localCaptioner/messageDialogs.py
new file mode 100644
index 00000000000..c7a3e7c32cd
--- /dev/null
+++ b/source/gui/_localCaptioner/messageDialogs.py
@@ -0,0 +1,198 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+
+from gui.message import MessageDialog, DefaultButton, ReturnCode, DialogType
+import gui
+from _localCaptioner.modelDownloader import ModelDownloader, ProgressCallback
+import threading
+from threading import Thread
+import wx
+import ui
+import _localCaptioner
+
+
+class ImageDescDownloader:
+	"""Runs the image captioning model download and the dialogs that confirm, track and report it."""
+
+	_downloadThread: Thread | None = None
+	isOpening: bool = False
+	# Yes/No button pair shared by the confirmation dialogs; "No" is also the fallback action.
+	_YES_NO_BUTTONS = (
+		DefaultButton.YES.value._replace(defaultFocus=True, fallbackAction=False),
+		DefaultButton.NO.value._replace(defaultFocus=False, fallbackAction=True),
+	)
+
+	def __init__(self):
+		self.downloadDict: dict[str, tuple[int, int]] = {}
+		self.modelDownloader: ModelDownloader | None = None
+		self._shouldCancel = False
+		self._progressDialog: wx.ProgressDialog | None = None
+		self.filesToDownload = [
+			"onnx/encoder_model_quantized.onnx",
+			"onnx/decoder_model_merged_quantized.onnx",
+			"config.json",
+			"vocab.json",
+			"preprocessor_config.json",
+		]
+
+	def onDownload(self, progressCallback: ProgressCallback) -> None:
+		"""Download every model file (blocking), then queue the result dialog on the GUI thread."""
+		self.modelDownloader = ModelDownloader()
+		_successful, failed = self.modelDownloader.downloadModelsMultithreaded(
+			filesToDownload=self.filesToDownload,
+			progressCallback=progressCallback,
+		)
+		wx.CallAfter(self.openFailDialog if failed else self.openSuccessDialog)
+
+	def openSuccessDialog(self) -> None:
+		"""Report a completed download and load the captioner once the user acknowledges it."""
+		if self._shouldCancel:
+			# A cancelled run can finish without recorded failures; do not report it as a success.
+			return
+		self._stopped()
+
+		dialog = MessageDialog(
+			parent=None,
+			# Translators: title of dialog when download successfully
+			title=pgettext("imageDesc", "Download successful"),
+			# Translators: label of dialog when downloading image captioning
+			message=pgettext("imageDesc", "Image captioning installed successfully."),
+			dialogType=DialogType.STANDARD,
+			buttons=(DefaultButton.OK.value._replace(defaultFocus=True, fallbackAction=True),),
+		)
+
+		if dialog.ShowModal() == ReturnCode.OK and not _localCaptioner.isModelLoaded():
+			_localCaptioner.toggleImageCaptioning()
+
+	def openFailDialog(self) -> None:
+		"""Offer a retry after a failed download; stays silent when the user cancelled."""
+		if self._shouldCancel:
+			return
+
+		dialog = MessageDialog(
+			parent=None,
+			# Translators: title of dialog when fail to download
+			title=pgettext("imageDesc", "Download failed"),
+			# Translators: label of dialog when fail to download image captioning
+			message=pgettext("imageDesc", "Image captioning download failed. Would you like to retry?"),
+			dialogType=DialogType.WARNING,
+			buttons=self._YES_NO_BUTTONS,
+		)
+
+		if dialog.ShowModal() == ReturnCode.YES:
+			self.doDownload()
+		else:
+			self._stopped()
+
+	def openDownloadDialog(self) -> None:
+		"""Ask to install the model, then show the progress dialog and start the download."""
+		if ImageDescDownloader._downloadThread is not None and ImageDescDownloader._downloadThread.is_alive():
+			# Translators: message when image captioning is still downloading
+			ui.message(pgettext("imageDesc", "image captioning is still downloading, please wait..."))
+			return
+		if ImageDescDownloader.isOpening:
+			return
+
+		dialog = MessageDialog(
+			parent=None,
+			# Translators: title of dialog when downloading Image captioning
+			title=pgettext("imageDesc", "Confirm download"),
+			message=pgettext(
+				"imageDesc",
+				# Translators: label of dialog when downloading image captioning
+				"Image captioning not installed. Would you like to install (235 MB)?",
+			),
+			dialogType=DialogType.WARNING,
+			buttons=self._YES_NO_BUTTONS,
+		)
+		ImageDescDownloader.isOpening = True
+		if dialog.ShowModal() == ReturnCode.YES:
+			self._progressDialog = wx.ProgressDialog(
+				# Translators: The title of the dialog displayed while downloading image descriptioner.
+				pgettext("imageDesc", "Downloading Image Descriptioner"),
+				# Translators: The progress message indicating that a connection is being established.
+				pgettext("imageDesc", "Connecting"),
+				style=wx.PD_CAN_ABORT | wx.PD_ELAPSED_TIME | wx.PD_REMAINING_TIME | wx.PD_AUTO_HIDE,
+				parent=gui.mainFrame,
+			)
+			self.doDownload()
+		else:
+			ImageDescDownloader.isOpening = False
+
+	def doDownload(self):
+		"""Start the download on a background thread that feeds the progress dialog."""
+		self._shouldCancel = False
+
+		def progressCallback(
+			fileName: str, downloadedBytes: int, totalBytes: int, _percentage: float
+		) -> None:
+			"""Aggregate per-file progress; runs on download threads, so wx is reached via CallAfter."""
+			self.downloadDict[fileName] = (downloadedBytes, totalBytes)
+			# Sum over a snapshot: other workers may add entries while the totals are computed.
+			progress = tuple(self.downloadDict.values())
+			totalSum = sum(t for _, t in progress)
+			ratio = sum(d for d, _ in progress) / totalSum if totalSum > 0 else 0.0
+			# update progress when downloading all files to prevent premature stop
+			if len(progress) == len(self.filesToDownload) and not self._shouldCancel:
+				wx.CallAfter(self._updateProgress, int(ratio * 100))
+
+		ImageDescDownloader._downloadThread = threading.Thread(
+			target=self.onDownload,
+			name="ModelDownloadMainThread",
+			daemon=False,
+			args=(progressCallback,),
+		)
+		ImageDescDownloader._downloadThread.start()
+
+	def _updateProgress(self, totalProgress: int) -> None:
+		"""Refresh the progress dialog on the GUI thread and honour its Cancel button."""
+		if self._shouldCancel or self._progressDialog is None:
+			return
+		# Translators: The progress message indicating that a download is in progress.
+		cont, _skip = self._progressDialog.Update(totalProgress, pgettext("imageDesc", "downloading"))
+		if not cont:
+			self._shouldCancel = True
+			self._stopped()
+
+	def _stopped(self):
+		"""Cancel any running download and dispose of the progress dialog; safe to call repeatedly."""
+		if self.modelDownloader is not None:
+			self.modelDownloader.requestCancel()
+		ImageDescDownloader._downloadThread = None
+		if self._progressDialog is not None:
+			self._progressDialog.Hide()
+			self._progressDialog.Destroy()
+			self._progressDialog = None
+		ImageDescDownloader.isOpening = False
+
+
+def openEnableOnceDialog() -> None:
+	"""Warn that AI image descriptions are disabled and offer to load the captioner for this session."""
+	yesButton = DefaultButton.YES.value._replace(defaultFocus=True, fallbackAction=False)
+	noButton = DefaultButton.NO.value._replace(defaultFocus=False, fallbackAction=True)
+
+	dialog = MessageDialog(
+		parent=None,
+		# Translators: title of dialog when enable image desc
+		title=pgettext("imageDesc", "Enable AI image descriptions"),
+		message=pgettext(
+			"imageDesc",
+			# Translators: label of dialog when enable image desc
+			"AI image descriptions are currently disabled."
+			"\n\n"
+			"Warning: AI image descriptions are experimental. "
+			"Do not use this feature in circumstances where inaccurate descriptions could cause harm."
+			"\n\n"
+			"Would you like to temporarily enable AI image descriptions now?",
+		),
+		dialogType=DialogType.STANDARD,
+		buttons=(yesButton, noButton),
+	)
+
+	userAccepted = dialog.ShowModal() == ReturnCode.YES
+
+	# load image desc in this session
+	if userAccepted and not _localCaptioner.isModelLoaded():
+		_localCaptioner.toggleImageCaptioning()
diff --git a/source/gui/blockAction.py b/source/gui/blockAction.py
index 008ade69abf..21fbe4ab8f3 100644
--- a/source/gui/blockAction.py
+++ b/source/gui/blockAction.py
@@ -39,6 +39,14 @@ def _isRemoteAccessDisabled() -> bool:
 	return not remoteRunning()
 
 
+def _isScreenCurtainEnabled() -> bool:
+	"""Return whether a screen curtain instance exists and is currently **enabled**."""
+	# Import late to avoid circular import
+	from screenCurtain import screenCurtain
+
+	return screenCurtain is not None and screenCurtain.enabled
+
+
 @dataclass
 class _Context:
 	blockActionIf: Callable[[], bool]
@@ -86,6 +94,11 @@ class Context(_Context, Enum):
 		# Translators: Reported when an action cannot be performed because Remote Access functionality is disabled.
 		pgettext("remote", "Action unavailable when Remote Access is disabled"),
 	)
+	SCREEN_CURTAIN = (
+		lambda: _isScreenCurtainEnabled(),
+		# Translators: Reported when an action cannot be performed because screen curtain is enabled.
+		_("Action unavailable while screen curtain is enabled"),
+	)
 
 
 def when(*contexts: Context):
diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py
index 2370134380a..507504d9ef5 100644
--- a/source/gui/settingsDialogs.py
+++ b/source/gui/settingsDialogs.py
@@ -4027,6 +4027,53 @@ def onSave(self):
 				_remoteClient.terminate()
 
 
+class LocalCaptionerSettingsPanel(SettingsPanel):
+	"""NVDA settings category for the experimental AI image descriptions feature."""
+
+	# Translators: This is the label for the local captioner settings panel.
+	title = pgettext("imageDesc", "AI Image Descriptions")
+	helpId = "LocalCaptionerSettings"
+	panelDescription = pgettext(
+		"imageDesc",
+		# Translators: This is a label appearing on the AI Image Descriptions settings panel.
+		"Warning: AI image descriptions are experimental. "
+		"Do not use this feature in circumstances where inaccurate descriptions could cause harm.",
+	)
+
+	def makeSettings(self, settingsSizer: wx.BoxSizer):
+		"""Build the panel: the experimental-feature warning followed by the enable checkbox.
+
+		:param settingsSizer: The sizer that receives the panel's controls.
+		"""
+
+		helper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer)
+
+		warningText = wx.StaticText(self, label=self.panelDescription)
+		self.windowText = helper.addItem(warningText)
+		# Wrap the warning at the scaled panel description width.
+		self.windowText.Wrap(self.scaleSize(PANEL_DESCRIPTION_WIDTH))
+
+		# Translators: A configuration in settings dialog.
+		enableCheckBox = wx.CheckBox(self, label=pgettext("imageDesc", "Enable image captioner"))
+		self.enable = helper.addItem(enableCheckBox)
+		self.enable.SetValue(config.conf["automatedImageDescriptions"]["enable"])
+		self.bindHelpEvent("LocalCaptionToggle", self.enable)
+
+	def onSave(self) -> None:
+		"""Store the checkbox state, loading or unloading the captioner when the setting changed."""
+		wantEnabled = self.enable.GetValue()
+		wasEnabled = config.conf["automatedImageDescriptions"]["enable"]
+
+		if wantEnabled != wasEnabled:
+			import _localCaptioner
+
+			# Only toggle when the loaded state does not already match the requested one.
+			if _localCaptioner.isModelLoaded() != wantEnabled:
+				_localCaptioner.toggleImageCaptioning()
+
+		config.conf["automatedImageDescriptions"]["enable"] = wantEnabled
+
+
 class TouchInteractionPanel(SettingsPanel):
 	# Translators: This is the label for the touch interaction settings panel.
 	title = _("Touch Interaction")
@@ -6089,6 +6136,7 @@ class NVDASettingsDialog(MultiCategorySettingsDialog):
 		DocumentNavigationPanel,
 		MathSettingsPanel,
 		RemoteSettingsPanel,
+		LocalCaptionerSettingsPanel,
 	]
 	# In secure mode, add-on update is disabled, so AddonStorePanel should not appear since it only contains
 	# add-on update related controls.
@@ -6117,6 +6165,7 @@ def _doOnCategoryChange(self):
 			or isinstance(self.currentCategory, GeneralSettingsPanel)
 			or isinstance(self.currentCategory, AddonStorePanel)
 			or isinstance(self.currentCategory, RemoteSettingsPanel)
+			or isinstance(self.currentCategory, LocalCaptionerSettingsPanel)
 			or isinstance(self.currentCategory, MathSettingsPanel)
 			or isinstance(self.currentCategory, PrivacyAndSecuritySettingsPanel)
 		):
diff --git a/source/setup.py b/source/setup.py
index d7cb15b0029..bba5f0dbcc3 100755
--- a/source/setup.py
+++ b/source/setup.py
@@ -213,8 +213,6 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]:
 			# winxptheme is optionally used by wx.lib.agw.aui.
 			# We don't need this.
 			"winxptheme",
-			# numpy is an optional dependency of comtypes but we don't require it.
-			"numpy",
 			# multiprocessing isn't going to work in a frozen environment
 			"multiprocessing",
 			"concurrent.futures.process",
@@ -246,6 +244,8 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]:
 			"mdx_truly_sane_lists",
 			"mdx_gh_links",
 			"pymdownx",
+			# Required for local image captioning
+			"numpy",
 		],
 		"includes": [
 			"nvdaBuiltin",
@@ -253,6 +253,9 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]:
 			"bisect",
 			# robotremoteserver (for system tests) depends on xmlrpc.server
 			"xmlrpc.server",
+			# required for import numpy without error
+			"numpy._core._exceptions",
+			"numpy._core._multiarray_umath",
 		],
 	},
 	data_files=[
diff --git a/tests/system/libraries/SystemTestSpy/configManager.py b/tests/system/libraries/SystemTestSpy/configManager.py
index fda1830f67d..ae9286af9ea 100644
--- a/tests/system/libraries/SystemTestSpy/configManager.py
+++ b/tests/system/libraries/SystemTestSpy/configManager.py
@@ -105,6 +105,9 @@ def setupProfile(
 		_pJoin(repoRoot, "tests", "system", "nvdaSettingsFiles", settingsFileName),
 		_pJoin(stagingDir, "nvdaProfile", "nvda.ini"),
 	)
+	if _shouldGenerateMockModel(_pJoin(stagingDir, "nvdaProfile", "nvda.ini")):
+		_configModels(_pJoin(stagingDir, "nvdaProfile", "models", "mock", "vit-gpt2-image-captioning"))
+
 	if gesturesFileName is not None:
 		opSys.copy_file(
 			# Despite duplication, specify full paths for clarity.
@@ -128,3 +131,26 @@ def teardownProfile(stagingDir: str):
 		_pJoin(stagingDir, "nvdaProfile"),
 		recursive=True,
 	)
+
+
+def _configModels(modelsDirectory: str) -> None:
+	"""Write the mock image captioning model files into ``modelsDirectory``."""
+	from .mockModels import MockVisionEncoderDecoderGenerator
+
+	MockVisionEncoderDecoderGenerator(randomSeed=8).generateAllFiles(modelsDirectory)
+
+
+def _shouldGenerateMockModel(iniPath: str) -> bool:
+	"""Report whether the NVDA ini file declares an ``[automatedImageDescriptions]`` section.
+
+	:param iniPath: Path of the ``nvda.ini`` file to inspect.
+	:return: ``True`` if the section header is present (compared case-insensitively), else ``False``.
+	"""
+	with open(iniPath, "r", encoding="utf-8") as f:
+		for line in f:
+			# Only a line that is exactly the section header (ignoring surrounding whitespace) counts.
+			if line.strip().lower() == "[automatedimagedescriptions]":
+				return True
+
+	# Return an explicit False: falling off the end would yield None despite the bool annotation.
+	return False
diff --git a/tests/system/libraries/SystemTestSpy/mockModels.py b/tests/system/libraries/SystemTestSpy/mockModels.py
new file mode 100644
index 00000000000..896b3901fb2
--- /dev/null
+++ b/tests/system/libraries/SystemTestSpy/mockModels.py
@@ -0,0 +1,793 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later.
+# For more details see: https://www.gnu.org/licenses/gpl-2.0.html
+"""
+Mock Vision-Encoder-Decoder Model Generator
+
+This module provides a class to generate mock ONNX models and configuration files
+for a Vision-Encoder-Decoder model (ViT-GPT2 style) used for image captioning.
+The generated files can be used for testing and development purposes.
+"""
+
+import os
+import json
+from pathlib import Path
+from typing import Any
+
+import numpy as np
+import onnx
+from onnx import helper, TensorProto, numpy_helper
+
+
+class MockVisionEncoderDecoderGenerator:
+	"""
+	A class to generate mock ONNX models and configuration files for a
+	Vision-Encoder-Decoder model architecture.
+
+	This generator creates:
+	- onnx/encoder_model_quantized.onnx: Vision Transformer encoder
+	- onnx/decoder_model_merged_quantized.onnx: GPT-2 style decoder
+	- config.json: Model configuration
+	- vocab.json: Vocabulary mapping
+	"""
+
+	def __init__(self, randomSeed: int = 8) -> None:
+		"""
+		Initialize the mock model generator.
+
+		:param randomSeed: Random seed for reproducible weight generation. Defaults to 8.
+		"""
+		self.randomSeed = randomSeed
+		self._setRandomSeed()
+
+		# Model hyperparameters
+		self.vocab_size = 100
+		self.hidden_size = 64
+		self.n_layers = 12
+		self.image_size = 224
+		self.patch_size = 16
+		self.num_channels = 3
+
+		# Derived parameters: patches per image when it is tiled with patch_size x patch_size squares
+		self.num_patches = (self.image_size // self.patch_size) ** 2
+
+	def _setRandomSeed(self) -> None:
+		"""Seed NumPy's global random generator with ``self.randomSeed`` for reproducible results."""
+		np.random.seed(self.randomSeed)
+
+	def generateAllFiles(self, outputDir: str) -> None:
+		"""
+		Generate the mock encoder, decoder, config and vocabulary files in the specified directory.
+
+		:param outputDir: Target directory for the model files. It is created if it doesn't exist.
+		"""
+		outputPath = Path(outputDir)
+		outputPath.mkdir(parents=True, exist_ok=True)
+
+		# Create onnx subdirectory
+		onnxDir = outputPath / "onnx"
+		onnxDir.mkdir(exist_ok=True)
+
+		# Generate all components (os.path.join hands each helper a plain str path)
+		self._generateEncoderModel(os.path.join(onnxDir, "encoder_model_quantized.onnx"))
+		self._generateDecoderModel(os.path.join(onnxDir, "decoder_model_merged_quantized.onnx"))
+		self._generateConfigFile(os.path.join(outputPath, "config.json"))
+		self._generateVocabFile(os.path.join(outputPath, "vocab.json"))
+
+	def _generateEncoderModel(self, outputPath: str | Path) -> None:
+		"""
+		Generate the Vision Transformer encoder ONNX model.
+
+		This creates a simplified ViT encoder that performs patch embedding
+		using convolution followed by reshaping operations.
+
+		:param outputPath: Output path for the encoder ONNX file.
+		"""
+		# Define input and output specifications
+		pixelValues = helper.make_tensor_value_info(
+			"pixelValues",
+			TensorProto.FLOAT,
+			["batch", self.num_channels, self.image_size, self.image_size],
+		)
+
+		patchEmbeds = helper.make_tensor_value_info(
+			"patchEmbeds",
+			TensorProto.FLOAT,
+			["batch", self.num_patches, self.hidden_size],
+		)
+
+		# Generate random but reproducible weights for patch embedding
+		convWeights = np.random.randn(
+			self.hidden_size,
+			self.num_channels,
+			self.patch_size,
+			self.patch_size,
+		).astype(np.float32)
+
+		convBias = np.zeros(self.hidden_size, dtype=np.float32)
+
+		# Create initializers
+		weightInit = numpy_helper.from_array(convWeights, "convWeights")
+		biasInit = numpy_helper.from_array(convBias, "convBias")
+
+		# Shape constant for reshaping; the leading 0 makes ONNX Reshape keep the input's batch dimension
+		targetShape = np.array([0, self.num_patches, self.hidden_size], dtype=np.int64)
+		shapeInit = numpy_helper.from_array(targetShape, "targetShape")
+
+		# Define computation nodes
+		nodes = [
+			# Patch embedding using convolution
+			helper.make_node(
+				"Conv",
+				inputs=["pixelValues", "convWeights", "convBias"],
+				outputs=["conv_output"],
+				kernel_shape=[self.patch_size, self.patch_size],
+				strides=[self.patch_size, self.patch_size],
+			),
+			# Transpose to get correct dimension order
+			# From [batch, hidden_size, patch_h, patch_w] to [batch, patch_h, patch_w, hidden_size]
+			helper.make_node(
+				"Transpose",
+				inputs=["conv_output"],
+				outputs=["transposed_output"],
+				perm=[0, 2, 3, 1],
+			),
+			# Reshape to flatten patches
+			# From [batch, patch_h, patch_w, hidden_size] to [batch, num_patches, hidden_size]
+			helper.make_node(
+				"Reshape",
+				inputs=["transposed_output", "targetShape"],
+				outputs=["patchEmbeds"],
+			),
+		]
+
+		# Create and save the model
+		graph = helper.make_graph(
+			nodes=nodes,
+			name="VisionTransformerEncoder",
+			inputs=[pixelValues],
+			outputs=[patchEmbeds],
+			initializer=[weightInit, biasInit, shapeInit],
+		)
+
+		model = helper.make_model(graph, producer_name="mock-vit-encoder")
+		model.opset_import[0].version = 13
+		model.ir_version = 10
+
+		onnx.save(model, str(outputPath))
+
+	def _generateDecoderModel(self, outputPath: str | Path) -> None:
+		"""
+		Generate the GPT-2 style decoder ONNX model.
+
+		This creates a simplified decoder that accepts multiple inputs including
+		token IDs, encoder hidden states, cache flags, and past key-value pairs.
+
+		:param outputPath: Output path for the decoder ONNX file.
+		"""
+		# Generate random weights from NumPy's global random state (seeded in __init__)
+		embeddingWeights = np.random.randn(
+			self.vocab_size,
+			self.hidden_size,
+		).astype(np.float32)
+
+		projectionWeights = np.random.randn(
+			self.hidden_size,
+			self.vocab_size,
+		).astype(np.float32)
+
+		# Create weight initializers
+		embInit = numpy_helper.from_array(embeddingWeights, "embeddingWeights")
+		projInit = numpy_helper.from_array(projectionWeights, "projectionWeights")
+
+		# Define all input specifications
+		inputs = self._createDecoderInputs()
+
+		# Define output specification
+		outputs = [
+			helper.make_tensor_value_info(
+				"logits",
+				TensorProto.FLOAT,
+				["batch", "seq", self.vocab_size],
+			),
+		]
+
+		# Create computation nodes
+		nodes = self._createDecoderNodes()
+
+		# Create shape and scaling constants
+		shapeConstants = self._createDecoderConstants()
+
+		# Combine all initializers
+		initializers = [embInit, projInit] + shapeConstants
+
+		# Create and save the model
+		graph = helper.make_graph(
+			nodes=nodes,
+			name="GPT2DecoderWithCache",
+			inputs=inputs,
+			outputs=outputs,
+			initializer=initializers,
+		)
+
+		model = helper.make_model(graph, producer_name="mock-gpt2-decoder")
+		model.opset_import[0].version = 13
+		model.ir_version = 10
+
+		onnx.save(model, str(outputPath))
+
+	def _createDecoderInputs(self) -> list:
+		"""
+		Create input specifications for the decoder model.
+
+		:return: Value infos for the three primary inputs plus a past key/value pair per layer.
+		"""
+		inputs = []
+
+		# Primary inputs
+		inputs.extend(
+			[
+				helper.make_tensor_value_info(
+					"input_ids",
+					TensorProto.INT64,
+					["batch", "seq"],
+				),
+				helper.make_tensor_value_info(
+					"encoder_hidden_states",
+					TensorProto.FLOAT,
+					["batch", "enc_seq_len", self.hidden_size],
+				),
+				helper.make_tensor_value_info(
+					"use_cache_branch",
+					TensorProto.BOOL,
+					["batch"],
+				),
+			],
+		)
+
+		# Past key-value cache inputs for each layer
+		for layerIdx in range(self.n_layers):
+			inputs.extend(
+				[
+					helper.make_tensor_value_info(
+						f"past_key_values.{layerIdx}.key",
+						TensorProto.FLOAT,
+						["batch", "num_heads", "past_seq_len", self.hidden_size],
+					),
+					helper.make_tensor_value_info(
+						f"past_key_values.{layerIdx}.value",
+						TensorProto.FLOAT,
+						["batch", "num_heads", "past_seq_len", self.hidden_size],
+					),
+				],
+			)
+
+		return inputs
+
+	def _createDecoderNodes(self) -> list:
+		"""
+		Create computation nodes for the decoder model.
+
+		:return: ONNX nodes defining the decoder computation, in the order they are assembled here.
+		"""
+		nodes = []
+
+		# Token embedding lookup
+		nodes.append(
+			helper.make_node(
+				"Gather",
+				inputs=["embeddingWeights", "input_ids"],
+				outputs=["token_embeddings"],
+				axis=0,
+			),
+		)
+
+		# Process encoder hidden states
+		nodes.extend(self._createEncoderProcessingNodes())
+
+		# Process cache branch flag
+		nodes.extend(self._createCacheProcessingNodes())
+
+		# Process past key-value pairs (the helper appends to `nodes` in place and returns tensor names)
+		cacheFeatures = self._createCacheFeatureNodes(nodes)
+
+		# Combine all auxiliary features
+		nodes.extend(self._createFeatureCombinationNodes(cacheFeatures))
+
+		# Apply main computation pipeline
+		nodes.extend(self._createMainComputationNodes())
+
+		return nodes
+
+	def _createEncoderProcessingNodes(self) -> list:
+		"""Create nodes that mean-pool ``encoder_hidden_states`` and reshape it into ``encoder_feature``."""
+		return [
+			# Global average pooling over encoder states
+			helper.make_node(
+				"ReduceMean",
+				inputs=["encoder_hidden_states"],
+				outputs=["encoder_pooled"],
+				axes=[1, 2],  # Pool over sequence length and hidden dimensions
+			),
+			# Reshape for broadcasting
+			helper.make_node(
+				"Reshape",
+				inputs=["encoder_pooled", "shapeBatch1"],
+				outputs=["encoder_feature"],
+			),
+		]
+
+	def _createCacheProcessingNodes(self) -> list:
+		"""Create nodes that cast ``use_cache_branch`` to float and reshape it into ``cache_flag_feature``."""
+		return [
+			# Convert boolean to float
+			helper.make_node(
+				"Cast",
+				inputs=["use_cache_branch"],
+				outputs=["cache_flag_float"],
+				to=TensorProto.FLOAT,
+			),
+			# Reshape for broadcasting
+			helper.make_node(
+				"Reshape",
+				inputs=["cache_flag_float", "shapeBatch1"],
+				outputs=["cache_flag_feature"],
+			),
+		]
+
+	def _createCacheFeatureNodes(self, nodes: list) -> list[str]:
+		"""
+		Create nodes to process past key-value cache inputs.
+
+		:param nodes: Node list that is extended in place with the new pooling and reshape nodes.
+		:return: Names of the cache feature tensors, key then value for each layer.
+		"""
+		cacheFeatures = []
+
+		for layerIdx in range(self.n_layers):
+			# Process key cache
+			nodes.extend(
+				[
+					helper.make_node(
+						"ReduceMean",
+						inputs=[f"past_key_values.{layerIdx}.key"],
+						outputs=[f"cache_key_{layerIdx}_pooled"],
+						axes=[1, 2, 3],  # Global pooling, keep only batch dimension
+					),
+					helper.make_node(
+						"Reshape",
+						inputs=[f"cache_key_{layerIdx}_pooled", "shapeBatch1"],
+						outputs=[f"cache_key_{layerIdx}_feature"],
+					),
+				],
+			)
+
+			# Process value cache
+			nodes.extend(
+				[
+					helper.make_node(
+						"ReduceMean",
+						inputs=[f"past_key_values.{layerIdx}.value"],
+						outputs=[f"cache_value_{layerIdx}_pooled"],
+						axes=[1, 2, 3],
+					),
+					helper.make_node(
+						"Reshape",
+						inputs=[f"cache_value_{layerIdx}_pooled", "shapeBatch1"],
+						outputs=[f"cache_value_{layerIdx}_feature"],
+					),
+				],
+			)
+
+			cacheFeatures.extend(
+				[
+					f"cache_key_{layerIdx}_feature",
+					f"cache_value_{layerIdx}_feature",
+				],
+			)
+
+		return cacheFeatures
+
+	def _createFeatureCombinationNodes(self, cacheFeatures: list) -> list:
+		"""
+		Create nodes to combine all auxiliary features.
+
+		:param cacheFeatures (list): List of cache feature tensor names.
+		:return: list: Nodes for feature combination.
+		"""
+		nodes = []
+		allFeatures = ["encoder_feature", "cache_flag_feature"] + cacheFeatures
+
+		# Sequentially add all features together
+		currentSum = allFeatures[0]
+		for i, feature in enumerate(allFeatures[1:], 1):
+			nodes.append(
+				helper.make_node(
+					"Add",
+					inputs=[currentSum, feature],
+					outputs=[f"combined_features_{i}"],
+				),
+			)
+			currentSum = f"combined_features_{i}"
+
+		return nodes
+
+	def _createMainComputationNodes(self) -> list:
+		"""Create the main computation pipeline nodes."""
+		finalCombined = f"combined_features_{self.n_layers * 2 + 1}"
+
+		return [
+			# Flatten token embeddings
+			helper.make_node(
+				"Reshape",
+				inputs=["token_embeddings", "shape2d"],
+				outputs=["embeddings_flat"],
+			),
+			# Scale embeddings
+			helper.make_node(
+				"Mul",
+				inputs=["embeddings_flat", "featureScale"],
+				outputs=["scaled_embeddings"],
+			),
+			# Add auxiliary features (broadcasting)
+			helper.make_node(
+				"Add",
+				inputs=["scaled_embeddings", finalCombined],
+				outputs=["final_features"],
+			),
+			# Project to vocabulary space
+			helper.make_node(
+				"MatMul",
+				inputs=["final_features", "projectionWeights"],
+				outputs=["logits_flat"],
+			),
+			# Reshape back to 3D
+			helper.make_node(
+				"Reshape",
+				inputs=["logits_flat", "shape3d"],
+				outputs=["logits"],
+			),
+		]
+
+	def _createDecoderConstants(self) -> list:
+		"""
+		Create constant tensors needed for decoder computation.
+
+		:returns: list: List of constant tensor initializers.
+		"""
+		constants = []
+
+		# Shape constants for reshaping operations
+		shape2d = numpy_helper.from_array(
+			np.array([-1, self.hidden_size], dtype=np.int64),
+			name="shape2d",
+		)
+
+		shape3d = numpy_helper.from_array(
+			np.array([0, -1, self.vocab_size], dtype=np.int64),
+			name="shape3d",
+		)
+
+		shapeBatch1 = numpy_helper.from_array(
+			np.array([-1, 1], dtype=np.int64),
+			name="shapeBatch1",
+		)
+
+		# Feature scaling factor
+		featureScale = numpy_helper.from_array(
+			np.array([[1.1]], dtype=np.float32),
+			name="featureScale",
+		)
+
+		constants.extend([shape2d, shape3d, shapeBatch1, featureScale])
+
+		return constants
+
+	def _generateConfigFile(self, outputPath: Path) -> None:
+		"""
+		Generate the model configuration JSON file.
+
+		:param outputPath (Path): Output path for the config.json file.
+		"""
+		config = self._getModelConfig()
+
+		with open(outputPath, "w", encoding="utf-8") as f:
+			json.dump(config, f, indent=2, ensure_ascii=False)
+
+	def _getModelConfig(self) -> dict[str, Any]:
+		"""
+		Assemble the top-level vision-encoder-decoder configuration dictionary.
+
+		:return: Config with nested ``encoder`` and ``decoder`` sections; every special token id is 99.
+		"""
+		return {
+			"_name_or_path": "nlpconnect/vit-gpt2-image-captioning",
+			"architectures": ["VisionEncoderDecoderModel"],
+			"bos_token_id": 99,
+			"decoder": self._getDecoderConfig(),
+			"decoder_start_token_id": 99,
+			"encoder": self._getEncoderConfig(),
+			"eos_token_id": 99,
+			"is_encoder_decoder": True,
+			"model_type": "vision-encoder-decoder",
+			"pad_token_id": 99,
+			"tie_word_embeddings": False,
+			"transformers_version": "4.33.0.dev0",
+		}
+
+	def _getDecoderConfig(self) -> dict[str, Any]:
+		"""Return the mock ``gpt2`` decoder config; only ``vocab_size`` is read from the instance."""
+		return {
+			"_name_or_path": "",
+			"activation_function": "gelu_new",
+			"add_cross_attention": True,
+			"architectures": ["GPT2LMHeadModel"],
+			"attn_pdrop": 0.1,
+			"bad_words_ids": None,
+			"begin_suppress_tokens": None,
+			"bos_token_id": 99,
+			"chunk_size_feed_forward": 0,
+			"cross_attention_hidden_size": None,
+			"decoder_start_token_id": 99,
+			"diversity_penalty": 0.0,
+			"do_sample": False,
+			"early_stopping": False,
+			"embd_pdrop": 0.1,
+			"encoder_no_repeat_ngram_size": 0,
+			"eos_token_id": 99,
+			"exponential_decay_length_penalty": None,
+			"finetuning_task": None,
+			"forced_bos_token_id": None,
+			"forced_eos_token_id": None,
+			"id2label": {"0": "LABEL_0", "1": "LABEL_1"},
+			"initializer_range": 0.02,
+			"is_decoder": True,
+			"is_encoder_decoder": False,
+			"label2id": {"LABEL_0": 0, "LABEL_1": 1},
+			"layer_norm_epsilon": 1e-05,
+			"length_penalty": 1.0,
+			"max_length": 20,
+			"min_length": 0,
+			"model_type": "gpt2",
+			"n_ctx": 1024,
+			"n_embd": 768,
+			"n_head": 12,
+			"n_inner": None,
+			"n_layer": 12,
+			"n_positions": 1024,
+			"no_repeat_ngram_size": 0,
+			"num_beam_groups": 1,
+			"num_beams": 1,
+			"num_return_sequences": 1,
+			"output_attentions": False,
+			"output_hidden_states": False,
+			"output_scores": False,
+			"pad_token_id": 99,
+			"prefix": None,
+			"problem_type": None,
+			"pruned_heads": {},
+			"remove_invalid_values": False,
+			"reorder_and_upcast_attn": False,
+			"repetition_penalty": 1.0,
+			"resid_pdrop": 0.1,
+			"return_dict": True,
+			"return_dict_in_generate": False,
+			"scale_attn_by_inverse_layer_idx": False,
+			"scale_attn_weights": True,
+			"sep_token_id": None,
+			"summary_activation": None,
+			"summary_first_dropout": 0.1,
+			"summary_proj_to_labels": True,
+			"summary_type": "cls_index",
+			"summary_use_proj": True,
+			"suppress_tokens": None,
+			"task_specific_params": {
+				"text-generation": {
+					"do_sample": True,
+					"max_length": 50,
+				},
+			},
+			"temperature": 1.0,
+			"tf_legacy_loss": False,
+			"tie_encoder_decoder": False,
+			"tie_word_embeddings": True,
+			"tokenizer_class": None,
+			"top_k": 50,
+			"top_p": 1.0,
+			"torch_dtype": None,
+			"torchscript": False,
+			"typical_p": 1.0,
+			"use_bfloat16": False,
+			"use_cache": True,
+			"vocab_size": self.vocab_size,
+		}
+
+	def _getEncoderConfig(self) -> dict[str, Any]:
+		"""Return the mock ``vit`` encoder config; image, channel and patch sizes come from the instance."""
+		return {
+			"_name_or_path": "",
+			"add_cross_attention": False,
+			"architectures": ["ViTModel"],
+			"attention_probs_dropout_prob": 0.0,
+			"bad_words_ids": None,
+			"begin_suppress_tokens": None,
+			"bos_token_id": None,
+			"chunk_size_feed_forward": 0,
+			"cross_attention_hidden_size": None,
+			"decoder_start_token_id": None,
+			"diversity_penalty": 0.0,
+			"do_sample": False,
+			"early_stopping": False,
+			"encoder_no_repeat_ngram_size": 0,
+			"encoder_stride": 16,
+			"eos_token_id": None,
+			"exponential_decay_length_penalty": None,
+			"finetuning_task": None,
+			"forced_bos_token_id": None,
+			"forced_eos_token_id": None,
+			"hidden_act": "gelu",
+			"hidden_dropout_prob": 0.0,
+			"hidden_size": 768,
+			"id2label": {"0": "LABEL_0", "1": "LABEL_1"},
+			"image_size": self.image_size,
+			"initializer_range": 0.02,
+			"intermediate_size": 3072,
+			"is_decoder": False,
+			"is_encoder_decoder": False,
+			"label2id": {"LABEL_0": 0, "LABEL_1": 1},
+			"layer_norm_eps": 1e-12,
+			"length_penalty": 1.0,
+			"max_length": 20,
+			"min_length": 0,
+			"model_type": "vit",
+			"no_repeat_ngram_size": 0,
+			"num_attention_heads": 12,
+			"num_beam_groups": 1,
+			"num_beams": 1,
+			"num_channels": self.num_channels,
+			"num_hidden_layers": 12,
+			"num_return_sequences": 1,
+			"output_attentions": False,
+			"output_hidden_states": False,
+			"output_scores": False,
+			"pad_token_id": None,
+			"patch_size": self.patch_size,
+			"prefix": None,
+			"problem_type": None,
+			"pruned_heads": {},
+			"qkv_bias": True,
+			"remove_invalid_values": False,
+			"repetition_penalty": 1.0,
+			"return_dict": True,
+			"return_dict_in_generate": False,
+			"sep_token_id": None,
+			"suppress_tokens": None,
+			"task_specific_params": None,
+			"temperature": 1.0,
+			"tf_legacy_loss": False,
+			"tie_encoder_decoder": False,
+			"tie_word_embeddings": True,
+			"tokenizer_class": None,
+			"top_k": 50,
+			"top_p": 1.0,
+			"torch_dtype": None,
+			"torchscript": False,
+			"typical_p": 1.0,
+			"use_bfloat16": False,
+		}
+
+	def _generateVocabFile(self, outputPath: Path) -> None:
+		"""
+		Generate the vocabulary JSON file.
+
+		:param outputPath: Output path for the vocab.json file.
+		"""
+		vocab = self._getVocabulary()
+
+		with open(outputPath, "w", encoding="utf-8") as f:
+			json.dump(vocab, f, indent=2, ensure_ascii=False)
+
+	def _getVocabulary(self) -> dict[str, int]:
+		"""
+		Return the fixed token-to-id table used as the mock model's vocabulary.
+
+		:returns: Token to ID mapping; ids 34, 60, 94 and 95 are not assigned to any token.
+		"""
+		return {
+			"<|endoftext|>": 50256,
+			"<|pad|>": 50257,
+			"a": 0,
+			"an": 1,
+			"the": 2,
+			"free": 3,
+			"or": 4,
+			"but": 5,
+			"in": 6,
+			"on": 7,
+			"at": 8,
+			"to": 9,
+			"and": 10,
+			"of": 11,
+			"with": 12,
+			"by": 13,
+			"man": 14,
+			"for": 15,
+			"desk": 16,
+			"people": 17,
+			"visual": 18,
+			"children": 19,
+			"software": 20,
+			"girl": 21,
+			"dog": 22,
+			"desktop": 23,
+			"car": 24,
+			"truck": 25,
+			"bus": 26,
+			"bike": 27,
+			"non-visual": 28,
+			"NVDA": 29,
+			"plane": 30,
+			"boat": 31,
+			"house": 32,
+			"access": 33,
+			"flower": 35,
+			"microsoft": 36,
+			"sky": 37,
+			"cloud": 38,
+			"sun": 39,
+			"moon": 40,
+			"water": 41,
+			"river": 42,
+			"ocean": 43,
+			"red": 44,
+			"blue": 45,
+			"reader": 46,
+			"yellow": 47,
+			"black": 48,
+			"white": 49,
+			"brown": 50,
+			"orange": 51,
+			"purple": 52,
+			"pink": 53,
+			"!": 54,
+			"small": 55,
+			"tall": 56,
+			"short": 57,
+			"old": 58,
+			"young": 59,
+			"beautiful": 61,
+			"ugly": 62,
+			"good": 63,
+			"bad": 64,
+			"sitting": 65,
+			"standing": 66,
+			"walking": 67,
+			"running": 68,
+			"screen": 69,
+			"drinking": 70,
+			"playing": 71,
+			"working": 72,
+			"is": 73,
+			"open": 74,
+			"was": 75,
+			"were": 76,
+			"has": 77,
+			"Best": 78,
+			"helping": 79,
+			"will": 80,
+			"would": 81,
+			"could": 82,
+			"should": 83,
+			"very": 84,
+			"quite": 85,
+			"really": 86,
+			"too": 87,
+			"also": 88,
+			"source": 89,
+			"only": 90,
+			"even": 91,
+			"still": 92,
+			"already": 93,
+			"windows": 96,
+		}
diff --git a/tests/system/nvdaSettingsFiles/standard-doLoadMockModel.ini b/tests/system/nvdaSettingsFiles/standard-doLoadMockModel.ini
new file mode 100644
index 00000000000..eff6d77689c
--- /dev/null
+++ b/tests/system/nvdaSettingsFiles/standard-doLoadMockModel.ini
@@ -0,0 +1,20 @@
+schemaVersion = 2
+[general]
+	language = en
+	showWelcomeDialogAtStartup = False
+[update]
+	askedAllowUsageStats = True
+	autoCheck = False
+	startupNotification = False
+	allowUsageStats = False
+[speech]
+	synth = speechSpySynthDriver
+	unicodeNormalization = DISABLED
+[development]
+	enableScratchpadDir = True
+[virtualBuffers]
+	autoSayAllOnPageLoad = False
+	passThroughAudioIndication = False
+[automatedImageDescriptions]
+	enable = True
+	defaultModel = mock/vit-gpt2-image-captioning
diff --git a/tests/system/robot/automatedImageDescriptions.py b/tests/system/robot/automatedImageDescriptions.py
new file mode 100644
index 00000000000..bfbd6b31a57
--- /dev/null
+++ b/tests/system/robot/automatedImageDescriptions.py
@@ -0,0 +1,43 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later.
+# For more details see: https://www.gnu.org/licenses/gpl-2.0.html
+
+"""Logic for automatedImageDescriptions tests."""
+
+import os
+import pathlib
+
+from ChromeLib import ChromeLib as _ChromeLib
+from SystemTestSpy import (
+	_getLib,
+)
+import NvdaLib as _nvdaLib
+
+_chrome: _ChromeLib = _getLib("ChromeLib")
+
+
+def NVDA_Caption():
+	spy = _nvdaLib.getSpyLib()
+	iconPath = os.path.join(
+		_nvdaLib._locations.repoRoot,
+		"source",
+		"images",
+		"nvda.ico",
+	)
+	url = pathlib.Path(iconPath).as_uri()
+
+	_chrome.prepareChrome(
+		f"""
+		<div>
+			<img src={url}>
+		</div>
+		""",
+	)
+
+	# locate graph to generate caption
+	spy.emulateKeyPress("g")
+	spy.emulateKeyPress("NVDA+g")
+	spy.wait_for_specific_speech(
+		"visual desk access non-visual desktop access non-visual desktop access non-visual desktop access non-visual desktop access non-visual desktop access non-visual",
+	)
diff --git a/tests/system/robot/automatedImageDescriptions.robot b/tests/system/robot/automatedImageDescriptions.robot
new file mode 100644
index 00000000000..6f62bdba917
--- /dev/null
+++ b/tests/system/robot/automatedImageDescriptions.robot
@@ -0,0 +1,26 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later.
+# For more details see: https://www.gnu.org/licenses/gpl-2.0.html
+*** Settings ***
+Documentation	Local captioner tests
+Force Tags	NVDA	smoke test	imageDescriptions
+
+Library	NvdaLib.py
+Library	automatedImageDescriptions.py
+Library	ScreenCapLibrary
+
+Test Setup	start NVDA	standard-doLoadMockModel.ini
+Test Teardown	default teardown
+
+*** Keywords ***
+default teardown
+	${screenshotName}=	create_preserved_test_output_filename	failedTest.png
+	Run Keyword If Test Failed	Take Screenshot	${screenshotName}
+	quit NVDA
+
+*** Test Cases ***
+automatedImageDescriptions
+	[Documentation]	Ensure that the local captioner works
+	NVDA_Caption	# run test
+
diff --git a/tests/unit/test_localCaptioner/__init__.py b/tests/unit/test_localCaptioner/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/unit/test_localCaptioner/test_captioner.py b/tests/unit/test_localCaptioner/test_captioner.py
new file mode 100644
index 00000000000..4f84a6914f0
--- /dev/null
+++ b/tests/unit/test_localCaptioner/test_captioner.py
@@ -0,0 +1,345 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+"""
+Unit tests for the VitGpt2ImageCaptioner class.
+
+This test suite includes comprehensive tests for the VitGpt2ImageCaptioner class, including:
+- Initialization
+- Configuration loading
+- Vocabulary loading
+- Image preprocessing
+- Encoder and decoder execution
+- Text generation
+- Exception handling
+"""
+
+import unittest
+import json
+import os
+import tempfile
+import numpy as np
+from unittest.mock import Mock, patch
+from PIL import Image
+import io
+import shutil
+
+from _localCaptioner.captioner.vitGpt2 import VitGpt2ImageCaptioner
+from _localCaptioner import modelConfig
+
+modelConfig.initialize()
+
+
+class TestVitGpt2ImageCaptioner(unittest.TestCase):
+	"""Unit tests for the VitGpt2ImageCaptioner class."""
+
+	def setUp(self):
+		"""Set up test environment."""
+		# Create temporary directory and test files
+		self.testDir = tempfile.mkdtemp()
+
+		# Create test configuration
+		self.configData = {
+			"encoder": {
+				"image_size": 224,
+				"num_channels": 3,
+				"patch_size": 16,
+				"hidden_size": 768,
+				"num_hidden_layers": 12,
+				"num_attention_heads": 12,
+				"intermediate_size": 3072,
+			},
+			"decoder": {
+				"max_length": 20,
+				"vocab_size": 50257,
+				"n_embd": 768,
+				"n_layer": 12,
+				"n_head": 12,
+				"n_ctx": 1024,
+				"n_positions": 1024,
+			},
+			"bos_token_id": 50256,
+			"eos_token_id": 50256,
+			"pad_token_id": 50256,
+			"generation": {
+				"do_sample": False,
+				"num_beams": 1,
+				"temperature": 1.0,
+				"top_k": 50,
+				"top_p": 1.0,
+				"repetition_penalty": 1.0,
+				"length_penalty": 1.0,
+			},
+		}
+
+		# Create test vocabulary
+		self.vocabData = {
+			"<|endoftext|>": 50256,
+			"a": 0,
+			"the": 1,
+			"cat": 2,
+			"dog": 3,
+			"is": 4,
+			"sitting": 5,
+		}
+
+		# File paths
+		self.configPath = os.path.join(self.testDir, "config.json")
+		self.vocabPath = os.path.join(self.testDir, "vocab.json")
+		self.encoderPath = "mockEncoder.onnx"
+		self.decoderPath = "mockDecoder.onnx"
+
+		# Write config and vocab files
+		with open(self.configPath, "w", encoding="utf-8") as f:
+			json.dump(self.configData, f)
+		with open(self.vocabPath, "w", encoding="utf-8") as f:
+			json.dump(self.vocabData, f)
+
+	def tearDown(self):
+		"""Clean up temporary files."""
+		shutil.rmtree(self.testDir)
+
+	@patch("onnxruntime.InferenceSession")
+	def test_initSuccess(self, mockSession):
+		"""Test successful initialization."""
+		mockEncoder = Mock()
+		mockDecoder = Mock()
+		mockSession.side_effect = [mockEncoder, mockDecoder]
+
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+
+		self.assertEqual(captioner.decoderConfig.max_length, 20)
+		self.assertEqual(captioner.modelConfig.bos_token_id, 50256)
+		self.assertEqual(captioner.vocabSize, len(self.vocabData))
+		self.assertEqual(mockSession.call_count, 2)
+
+	def test_initConfigNotFound(self):
+		"""Test missing config file raises error."""
+		with self.assertRaises(FileNotFoundError) as context:
+			VitGpt2ImageCaptioner(
+				encoderPath=self.encoderPath,
+				decoderPath=self.decoderPath,
+				configPath="nonexistentConfig.json",
+			)
+		self.assertIn("config file", str(context.exception))
+
+	@patch("onnxruntime.InferenceSession")
+	def test_loadVocabSuccess(self, mockSession):
+		"""Test vocabulary loads successfully."""
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+		expectedVocab = {v: k for k, v in self.vocabData.items()}
+		self.assertEqual(captioner.vocab, expectedVocab)
+
+	@patch("onnxruntime.InferenceSession")
+	def test_preprocessImageFromPath(self, mockSession):
+		"""Test preprocessing image from file path."""
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+		testImage = Image.new("RGB", (100, 100), color="red")
+		testImagePath = os.path.join(self.testDir, "testImage.jpg")
+		testImage.save(testImagePath)
+
+		result = captioner._preprocessImage(testImagePath)
+
+		self.assertEqual(result.shape, (1, 3, 224, 224))
+		self.assertEqual(result.dtype, np.float32)
+
+	@patch("onnxruntime.InferenceSession")
+	def test_preprocessImageFromBytes(self, mockSession):
+		"""Test preprocessing image from byte input."""
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+		testImage = Image.new("RGB", (100, 100), color="blue")
+		imgBytes = io.BytesIO()
+		testImage.save(imgBytes, format="PNG")
+		imgBytes = imgBytes.getvalue()
+
+		result = captioner._preprocessImage(imgBytes)
+
+		self.assertEqual(result.shape, (1, 3, 224, 224))
+		self.assertEqual(result.dtype, np.float32)
+
+	@patch("onnxruntime.InferenceSession")
+	def test_encodeImage(self, mockSession):
+		"""Test image encoding using encoder."""
+		mockEncoder = Mock()
+		mockDecoder = Mock()
+		mockEncoderOutput = np.random.randn(1, 196, 768).astype(np.float32)
+		mockEncoder.run.return_value = [mockEncoderOutput]
+		mockEncoder.get_inputs.return_value = [Mock(name="pixel_values")]
+
+		mockSession.side_effect = [mockEncoder, mockDecoder]
+
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+
+		testInput = np.random.randn(1, 3, 224, 224).astype(np.float32)
+		result = captioner._encodeImage(testInput)
+
+		np.testing.assert_array_equal(result, mockEncoderOutput)
+		mockEncoder.run.assert_called_once()
+
+	@patch("onnxruntime.InferenceSession")
+	def test_decodeTokens(self, mockSession):
+		"""Test decoding tokens to text."""
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+		tokenIds = [1, 2, 4, 5]
+		result = captioner._decodeTokens(tokenIds)
+		expected = "the cat is sitting"
+		self.assertEqual(result, expected)
+
+	@patch("onnxruntime.InferenceSession")
+	def test_decodeTokensWithSpecialTokens(self, mockSession):
+		"""Test decoding tokens with special tokens removed."""
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+		tokenIds = [50256, 1, 2, 50256]
+		result = captioner._decodeTokens(tokenIds)
+		expected = "the cat"
+		self.assertEqual(result, expected)
+
+	@patch("onnxruntime.InferenceSession")
+	def test_initializePastKeyValues(self, mockSession):
+		"""Test initialization of past key values."""
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+		pastKv = captioner._initializePastKeyValues(batchSize=1)
+		expectedCount = captioner.decoderConfig.n_layer * 2
+		self.assertEqual(len(pastKv), expectedCount)
+
+		for layerIdx in range(captioner.decoderConfig.n_layer):
+			keyName = f"past_key_values.{layerIdx}.key"
+			valueName = f"past_key_values.{layerIdx}.value"
+			self.assertIn(keyName, pastKv)
+			self.assertIn(valueName, pastKv)
+			expectedShape = (
+				1,
+				captioner.decoderConfig.n_head,
+				0,
+				captioner.decoderConfig.n_embd // captioner.decoderConfig.n_head,
+			)
+			self.assertEqual(pastKv[keyName].shape, expectedShape)
+			self.assertEqual(pastKv[valueName].shape, expectedShape)
+
+	@patch("onnxruntime.InferenceSession")
+	def test_generateWithGreedyMock(self, mockSession):
+		"""Test greedy generation with mocked outputs."""
+		mockEncoder = Mock()
+		mockDecoder = Mock()
+
+		mockDecoder.get_inputs.return_value = [
+			Mock(name="input_ids"),
+			Mock(name="encoder_hidden_states"),
+			Mock(name="use_cache_branch"),
+		]
+
+		logits_1 = np.zeros((1, 1, 50257))
+		logits_1[0, 0, 2] = 10.0
+
+		logits_2 = np.zeros((1, 1, 50257))
+		logits_2[0, 0, 50256] = 10.0
+
+		mockDecoder.run.side_effect = [[logits_1], [logits_2]]
+		mockSession.side_effect = [mockEncoder, mockDecoder]
+
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+		encoderStates = np.random.randn(1, 196, 768).astype(np.float32)
+		result = captioner._generateWithGreedy(encoderStates, maxLength=5)
+		self.assertEqual(result, "cat")
+
+	@patch("onnxruntime.InferenceSession")
+	def test_getDecoderInfo(self, mockSession):
+		"""Test retrieving decoder input/output names."""
+		mockEncoder = Mock()
+		mockDecoder = Mock()
+		mockInput = Mock()
+		mockInput.name = "input_ids"
+		mockOutput = Mock()
+		mockOutput.name = "logits"
+
+		mockDecoder.get_inputs.return_value = [mockInput]
+		mockDecoder.get_outputs.return_value = [mockOutput]
+		mockSession.side_effect = [mockEncoder, mockDecoder]
+
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+
+		inputNames = captioner._getDecoderInputNames()
+		self.assertEqual(inputNames, ["input_ids"])
+
+		outputNames = captioner._getDecoderOutputNames()
+		self.assertEqual(outputNames, ["logits"])
+
+	@patch("onnxruntime.InferenceSession")
+	@patch.object(VitGpt2ImageCaptioner, "_preprocessImage")
+	@patch.object(VitGpt2ImageCaptioner, "_encodeImage")
+	@patch.object(VitGpt2ImageCaptioner, "_generateWithGreedy")
+	def test_generateCaptionIntegration(self, mockGreedy, mockEncode, mockPreprocess, mockSession):
+		"""Test full caption generation pipeline integration."""
+		mockPreprocess.return_value = np.random.randn(1, 3, 224, 224)
+		mockEncode.return_value = np.random.randn(1, 196, 768)
+		mockGreedy.return_value = "a cat sitting on a table"
+
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+
+		result = captioner.generateCaption("testImage.jpg")
+
+		mockPreprocess.assert_called_once_with("testImage.jpg")
+		mockEncode.assert_called_once()
+		mockGreedy.assert_called_once()
+		self.assertEqual(result, "a cat sitting on a table")
+
+	@patch("onnxruntime.InferenceSession")
+	def test_configParameterLoading(self, mockSession):
+		"""Test full config parameter parsing."""
+		captioner = VitGpt2ImageCaptioner(
+			encoderPath=self.encoderPath,
+			decoderPath=self.decoderPath,
+			configPath=self.configPath,
+		)
+		self.assertEqual(captioner.encoderConfig.num_channels, 3)
+		self.assertEqual(captioner.decoderConfig.max_length, 20)
+		self.assertEqual(captioner.decoderConfig.n_embd, 768)
+		self.assertEqual(captioner.decoderConfig.n_layer, 12)
+		self.assertEqual(captioner.modelConfig.bos_token_id, 50256)
+		self.assertEqual(captioner.modelConfig.eos_token_id, 50256)
+		self.assertEqual(captioner.modelConfig.pad_token_id, 50256)
diff --git a/tests/unit/test_localCaptioner/test_downloader.py b/tests/unit/test_localCaptioner/test_downloader.py
new file mode 100644
index 00000000000..f3022a8c2da
--- /dev/null
+++ b/tests/unit/test_localCaptioner/test_downloader.py
@@ -0,0 +1,108 @@
+# A part of NonVisual Desktop Access (NVDA)
+# Copyright (C) 2025 NV Access Limited, Tianze
+# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
+"""
+Unit tests for the ModelDownloader class.
+
+Intended coverage (only some of these areas have tests in this file so far):
+- Directory creation
+- URL construction
+- Remote file size detection (HEAD and Range requests)
+- Progress reporting logic
+- File download success/failure
+- Multi-threaded download success/failure
+- Cancellation handling
+- Model file path building
+- downloadDefaultModel user prompt flow
+"""
+
+import tempfile
+import unittest
+from unittest.mock import patch
+from typing import Any
+
+# Import the class under test
+from _localCaptioner.modelDownloader import ModelDownloader
+
+
+class TestModelDownloader(unittest.TestCase):
+	"""Unit tests for ModelDownloader."""
+
+	def setUp(self):
+		# No longer passing basePath during initialization
+		self.tempDir = tempfile.mkdtemp()
+		self.downloader = ModelDownloader()
+
+	@patch("pathlib.Path.mkdir")
+	def test_ensureModelsDirectory_success(self, mockMkdir):
+		"""Ensure directory is created and correct path returned."""
+		mockMkdir.return_value = None
+		modelsDir = self.downloader.ensureModelsDirectory()
+		self.assertTrue(modelsDir.endswith("vit-gpt2-image-captioning"))
+		mockMkdir.assert_called_once()
+
+	@patch("pathlib.Path.mkdir", side_effect=OSError("Permission denied"))
+	def test_ensureModelsDirectory_failure(self, mockMkdir):
+		"""Ensure OSError is raised when models directory cannot be created."""
+		with self.assertRaises(OSError):
+			self.downloader.ensureModelsDirectory()
+
+	def test_constructDownloadUrlDefaultHost(self):
+		"""Construct URL when remoteHost has no scheme."""
+		url = self.downloader.constructDownloadUrl("foo/bar", "file.txt")
+		self.assertTrue(url.startswith("https://huggingface.co/foo/bar"))
+
+	def test_constructDownloadUrlWithHttpHost(self):
+		"""Construct URL when remoteHost already contains http://."""
+		self.downloader.remoteHost = "http://example.com"
+		url = self.downloader.constructDownloadUrl("foo", "bar")
+		self.assertEqual(url, "http://example.com/foo/resolve/main/bar")
+
+	def test_reportProgressTriggersCallback(self) -> None:
+		"""Test that callback is triggered when downloaded bytes exceed threshold."""
+		callbackData: dict[str, Any] = {}
+
+		def progressCallback(
+			fileName: str,
+			downloadedBytes: int,
+			totalBytes: int,
+			percentage: float,
+		) -> None:
+			"""Callback function to capture progress data."""
+			callbackData["fileName"] = fileName
+			callbackData["downloadedBytes"] = downloadedBytes
+
+		# Test with download size exceeding 1MB threshold
+		downloadedSize = 1024 * 1024 + 1  # 1MB + 1 byte
+		totalSize = 2 * 1024 * 1024  # 2MB
+		initialTime = 0
+
+		lastReportedTime = self.downloader._reportProgress(
+			progressCallback,
+			"test_file.zip",
+			downloadedSize,
+			totalSize,
+			initialTime,
+		)
+
+		# Assertions
+		self.assertEqual(callbackData["fileName"], "test_file.zip")
+		self.assertEqual(callbackData["downloadedBytes"], downloadedSize)
+		self.assertGreater(lastReportedTime, initialTime)
+
+	@patch.object(ModelDownloader, "downloadSingleFile", return_value=(True, "ok"))
+	def test_downloadModelsMultithreadedAllSuccess(self, mockSingle):
+		"""All files are downloaded successfully."""
+		files = ["a.txt", "b.txt"]
+		success, failed = self.downloader.downloadModelsMultithreaded(self.tempDir, "model", files)
+		self.assertEqual(len(success), 2)
+		self.assertEqual(len(failed), 0)
+
+	@patch.object(ModelDownloader, "downloadSingleFile", side_effect=[(True, "ok"), (False, "err")])
+	def test_downloadModelsMultithreadedPartialFailure(self, mockSingle):
+		"""One file succeeds and one fails."""
+		files = ["a.txt", "b.txt"]
+		success, failed = self.downloader.downloadModelsMultithreaded(self.tempDir, "model", files)
+		self.assertEqual(len(success), 1)
+		self.assertEqual(len(failed), 1)
diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md
index 50f8755e9ef..06e45d75731 100644
--- a/user_docs/en/changes.md
+++ b/user_docs/en/changes.md
@@ -32,6 +32,8 @@ Please refer to [the developer guide](https://download.nvaccess.org/documentatio
 ## 2026.1
 
 This release includes built-in support for reading math content with MathCAT.
+It also introduces experimental, on-device AI image descriptions.
+You can now use `NVDA+g` to get a short, approximate description of images you encounter, without any data leaving your device.
 
 There have been several improvements to speech.
 Spelling errors can now be reported with a sound instead of speech when reading.
@@ -85,6 +87,12 @@ Windows 10 on ARM is also no longer supported.
   An action has been added to view the full scan results on the VirusTotal website. (#18974)
   * A new action has been added to see the latest changes for the current version of an add-on. (#14041, @josephsl, @nvdaes)
 * Added built-in support for reading math content by integrating MathCAT. (#18323, #19368, @RyanMcCleary, @codeofdusk)
+* NVDA can now use on-device AI to generate image descriptions. (#18475, @tianzeshi-study)
+  * This feature is experimental, and should not be used in situations where inaccurate descriptions could cause harm.
+  * To use this feature, NVDA will need to download image description data.
+  Thereafter, it operates entirely offline.
+  * Press `NVDA+g` to get an AI-generated image description.
+  * Unassigned commands are available to quickly open the settings dialog to the "AI Image Descriptions" category, and toggle image captioning.
 * Added references (e.g. to footnotes and endnotes) to the elements list in Microsoft Word.
 Also added unassigned Quick Navigation commands to jump to the next/previous reference. (#19300, @LeonarddeR)
 * In browse mode, the number of items in a list is now reported in braille. (#7455, @nvdaes)
diff --git a/user_docs/en/userGuide.md b/user_docs/en/userGuide.md
index b3d375958ae..7538bde95b8 100644
--- a/user_docs/en/userGuide.md
+++ b/user_docs/en/userGuide.md
@@ -1503,7 +1503,7 @@ You can enable Screen Curtain in the [Privacy and Security category](#PrivacyAnd
 <!-- KC:endInclude -->
 
 When Screen Curtain is enabled, features that rely on what is literally on screen will not function.
-For example, you cannot [use OCR](#Win10Ocr).
+For example, you cannot [use OCR](#Win10Ocr) or [get AI image descriptions](#LocalCaptioner).
 Some screenshot utilities also may not work.
 
 Please note that while Windows Magnifier is running and inverted screen colors are being used, Screen Curtain cannot be enabled.
@@ -3557,6 +3557,15 @@ You will be asked to confirm before all trusted fingerprints are deleted.
 
 This option is only available if there are trusted fingerprints stored in your configuration.
 
+#### AI Image Descriptions Settings {#LocalCaptionerSettings}
+
+This panel provides options to customize the behavior and default settings for the ["Image Captioner"](#LocalCaptioner).
+
+##### Enable image captioner {#LocalCaptionToggle}
+
+When this checkbox is enabled, NVDA will load the image captioner into memory, enabling the use of the image description command.
+Loading the image captioner will increase memory usage, so this is disabled by default.
+
 #### Windows OCR Settings {#Win10OcrSettings}
 
 The settings in this category allow you to configure [Windows OCR](#Win10Ocr).
@@ -4178,6 +4187,36 @@ Once a Remote Access session is active, you can switch between controlling the r
 | Send `control+alt+delete` | None | Sends `control+alt+delete` to the controlled computer. |
 <!-- KC:endInclude -->
 
+## Image Captioner {#LocalCaptioner}
+
+NVDA supports generating image descriptions on your device without connecting to the internet.
+This feature allows NVDA to describe images encountered during navigation.
+
+Warning: AI image descriptions are an experimental feature.
+Image descriptions generated with this feature may not be accurate.
+You must not use this feature in circumstances where inaccurate results could reasonably be expected to cause harm.
+Always exercise caution and skepticism when interpreting AI image descriptions.
+
+Note: An internet connection is required to enable and install the Image Captioner for the first time.
+To reduce the size of the NVDA installer, the Image Captioner is not included with it and is downloaded separately.
+
+### Getting Started {#LocalCaptionerGettingStarted}
+
+Enable the "Image Captioner" in the ["AI Image Descriptions" settings panel](#LocalCaptionToggle).
+Once the Image Captioner is ready, press the default shortcut `NVDA+g` to get a description of the image at the current navigator object.
+
+### AI Image Descriptions Key Commands Summary {#LocalCaptionerGestures}
+
+<!-- KC:beginInclude -->
+
+| Name | Key | Description |
+|---|---|---|
+| Get an AI-generated image description of the navigator object | `NVDA+g` | Get a description of the navigator object, generated locally on the device. |
+| Load or unload the image captioner | None | Load or unload the image captioner in memory, enabling the use of the image description command. |
+| Show the AI image descriptions settings | None | Opens the AI image descriptions settings panel. |
+
+<!-- KC:endInclude -->
+
 ## Add-ons and the Add-on Store {#AddonsManager}
 
 Add-ons are software packages which provide new or altered functionality for NVDA.
diff --git a/uv.lock b/uv.lock
index fdba75a9de5..c35c576b914 100644
--- a/uv.lock
+++ b/uv.lock
@@ -62,11 +62,11 @@ wheels = [
 
 [[package]]
 name = "cachetools"
-version = "6.2.4"
+version = "6.2.2"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/bc/1d/ede8680603f6016887c062a2cf4fc8fdba905866a3ab8831aa8aa651320c/cachetools-6.2.4.tar.gz", hash = "sha256:82c5c05585e70b6ba2d3ae09ea60b79548872185d2f24ae1f2709d37299fd607", size = 31731, upload-time = "2025-12-15T18:24:53.744Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/fb/44/ca1675be2a83aeee1886ab745b28cda92093066590233cc501890eb8417a/cachetools-6.2.2.tar.gz", hash = "sha256:8e6d266b25e539df852251cfd6f990b4bc3a141db73b939058d809ebd2590fc6", size = 31571, upload-time = "2025-11-13T17:42:51.465Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/2c/fc/1d7b80d0eb7b714984ce40efc78859c022cd930e402f599d8ca9e39c78a4/cachetools-6.2.4-py3-none-any.whl", hash = "sha256:69a7a52634fed8b8bf6e24a050fb60bff1c9bd8f6d24572b99c32d4e71e62a51", size = 11551, upload-time = "2025-12-15T18:24:52.332Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/46/eb6eca305c77a4489affe1c5d8f4cae82f285d9addd8de4ec084a7184221/cachetools-6.2.2-py3-none-any.whl", hash = "sha256:6c09c98183bf58560c97b2abfcedcbaf6a896a490f534b031b661d3723b45ace", size = 11503, upload-time = "2025-11-13T17:42:50.232Z" },
 ]
 
 [[package]]
@@ -83,11 +83,11 @@ wheels = [
 
 [[package]]
 name = "certifi"
-version = "2026.1.4"
+version = "2025.11.12"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" },
+    { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" },
 ]
 
 [[package]]
@@ -134,6 +134,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" },
 ]
 
+[[package]]
+name = "coloredlogs"
+version = "15.0.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "humanfriendly", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" },
+]
+
 [[package]]
 name = "comtypes"
 version = "1.4.13"
@@ -229,11 +241,32 @@ wheels = [
 
 [[package]]
 name = "filelock"
-version = "3.20.2"
+version = "3.20.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/58/46/0028a82567109b5ef6e4d2a1f04a583fb513e6cf9527fcdd09afd817deeb/filelock-3.20.0.tar.gz", hash = "sha256:711e943b4ec6be42e1d4e6690b48dc175c822967466bb31c0c293f34334c13f4", size = 18922, upload-time = "2025-10-08T18:03:50.056Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" },
+]
+
+[[package]]
+name = "flatbuffers"
+version = "25.9.23"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" },
+]
+
+[[package]]
+name = "humanfriendly"
+version = "10.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/c1/e0/a75dbe4bca1e7d41307323dad5ea2efdd95408f74ab2de8bd7dba9b51a1a/filelock-3.20.2.tar.gz", hash = "sha256:a2241ff4ddde2a7cebddf78e39832509cb045d18ec1a09d7248d6bfc6bfbbe64", size = 19510, upload-time = "2026-01-02T15:33:32.582Z" }
+dependencies = [
+    { name = "pyreadline3", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/9a/30/ab407e2ec752aa541704ed8f93c11e2a5d92c168b8a755d818b74a3c5c2d/filelock-3.20.2-py3-none-any.whl", hash = "sha256:fbba7237d6ea277175a32c54bb71ef814a8546d8601269e1bfc388de333974e8", size = 16697, upload-time = "2026-01-02T15:33:31.133Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" },
 ]
 
 [[package]]
@@ -449,6 +482,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3a/9e/dcd1027f7fd193aed152e01c6651a197c36b858f2cd1425ad04cb31a34fc/mdx_truly_sane_lists-1.3-py3-none-any.whl", hash = "sha256:b9546a4c40ff8f1ab692f77cee4b6bfe8ddf9cccf23f0a24e71f3716fe290a37", size = 6071, upload-time = "2022-07-19T13:42:43.375Z" },
 ]
 
+[[package]]
+name = "ml-dtypes"
+version = "0.5.4"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "numpy", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e1/8b/200088c6859d8221454825959df35b5244fa9bdf263fd0249ac5fb75e281/ml_dtypes-0.5.4-cp313-cp313-win_amd64.whl", hash = "sha256:f21c9219ef48ca5ee78402d5cc831bd58ea27ce89beda894428bc67a52da5328", size = 212224, upload-time = "2025-11-17T22:32:01.349Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/75/dfc3775cb36367816e678f69a7843f6f03bd4e2bcd79941e01ea960a068e/ml_dtypes-0.5.4-cp313-cp313-win_arm64.whl", hash = "sha256:35f29491a3e478407f7047b8a4834e4640a77d2737e0b294d049746507af5175", size = 160798, upload-time = "2025-11-17T22:32:02.864Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/27/12607423d0a9c6bbbcc780ad19f1f6baa2b68b18ce4bddcdc122c4c68dc9/ml_dtypes-0.5.4-cp313-cp313t-win_amd64.whl", hash = "sha256:cb73dccfc991691c444acc8c0012bee8f2470da826a92e3a20bb333b1a7894e6", size = 225612, upload-time = "2025-11-17T22:32:08.615Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/80/5a5929e92c72936d5b19872c5fb8fc09327c1da67b3b68c6a13139e77e20/ml_dtypes-0.5.4-cp313-cp313t-win_arm64.whl", hash = "sha256:3bbbe120b915090d9dd1375e4684dd17a20a2491ef25d640a908281da85e73f1", size = 164145, upload-time = "2025-11-17T22:32:09.782Z" },
+]
+
 [[package]]
 name = "mouseinfo"
 version = "0.1.3"
@@ -458,6 +506,15 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/28/fa/b2ba8229b9381e8f6381c1dcae6f4159a7f72349e414ed19cfbbd1817173/MouseInfo-0.1.3.tar.gz", hash = "sha256:2c62fb8885062b8e520a3cce0a297c657adcc08c60952eb05bc8256ef6f7f6e7", size = 10850, upload-time = "2020-03-27T21:20:10.136Z" }
 
+[[package]]
+name = "mpmath"
+version = "1.3.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" },
+]
+
 [[package]]
 name = "mss"
 version = "10.1.0"
@@ -480,33 +537,35 @@ wheels = [
 
 [[package]]
 name = "nodeenv"
-version = "1.10.0"
+version = "1.9.1"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/24/bf/d1bda4f6168e0b2e9e5958945e01910052158313224ada5ce1fb2e1113b8/nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb", size = 55611, upload-time = "2025-12-20T14:08:54.006Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" },
 ]
 
 [[package]]
 name = "nodejs-wheel-binaries"
-version = "24.12.0"
+version = "24.11.1"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/b9/35/d806c2ca66072e36dc340ccdbeb2af7e4f1b5bcc33f1481f00ceed476708/nodejs_wheel_binaries-24.12.0.tar.gz", hash = "sha256:f1b50aa25375e264697dec04b232474906b997c2630c8f499f4caf3692938435", size = 8058, upload-time = "2025-12-11T21:12:26.856Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e4/89/da307731fdbb05a5f640b26de5b8ac0dc463fef059162accfc89e32f73bc/nodejs_wheel_binaries-24.11.1.tar.gz", hash = "sha256:413dfffeadfb91edb4d8256545dea797c237bba9b3faefea973cde92d96bb922", size = 8059, upload-time = "2025-11-18T18:21:58.207Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6a/9d/c6492188ce8de90093c6755a4a63bb6b2b4efb17094cb4f9a9a49c73ed3b/nodejs_wheel_binaries-24.12.0-py2.py3-none-win_amd64.whl", hash = "sha256:2090d59f75a68079fabc9b86b14df8238b9aecb9577966dc142ce2a23a32e9bb", size = 41342076, upload-time = "2025-12-11T21:12:20.618Z" },
-    { url = "https://files.pythonhosted.org/packages/df/af/cd3290a647df567645353feed451ef4feaf5844496ced69c4dcb84295ff4/nodejs_wheel_binaries-24.12.0-py2.py3-none-win_arm64.whl", hash = "sha256:d0c2273b667dd7e3f55e369c0085957b702144b1b04bfceb7ce2411e58333757", size = 39048104, upload-time = "2025-12-11T21:12:23.495Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/1c/2fb05127102a80225cab7a75c0e9edf88a0a1b79f912e1e36c7c1aaa8f4e/nodejs_wheel_binaries-24.11.1-py2.py3-none-win_amd64.whl", hash = "sha256:10197b1c9c04d79403501766f76508b0dac101ab34371ef8a46fcf51773497d0", size = 41322308, upload-time = "2025-11-18T18:21:51.347Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/b7/bc0cdbc2cc3a66fcac82c79912e135a0110b37b790a14c477f18e18d90cd/nodejs_wheel_binaries-24.11.1-py2.py3-none-win_arm64.whl", hash = "sha256:376b9ea1c4bc1207878975dfeb604f7aa5668c260c6154dcd2af9d42f7734116", size = 39026497, upload-time = "2025-11-18T18:21:54.634Z" },
 ]
 
 [[package]]
 name = "numpy"
-version = "2.2.6"
+version = "2.3.5"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd", size = 6244532, upload-time = "2025-05-17T21:43:46.099Z" },
-    { url = "https://files.pythonhosted.org/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c", size = 12610885, upload-time = "2025-05-17T21:44:05.145Z" },
-    { url = "https://files.pythonhosted.org/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225, upload-time = "2025-05-17T21:43:16.254Z" },
-    { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" },
+    { url = "https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" },
+    { url = "https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = "sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" },
 ]
 
 [[package]]
@@ -525,6 +584,8 @@ dependencies = [
     { name = "mdx-gh-links", marker = "sys_platform == 'win32'" },
     { name = "mdx-truly-sane-lists", marker = "sys_platform == 'win32'" },
     { name = "nh3", marker = "sys_platform == 'win32'" },
+    { name = "numpy", marker = "sys_platform == 'win32'" },
+    { name = "onnxruntime", marker = "sys_platform == 'win32'" },
     { name = "pycaw", marker = "sys_platform == 'win32'" },
     { name = "pymdown-extensions", marker = "sys_platform == 'win32'" },
     { name = "pyserial", marker = "sys_platform == 'win32'" },
@@ -557,6 +618,7 @@ lint = [
     { name = "ruff", marker = "sys_platform == 'win32'" },
 ]
 system-tests = [
+    { name = "onnx", marker = "sys_platform == 'win32'" },
     { name = "robotframework", marker = "sys_platform == 'win32'" },
     { name = "robotframework-screencaplibrary", marker = "sys_platform == 'win32'" },
     { name = "robotremoteserver", marker = "sys_platform == 'win32'" },
@@ -580,6 +642,8 @@ requires-dist = [
     { name = "mdx-gh-links", specifier = "==0.4" },
     { name = "mdx-truly-sane-lists", specifier = "==1.3" },
     { name = "nh3", specifier = "==0.3.2" },
+    { name = "numpy", specifier = "==2.3.5" },
+    { name = "onnxruntime", specifier = "==1.23.2" },
     { name = "pycaw", specifier = "==20251023" },
     { name = "pymdown-extensions", specifier = "==10.17.1" },
     { name = "pyserial", specifier = "==3.5" },
@@ -610,6 +674,7 @@ lint = [
     { name = "ruff", specifier = "==0.14.5" },
 ]
 system-tests = [
+    { name = "onnx", specifier = "==1.19.1" },
     { name = "robotframework", specifier = "==7.3.2" },
     { name = "robotframework-screencaplibrary", specifier = "==1.6.0" },
     { name = "robotremoteserver", specifier = "==1.1.1" },
@@ -629,17 +694,51 @@ name = "nvda-misc-deps"
 version = "20250925"
 source = { editable = "miscDeps" }
 
+[[package]]
+name = "onnx"
+version = "1.19.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "ml-dtypes", marker = "sys_platform == 'win32'" },
+    { name = "numpy", marker = "sys_platform == 'win32'" },
+    { name = "protobuf", marker = "sys_platform == 'win32'" },
+    { name = "typing-extensions", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/27/2f/c619eb65769357e9b6de9212c9a821ab39cd484448e5d6b3fb5fb0a64c6d/onnx-1.19.1.tar.gz", hash = "sha256:737524d6eb3907d3499ea459c6f01c5a96278bb3a0f2ff8ae04786fb5d7f1ed5", size = 12033525, upload-time = "2025-10-10T04:01:34.342Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/51/b5/4201254b8683129db5da3fb55aa1f7e56d0a8d45c66ce875dec21ca1ff25/onnx-1.19.1-cp313-cp313-win32.whl", hash = "sha256:65eee353a51b4e4ca3e797784661e5376e2b209f17557e04921eac9166a8752e", size = 16345330, upload-time = "2025-10-10T04:00:54.858Z" },
+    { url = "https://files.pythonhosted.org/packages/69/67/c6d239afbcdbeb6805432969b908b5c9f700c96d332b34e3f99518d76caf/onnx-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:c3bc87e38b53554b1fc9ef7b275c81c6f5c93c90a91935bb0aa8d4d498a6d48e", size = 16465567, upload-time = "2025-10-10T04:00:57.893Z" },
+    { url = "https://files.pythonhosted.org/packages/99/fe/89f1e40f5bc54595ff0dcf5391ce19e578b528973ccc74dd99800196d30d/onnx-1.19.1-cp313-cp313-win_arm64.whl", hash = "sha256:e41496f400afb980ec643d80d5164753a88a85234fa5c06afdeebc8b7d1ec252", size = 16437562, upload-time = "2025-10-10T04:01:00.703Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/0d/f9d6c2237083f1aac14b37f0b03b0d81f1147a8e2af0c3828165e0a6a67b/onnx-1.19.1-cp313-cp313t-win_amd64.whl", hash = "sha256:9807d0e181f6070ee3a6276166acdc571575d1bd522fc7e89dba16fd6e7ffed9", size = 16465560, upload-time = "2025-10-10T04:01:13.212Z" },
+]
+
+[[package]]
+name = "onnxruntime"
+version = "1.23.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "coloredlogs", marker = "sys_platform == 'win32'" },
+    { name = "flatbuffers", marker = "sys_platform == 'win32'" },
+    { name = "numpy", marker = "sys_platform == 'win32'" },
+    { name = "packaging", marker = "sys_platform == 'win32'" },
+    { name = "protobuf", marker = "sys_platform == 'win32'" },
+    { name = "sympy", marker = "sys_platform == 'win32'" },
+]
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/4a/93/aba75358133b3a941d736816dd392f687e7eab77215a6e429879080b76b6/onnxruntime-1.23.2-cp313-cp313-win_amd64.whl", hash = "sha256:1f9cc0a55349c584f083c1c076e611a7c35d5b867d5d6e6d6c823bf821978088", size = 13470276, upload-time = "2025-10-22T03:47:31.193Z" },
+]
+
 [[package]]
 name = "opencv-python"
-version = "4.12.0.88"
+version = "4.11.0.86"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "numpy", marker = "sys_platform == 'win32'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ac/71/25c98e634b6bdeca4727c7f6d6927b056080668c5008ad3c8fc9e7f8f6ec/opencv-python-4.12.0.88.tar.gz", hash = "sha256:8b738389cede219405f6f3880b851efa3415ccd674752219377353f017d2994d", size = 95373294, upload-time = "2025-07-07T09:20:52.389Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/17/06/68c27a523103dad5837dc5b87e71285280c4f098c60e4fe8a8db6486ab09/opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4", size = 95171956, upload-time = "2025-01-16T13:52:24.737Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/02/96/213fea371d3cb2f1d537612a105792aa0a6659fb2665b22cad709a75bd94/opencv_python-4.12.0.88-cp37-abi3-win32.whl", hash = "sha256:ff554d3f725b39878ac6a2e1fa232ec509c36130927afc18a1719ebf4fbf4357", size = 30284131, upload-time = "2025-07-07T09:14:08.819Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/80/eb88edc2e2b11cd2dd2e56f1c80b5784d11d6e6b7f04a1145df64df40065/opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl", hash = "sha256:d98edb20aa932fd8ebd276a72627dad9dc097695b3d435a4257557bbb49a79d2", size = 39000307, upload-time = "2025-07-07T09:14:16.641Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/d7/1d5941a9dde095468b288d989ff6539dd69cd429dbf1b9e839013d21b6f0/opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b", size = 29384337, upload-time = "2025-01-16T13:52:13.549Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044, upload-time = "2025-01-16T13:52:21.928Z" },
 ]
 
 [[package]]
@@ -671,28 +770,28 @@ wheels = [
 
 [[package]]
 name = "pillow"
-version = "12.1.0"
+version = "12.0.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/cace85a1b0c9775a9f8f5d5423c8261c858760e2466c79b2dd184638b056/pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353", size = 47008828, upload-time = "2025-10-15T18:24:14.008Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/dd/c7/2530a4aa28248623e9d7f27316b42e27c32ec410f695929696f2e0e4a778/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1", size = 4062543, upload-time = "2026-01-02T09:11:31.566Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/1f/40b8eae823dc1519b87d53c30ed9ef085506b05281d313031755c1705f73/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179", size = 4138373, upload-time = "2026-01-02T09:11:33.367Z" },
-    { url = "https://files.pythonhosted.org/packages/d4/77/6fa60634cf06e52139fd0e89e5bbf055e8166c691c42fb162818b7fda31d/pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0", size = 3601241, upload-time = "2026-01-02T09:11:35.011Z" },
-    { url = "https://files.pythonhosted.org/packages/19/ce/c17334caea1db789163b5d855a5735e47995b0b5dc8745e9a3605d5f24c0/pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171", size = 6332551, upload-time = "2026-01-02T09:11:52.234Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/07/74a9d941fa45c90a0d9465098fe1ec85de3e2afbdc15cc4766622d516056/pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a", size = 7040087, upload-time = "2026-01-02T09:11:54.822Z" },
-    { url = "https://files.pythonhosted.org/packages/88/09/c99950c075a0e9053d8e880595926302575bc742b1b47fe1bbcc8d388d50/pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45", size = 2452470, upload-time = "2026-01-02T09:11:56.522Z" },
-    { url = "https://files.pythonhosted.org/packages/86/77/eacc62356b4cf81abe99ff9dbc7402750044aed02cfd6a503f7c6fc11f3e/pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65", size = 6336445, upload-time = "2026-01-02T09:12:14.775Z" },
-    { url = "https://files.pythonhosted.org/packages/e7/3c/57d81d0b74d218706dafccb87a87ea44262c43eef98eb3b164fd000e0491/pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0", size = 7045354, upload-time = "2026-01-02T09:12:16.599Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/82/8b9b97bba2e3576a340f93b044a3a3a09841170ab4c1eb0d5c93469fd32f/pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8", size = 2454547, upload-time = "2026-01-02T09:12:18.704Z" },
+    { url = "https://files.pythonhosted.org/packages/62/f2/de993bb2d21b33a98d031ecf6a978e4b61da207bef02f7b43093774c480d/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:0869154a2d0546545cde61d1789a6524319fc1897d9ee31218eae7a60ccc5643", size = 4045493, upload-time = "2025-10-15T18:22:25.758Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/b6/bc8d0c4c9f6f111a783d045310945deb769b806d7574764234ffd50bc5ea/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:a7921c5a6d31b3d756ec980f2f47c0cfdbce0fc48c22a39347a895f41f4a6ea4", size = 4120461, upload-time = "2025-10-15T18:22:27.286Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/57/d60d343709366a353dc56adb4ee1e7d8a2cc34e3fbc22905f4167cfec119/pillow-12.0.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1ee80a59f6ce048ae13cda1abf7fbd2a34ab9ee7d401c46be3ca685d1999a399", size = 3576912, upload-time = "2025-10-15T18:22:28.751Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/ca/16c6926cc1c015845745d5c16c9358e24282f1e588237a4c36d2b30f182f/pillow-12.0.0-cp313-cp313-win32.whl", hash = "sha256:4cc6b3b2efff105c6a1656cfe59da4fdde2cda9af1c5e0b58529b24525d0a098", size = 6302391, upload-time = "2025-10-15T18:22:44.753Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/2a/dd43dcfd6dae9b6a49ee28a8eedb98c7d5ff2de94a5d834565164667b97b/pillow-12.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:4cf7fed4b4580601c4345ceb5d4cbf5a980d030fd5ad07c4d2ec589f95f09905", size = 7007477, upload-time = "2025-10-15T18:22:46.838Z" },
+    { url = "https://files.pythonhosted.org/packages/77/f0/72ea067f4b5ae5ead653053212af05ce3705807906ba3f3e8f58ddf617e6/pillow-12.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:9f0b04c6b8584c2c193babcccc908b38ed29524b29dd464bc8801bf10d746a3a", size = 2435918, upload-time = "2025-10-15T18:22:48.399Z" },
+    { url = "https://files.pythonhosted.org/packages/61/2b/726235842220ca95fa441ddf55dd2382b52ab5b8d9c0596fe6b3f23dafe8/pillow-12.0.0-cp313-cp313t-win32.whl", hash = "sha256:4078242472387600b2ce8d93ade8899c12bf33fa89e55ec89fe126e9d6d5d9e9", size = 6306201, upload-time = "2025-10-15T18:23:04.709Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/3d/2afaf4e840b2df71344ababf2f8edd75a705ce500e5dc1e7227808312ae1/pillow-12.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2c54c1a783d6d60595d3514f0efe9b37c8808746a66920315bfd34a938d7994b", size = 7013165, upload-time = "2025-10-15T18:23:06.46Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/75/3fa09aa5cf6ed04bee3fa575798ddf1ce0bace8edb47249c798077a81f7f/pillow-12.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:26d9f7d2b604cd23aba3e9faf795787456ac25634d82cd060556998e39c6fa47", size = 2437834, upload-time = "2025-10-15T18:23:08.194Z" },
 ]
 
 [[package]]
 name = "platformdirs"
-version = "4.5.1"
+version = "4.5.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/cf/86/0248f086a84f01b37aaec0fa567b397df1a119f73c16f6c7a9aac73ea309/platformdirs-4.5.1.tar.gz", hash = "sha256:61d5cdcc6065745cdd94f0f878977f8de9437be93de97c1c12f853c9c0cdcbda", size = 21715, upload-time = "2025-12-05T13:52:58.638Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/61/33/9611380c2bdb1225fdef633e2a9610622310fed35ab11dac9620972ee088/platformdirs-4.5.0.tar.gz", hash = "sha256:70ddccdd7c99fc5942e9fc25636a8b34d04c24b335100223152c2803e4063312", size = 21632, upload-time = "2025-10-08T17:44:48.791Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl", hash = "sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31", size = 18731, upload-time = "2025-12-05T13:52:56.823Z" },
+    { url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651, upload-time = "2025-10-08T17:44:47.223Z" },
 ]
 
 [[package]]
@@ -711,16 +810,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/88/74/a88bf1b1efeae488a0c0b7bdf71429c313722d1fc0f377537fbe554e6180/pre_commit-4.2.0-py2.py3-none-any.whl", hash = "sha256:a009ca7205f1eb497d10b845e52c838a98b6cdd2102a6c8e4540e94ee75c58bd", size = 220707, upload-time = "2025-03-18T21:35:19.343Z" },
 ]
 
+[[package]]
+name = "protobuf"
+version = "6.33.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0a/03/a1440979a3f74f16cab3b75b0da1a1a7f922d56a8ddea96092391998edc0/protobuf-6.33.1.tar.gz", hash = "sha256:97f65757e8d09870de6fd973aeddb92f85435607235d20b2dfed93405d00c85b", size = 443432, upload-time = "2025-11-13T16:44:18.895Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/f1/446a9bbd2c60772ca36556bac8bfde40eceb28d9cc7838755bc41e001d8f/protobuf-6.33.1-cp310-abi3-win32.whl", hash = "sha256:f8d3fdbc966aaab1d05046d0240dd94d40f2a8c62856d41eaa141ff64a79de6b", size = 425593, upload-time = "2025-11-13T16:44:06.275Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/79/8780a378c650e3df849b73de8b13cf5412f521ca2ff9b78a45c247029440/protobuf-6.33.1-cp310-abi3-win_amd64.whl", hash = "sha256:923aa6d27a92bf44394f6abf7ea0500f38769d4b07f4be41cb52bd8b1123b9ed", size = 436883, upload-time = "2025-11-13T16:44:09.222Z" },
+    { url = "https://files.pythonhosted.org/packages/08/b4/46310463b4f6ceef310f8348786f3cff181cea671578e3d9743ba61a459e/protobuf-6.33.1-py3-none-any.whl", hash = "sha256:d595a9fd694fdeb061a62fbe10eb039cc1e444df81ec9bb70c7fc59ebcb1eafa", size = 170477, upload-time = "2025-11-13T16:44:17.633Z" },
+]
+
 [[package]]
 name = "psutil"
-version = "7.2.1"
+version = "7.1.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/73/cb/09e5184fb5fc0358d110fc3ca7f6b1d033800734d34cac10f4136cfac10e/psutil-7.2.1.tar.gz", hash = "sha256:f7583aec590485b43ca601dd9cea0dcd65bd7bb21d30ef4ddbf4ea6b5ed1bdd3", size = 490253, upload-time = "2025-12-29T08:26:00.169Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e1/88/bdd0a41e5857d5d703287598cbf08dad90aed56774ea52ae071bae9071b6/psutil-7.1.3.tar.gz", hash = "sha256:6c86281738d77335af7aec228328e944b30930899ea760ecf33a4dba66be5e74", size = 489059, upload-time = "2025-11-02T12:25:54.619Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/fb/23/851cadc9764edcc18f0effe7d0bf69f727d4cf2442deb4a9f78d4e4f30f2/psutil-7.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:923f8653416604e356073e6e0bccbe7c09990acef442def2f5640dd0faa9689f", size = 139081, upload-time = "2025-12-29T08:26:12.483Z" },
-    { url = "https://files.pythonhosted.org/packages/59/82/d63e8494ec5758029f31c6cb06d7d161175d8281e91d011a4a441c8a43b5/psutil-7.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:cfbe6b40ca48019a51827f20d830887b3107a74a79b01ceb8cc8de4ccb17b672", size = 134767, upload-time = "2025-12-29T08:26:14.528Z" },
-    { url = "https://files.pythonhosted.org/packages/34/68/d9317542e3f2b180c4306e3f45d3c922d7e86d8ce39f941bb9e2e9d8599e/psutil-7.2.1-cp37-abi3-win_amd64.whl", hash = "sha256:b1b0671619343aa71c20ff9767eced0483e4fc9e1f489d50923738caf6a03c17", size = 136938, upload-time = "2025-12-29T08:26:41.036Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/73/2ce007f4198c80fcf2cb24c169884f833fe93fbc03d55d302627b094ee91/psutil-7.2.1-cp37-abi3-win_arm64.whl", hash = "sha256:0d67c1822c355aa6f7314d92018fb4268a76668a536f133599b91edd48759442", size = 133836, upload-time = "2025-12-29T08:26:43.086Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/82/62d68066e13e46a5116df187d319d1724b3f437ddd0f958756fc052677f4/psutil-7.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:18349c5c24b06ac5612c0428ec2a0331c26443d259e2a0144a9b24b4395b58fa", size = 249642, upload-time = "2025-11-02T12:26:07.447Z" },
+    { url = "https://files.pythonhosted.org/packages/df/ad/c1cd5fe965c14a0392112f68362cfceb5230819dbb5b1888950d18a11d9f/psutil-7.1.3-cp313-cp313t-win_arm64.whl", hash = "sha256:c525ffa774fe4496282fb0b1187725793de3e7c6b29e41562733cae9ada151ee", size = 245518, upload-time = "2025-11-02T12:26:09.719Z" },
+    { url = "https://files.pythonhosted.org/packages/55/4c/c3ed1a622b6ae2fd3c945a366e64eb35247a31e4db16cf5095e269e8eb3c/psutil-7.1.3-cp37-abi3-win_amd64.whl", hash = "sha256:f39c2c19fe824b47484b96f9692932248a54c43799a84282cfe58d05a6449efd", size = 247633, upload-time = "2025-11-02T12:26:33.887Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/ad/33b2ccec09bf96c2b2ef3f9a6f66baac8253d7565d8839e024a6b905d45d/psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1", size = 244608, upload-time = "2025-11-02T12:26:36.136Z" },
 ]
 
 [[package]]
@@ -820,6 +930,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/df/80/fc9d01d5ed37ba4c42ca2b55b4339ae6e200b456be3a1aaddf4a9fa99b8c/pyperclip-1.11.0-py3-none-any.whl", hash = "sha256:299403e9ff44581cb9ba2ffeed69c7aa96a008622ad0c46cb575ca75b5b84273", size = 11063, upload-time = "2025-09-26T14:40:36.069Z" },
 ]
 
+[[package]]
+name = "pyreadline3"
+version = "3.5.4"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" },
+]
+
 [[package]]
 name = "pyrect"
 version = "0.2.0"
@@ -1119,6 +1238,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" },
 ]
 
+[[package]]
+name = "sympy"
+version = "1.14.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "mpmath", marker = "sys_platform == 'win32'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" },
+]
+
 [[package]]
 name = "tomli"
 version = "2.3.0"
@@ -1165,11 +1296,11 @@ wheels = [
 
 [[package]]
 name = "urllib3"
-version = "2.6.3"
+version = "2.5.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" },
 ]
 
 [[package]]
@@ -1185,16 +1316,16 @@ wheels = [
 
 [[package]]
 name = "virtualenv"
-version = "20.36.0"
+version = "20.35.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "distlib", marker = "sys_platform == 'win32'" },
     { name = "filelock", marker = "sys_platform == 'win32'" },
     { name = "platformdirs", marker = "sys_platform == 'win32'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/78/49/87e23d8f742f10f965bce5d6b285fc88a4f436b11daf6b6225d4d66f8492/virtualenv-20.36.0.tar.gz", hash = "sha256:a3601f540b515a7983508113f14e78993841adc3d83710fa70f0ac50f43b23ed", size = 6032237, upload-time = "2026-01-07T17:20:04.975Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/20/28/e6f1a6f655d620846bd9df527390ecc26b3805a0c5989048c210e22c5ca9/virtualenv-20.35.4.tar.gz", hash = "sha256:643d3914d73d3eeb0c552cbb12d7e82adf0e504dbf86a3182f8771a153a1971c", size = 6028799, upload-time = "2025-10-29T06:57:40.511Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/eb/6a/0af36875e0023a1f2d0b66b4051721fc26740e947696922df1665b75e5d3/virtualenv-20.36.0-py3-none-any.whl", hash = "sha256:e7ded577f3af534fd0886d4ca03277f5542053bedb98a70a989d3c22cfa5c9ac", size = 6008261, upload-time = "2026-01-07T17:20:02.87Z" },
+    { url = "https://files.pythonhosted.org/packages/79/0c/c05523fa3181fdf0c9c52a6ba91a23fbf3246cc095f26f6516f9c60e6771/virtualenv-20.35.4-py3-none-any.whl", hash = "sha256:c21c9cede36c9753eeade68ba7d523529f228a403463376cf821eaae2b650f1b", size = 6005095, upload-time = "2025-10-29T06:57:37.598Z" },
 ]
 
 [[package]]

From ea5825b09f9e22bde8563fd94b9a2cca5948fdb1 Mon Sep 17 00:00:00 2001
From: Tianze <tianzeshi_study@outlook.com>
Date: Fri, 30 Jan 2026 09:00:52 +0800
Subject: [PATCH 2/3] replace the default model with Mozilla's distilvit
 (#19530)

Description of user-facing changes:

Replace the default model (previously Xenova/vit-gpt2-image-captioning) with Mozilla's distilvit.

Description of developer-facing changes:

None

Description of development approach:

None
---
 source/_localCaptioner/modelDownloader.py    | 2 +-
 source/config/configSpec.py                  | 2 +-
 source/gui/_localCaptioner/messageDialogs.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/source/_localCaptioner/modelDownloader.py b/source/_localCaptioner/modelDownloader.py
index 476b91dd926..1f725b95f87 100644
--- a/source/_localCaptioner/modelDownloader.py
+++ b/source/_localCaptioner/modelDownloader.py
@@ -660,7 +660,7 @@ def _waitForRetry(self, attempt: int, threadId: int) -> bool:
 	def downloadModelsMultithreaded(
 		self,
 		modelsDir: str = WritePaths.modelsDir,
-		modelName: str = "Xenova/vit-gpt2-image-captioning",
+		modelName: str = "Mozilla/distilvit",
 		filesToDownload: list[str] | None = None,
 		resolvePath: str = "/resolve/main",
 		progressCallback: ProgressCallback | None = None,
diff --git a/source/config/configSpec.py b/source/config/configSpec.py
index 3696b6fc169..9790082625a 100644
--- a/source/config/configSpec.py
+++ b/source/config/configSpec.py
@@ -531,7 +531,7 @@
 
 [automatedImageDescriptions]
 	enable = boolean(default=false)
-	defaultModel = string(default="Xenova/vit-gpt2-image-captioning")
+	defaultModel = string(default="Mozilla/distilvit")
 
 [screenCurtain]
 	enabled = boolean(default=false)
diff --git a/source/gui/_localCaptioner/messageDialogs.py b/source/gui/_localCaptioner/messageDialogs.py
index c7a3e7c32cd..27fa32d7b74 100644
--- a/source/gui/_localCaptioner/messageDialogs.py
+++ b/source/gui/_localCaptioner/messageDialogs.py
@@ -110,7 +110,7 @@ def openDownloadDialog(self) -> None:
 			message=pgettext(
 				"imageDesc",
 				# Translators: label of dialog when downloading image captioning
-				"Image captioning not installed. Would you like to install (235 MB)?",
+				"Image captioning not installed. Would you like to install (178 MB)?",
 			),
 			dialogType=DialogType.WARNING,
 			buttons=confirmationButtons,

From 6e78df165bbc3c0562f422a0a762adff7833a5f7 Mon Sep 17 00:00:00 2001
From: Tianze <tianzeshi_study@outlook.com>
Date: Fri, 30 Jan 2026 13:46:56 +0800
Subject: [PATCH 3/3] fix unit test for image description (#19535)

---
 tests/unit/test_localCaptioner/test_downloader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/test_localCaptioner/test_downloader.py b/tests/unit/test_localCaptioner/test_downloader.py
index f3022a8c2da..946f5322574 100644
--- a/tests/unit/test_localCaptioner/test_downloader.py
+++ b/tests/unit/test_localCaptioner/test_downloader.py
@@ -39,7 +39,7 @@ def test_ensureModelsDirectory_success(self, mockMkdir):
 		"""Ensure directory is created and correct path returned."""
 		mockMkdir.return_value = None
 		modelsDir = self.downloader.ensureModelsDirectory()
-		self.assertTrue(modelsDir.endswith("vit-gpt2-image-captioning"))
+		self.assertTrue(modelsDir.endswith("distilvit"))
 		mockMkdir.assert_called_once()
 
 	@patch("pathlib.Path.mkdir", side_effect=OSError("Permission denied"))