From eb00c18f5bd79b86c62d9b4fbf526e3f1036bc74 Mon Sep 17 00:00:00 2001 From: Sean Budd Date: Fri, 16 Jan 2026 17:27:43 +1100 Subject: [PATCH 1/3] Revert "Revert AI image description work (#19425)" This reverts commit 9f3aecbb095b9721189f443c300fce1db6f8accf. --- .github/workflows/testAndPublish.yml | 1 + pyproject.toml | 4 + source/NVDAState.py | 4 + source/_localCaptioner/__init__.py | 44 + source/_localCaptioner/captioner/__init__.py | 53 ++ source/_localCaptioner/captioner/base.py | 24 + source/_localCaptioner/captioner/vitGpt2.py | 382 +++++++++ source/_localCaptioner/imageDescriber.py | 214 +++++ source/_localCaptioner/modelConfig.py | 277 ++++++ source/_localCaptioner/modelDownloader.py | 760 +++++++++++++++++ source/config/__init__.py | 1 + source/config/configSpec.py | 24 +- source/core.py | 7 + source/globalCommands.py | 37 + source/gui/__init__.py | 5 + source/gui/_localCaptioner/__init__.py | 0 source/gui/_localCaptioner/messageDialogs.py | 198 +++++ source/gui/blockAction.py | 13 + source/gui/settingsDialogs.py | 49 ++ source/setup.py | 7 +- .../libraries/SystemTestSpy/configManager.py | 26 + .../libraries/SystemTestSpy/mockModels.py | 793 ++++++++++++++++++ .../standard-doLoadMockModel.ini | 20 + .../robot/automatedImageDescriptions.py | 43 + .../robot/automatedImageDescriptions.robot | 26 + tests/unit/test_localCaptioner/__init__.py | 0 .../test_localCaptioner/test_captioner.py | 345 ++++++++ .../test_localCaptioner/test_downloader.py | 108 +++ user_docs/en/changes.md | 8 + user_docs/en/userGuide.md | 41 +- uv.lock | 235 ++++-- 31 files changed, 3684 insertions(+), 65 deletions(-) create mode 100644 source/_localCaptioner/__init__.py create mode 100644 source/_localCaptioner/captioner/__init__.py create mode 100644 source/_localCaptioner/captioner/base.py create mode 100644 source/_localCaptioner/captioner/vitGpt2.py create mode 100644 source/_localCaptioner/imageDescriber.py create mode 100644 source/_localCaptioner/modelConfig.py create mode 100644 
source/_localCaptioner/modelDownloader.py create mode 100644 source/gui/_localCaptioner/__init__.py create mode 100644 source/gui/_localCaptioner/messageDialogs.py create mode 100644 tests/system/libraries/SystemTestSpy/mockModels.py create mode 100644 tests/system/nvdaSettingsFiles/standard-doLoadMockModel.ini create mode 100644 tests/system/robot/automatedImageDescriptions.py create mode 100644 tests/system/robot/automatedImageDescriptions.robot create mode 100644 tests/unit/test_localCaptioner/__init__.py create mode 100644 tests/unit/test_localCaptioner/test_captioner.py create mode 100644 tests/unit/test_localCaptioner/test_downloader.py diff --git a/.github/workflows/testAndPublish.yml b/.github/workflows/testAndPublish.yml index f4990055655..928d75875f6 100644 --- a/.github/workflows/testAndPublish.yml +++ b/.github/workflows/testAndPublish.yml @@ -401,6 +401,7 @@ jobs: - startupShutdown - symbols - vscode + - imageDescriptions - chrome_annotations - chrome_list - chrome_table diff --git a/pyproject.toml b/pyproject.toml index 0e41fd3bc15..3fc86063d16 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,6 +48,9 @@ dependencies = [ "l2m4m==1.0.4", "pyyaml==6.0.3", "pymdown-extensions==10.17.1", + # local image caption + "onnxruntime==1.23.2", + "numpy==2.3.5", ] [project.urls] @@ -335,6 +338,7 @@ system-tests = [ "robotframework==7.3.2", "robotremoteserver==1.1.1", "robotframework-screencaplibrary==1.6.0", + "onnx==1.19.1", ] unit-tests = [ # Creating XML unit test reports diff --git a/source/NVDAState.py b/source/NVDAState.py index 6f6b079aab1..f08f227dd83 100644 --- a/source/NVDAState.py +++ b/source/NVDAState.py @@ -67,6 +67,10 @@ def voiceDictsBackupDir(self) -> str: def updatesDir(self) -> str: return os.path.join(self.configDir, "updates") + @property + def modelsDir(self) -> str: + return os.path.join(self.configDir, "models") + @property def nvdaConfigFile(self) -> str: return os.path.join(self.configDir, "nvda.ini") diff --git 
# The single ImageDescriber owned by this package; None while the captioner is not running.
_localCaptioner: ImageDescriber | None = None


def initialize():
    """Create the package-wide image describer.

    The default model configuration objects are prepared first, then a new
    ``ImageDescriber`` is stored as the module singleton.
    """
    global _localCaptioner
    log.debug("Initializing local captioner")
    modelConfig.initialize()
    _localCaptioner = ImageDescriber()


def terminate():
    """Shut the image describer down and forget it.

    Logs an error and does nothing else when no describer is running.
    """
    global _localCaptioner
    describer = _localCaptioner
    if describer is None:
        log.error("local captioner not running")
        return
    log.debug("Terminating local captioner")
    describer.terminate()
    _localCaptioner = None


def isModelLoaded() -> bool:
    """Report whether a caption model is currently loaded.

    :return: The describer's ``isModelLoaded`` flag, or ``False`` when no describer exists.
    """
    return False if _localCaptioner is None else _localCaptioner.isModelLoaded


def toggleImageCaptioning() -> None:
    """Load the model if it is unloaded, unload it if it is loaded.

    Does nothing when the describer has not been initialised.
    """
    if _localCaptioner is None:
        return
    _localCaptioner.toggleSwitch()
def imageCaptionerFactory(
    configPath: str,
    encoderPath: str | None = None,
    decoderPath: str | None = None,
    monomericModelPath: str | None = None,
) -> ImageCaptioner:
    """Create the image captioner that matches the model's ``config.json``.

    The first entry of the config's ``architectures`` list selects the implementation.
    Only ``VisionEncoderDecoderModel`` is handled here, and that implementation is
    constructed from ``encoderPath``, ``decoderPath`` and ``configPath``;
    ``monomericModelPath`` only takes part in the argument validation.

    :param configPath: Path to the configuration file.
    :param encoderPath: Path to the encoder model file.
    :param decoderPath: Path to the decoder model file.
    :param monomericModelPath: Path to a single merged model file.
    :raises ValueError: If neither a single model nor both encoder and decoder are provided.
    :raises FileNotFoundError: If the config file is not found.
    :raises NotImplementedError: If the model architecture is missing from the config or unsupported.
    :raises Exception: If config.json cannot be read or parsed.
    :return: Instance of ImageCaptioner.
    """
    if not monomericModelPath and not (encoderPath and decoderPath):
        raise ValueError(
            "You must provide either 'monomericModelPath' or both 'encoderPath' and 'decoderPath'.",
        )

    try:
        with open(configPath, "r", encoding="utf-8") as f:
            config = json.load(f)
    except FileNotFoundError as e:
        # Keep the exception type (callers use it to offer a download) but add guidance.
        raise FileNotFoundError(
            f"Caption model config file {configPath} not found, "
            "please download models and config file first!",
        ) from e
    except Exception:
        # Anything else (invalid JSON, permissions, ...) is not a "not found" problem.
        log.exception(f"Failed to load caption model config file {configPath}")
        raise

    # A config without a usable "architectures" list is treated as unsupported
    # rather than surfacing as a KeyError/IndexError.
    architectures = config.get("architectures") if isinstance(config, dict) else None
    modelArchitecture = architectures[0] if architectures else None

    if modelArchitecture == "VisionEncoderDecoderModel":
        from .vitGpt2 import VitGpt2ImageCaptioner

        return VitGpt2ImageCaptioner(encoderPath, decoderPath, configPath)

    raise NotImplementedError("Unsupported model architectures")
class VitGpt2ImageCaptioner(ImageCaptioner):
    """Lightweight ONNX Runtime image captioning model.

    Runs a Vision Transformer encoder and a GPT-2 decoder through ONNX Runtime,
    so captions are generated without PyTorch dependencies.
    """

    # Upper bound on the number of captions remembered per instance (see generateCaption).
    _CAPTION_CACHE_SIZE = 16

    def __init__(
        self,
        encoderPath: str,
        decoderPath: str,
        configPath: str,
        enableProfiling: bool = False,
    ) -> None:
        """Initialize the lightweight ONNX image captioning model.

        ``vocab.json`` and ``preprocessor_config.json`` are looked up in the
        directory that contains ``configPath``.

        :param encoderPath: Path to the ViT encoder ONNX model.
        :param decoderPath: Path to the GPT-2 decoder ONNX model.
        :param configPath: Path to the configuration file (required).
        :param enableProfiling: Whether to enable ONNX Runtime profiling.
        :raises FileNotFoundError: If the config, vocabulary or a model file is missing,
            or a model file is not a valid ONNX protobuf.
        :raises Exception: If model initialization fails for any other reason.
        """
        # Imported here so onnxruntime is only loaded once a captioner is actually created.
        import onnxruntime as ort

        # Load configuration file
        try:
            with open(configPath, "r", encoding="utf-8") as f:
                self.config = json.load(f)
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"Caption model config file {configPath} not found, "
                "please download models and config file first!",
            ) from e

        # Load vocabulary from vocab.json in the same directory as config
        configDir = os.path.dirname(configPath)
        vocabPath = os.path.join(configDir, "vocab.json")
        self.vocab = self._loadVocab(vocabPath)
        self.vocabSize = len(self.vocab)

        preprocessorPath = os.path.join(configDir, "preprocessor_config.json")
        self.preprocessorConfig = self._loadPreprocessorConfig(preprocessorPath)

        # Load all model parameters from configuration
        self._loadModelParams()

        # Per-instance caption cache used by generateCaption.
        self._captionCache: dict[tuple[str | bytes, int | None], str] = {}

        # Configure ONNX Runtime session
        sessionOptions = ort.SessionOptions()
        if enableProfiling:
            sessionOptions.enable_profiling = True

        # Load ONNX models
        try:
            self.encoderSession = ort.InferenceSession(encoderPath, sess_options=sessionOptions)
            self.decoderSession = ort.InferenceSession(decoderPath, sess_options=sessionOptions)
        except (
            ort.capi.onnxruntime_pybind11_state.InvalidProtobuf,
            ort.capi.onnxruntime_pybind11_state.NoSuchFile,
        ) as e:
            # Reported as FileNotFoundError so callers can treat it like a missing download.
            raise FileNotFoundError(
                "model file incomplete."
                f" Please check whether the file is complete or re-download. Original error: {e}",
            ) from e

        log.debug(
            f"Loaded ONNX models - Encoder: {os.path.basename(encoderPath)}, Decoder: {os.path.basename(decoderPath)}",
        )
        log.debug(f"Loaded config : {os.path.basename(configPath)}")
        log.debug(f"Loaded vocabulary : {os.path.basename(vocabPath)}")
        log.debug(
            f"Model config - Image size: {self.encoderConfig.image_size}, Max length: {self.decoderConfig.max_length}",
        )

    def _loadModelParams(self) -> None:
        """Build the typed encoder/decoder/generation/model configs from ``self.config``.

        Each section of the loaded JSON is combined with the corresponding default
        configuration object from ``modelConfig``.
        """
        # Load encoder configuration
        encoderDict = self.config.get("encoder", {})
        self.encoderConfig = _createConfigFromDict(
            _EncoderConfig,
            encoderDict,
            modelConfig._DEFAULT_ENCODER_CONFIG,
        )

        # Load decoder configuration
        decoderDict = self.config.get("decoder", {})
        self.decoderConfig = _createConfigFromDict(
            _DecoderConfig,
            decoderDict,
            modelConfig._DEFAULT_DECODER_CONFIG,
        )

        # Load generation configuration
        generationDict = self.config.get("generation", {})
        self.generationConfig = _createConfigFromDict(
            _GenerationConfig,
            generationDict,
            modelConfig._DEFAULT_GENERATION_CONFIG,
        )

        # Load main model configuration (top-level keys of config.json)
        self.modelConfig = _createConfigFromDict(_ModelConfig, self.config, modelConfig._DEFAULT_MODEL_CONFIG)

    def _loadVocab(self, vocabPath: str) -> dict[int, str]:
        """Load vocabulary file.

        :param vocabPath: Path to vocab.json file.
        :raises FileNotFoundError: If vocab.json does not exist.
        :raises Exception: If the file cannot be read or parsed.
        :return: Dictionary mapping token IDs to tokens.
        """
        try:
            with open(vocabPath, "r", encoding="utf-8") as f:
                vocabData = json.load(f)

            # vocab.json maps token -> id; invert it to id -> token for decoding
            vocab = {v: k for k, v in vocabData.items()}
            log.debug(f"Successfully loaded vocabulary with {len(vocab)} tokens")
            return vocab

        except FileNotFoundError:
            log.exception(f"vocab.json not found at {vocabPath}")
            raise
        except Exception:
            log.exception(f"Could not load vocabulary from {vocabPath}")
            raise

    def _loadPreprocessorConfig(self, preprocessorPath: str) -> _PreprocessorConfig:
        """Load preprocessor configuration from preprocessor_config.json.

        :param preprocessorPath: Path to preprocessor_config.json.
        :return: The parsed configuration, or the default one when the file is missing.
        """
        try:
            with open(preprocessorPath, "r", encoding="utf-8") as f:
                preprocessorDict = json.load(f)
        except FileNotFoundError:
            log.warning("Preprocessor config not found, using defaults")
            return modelConfig._DEFAULT_PREPROCESSOR_CONFIG
        else:
            return _createConfigFromDict(
                _PreprocessorConfig,
                preprocessorDict,
                modelConfig._DEFAULT_PREPROCESSOR_CONFIG,
            )

    def _preprocessImage(self, image: str | bytes) -> np.ndarray:
        """Preprocess image for model input using external configuration.

        :param image: Image file path or binary data.
        :raises FileNotFoundError: If ``image`` is a path that does not exist.
        :return: Preprocessed float32 array of shape (1, C, H, W).
        """
        # A str is always treated as a path so that a bad path fails with a clear
        # FileNotFoundError instead of a TypeError from io.BytesIO.
        source = image if isinstance(image, str) else io.BytesIO(image)
        # convert() returns an independent image, so the source can be closed right away.
        with Image.open(source) as opened:
            img = opened.convert("RGB")

        # Resize image if configured
        if self.preprocessorConfig.do_resize:
            size = self.preprocessorConfig.size
            if isinstance(size, int):
                # A bare integer is taken as a square edge length.
                targetSize = (size, size)
            else:
                targetSize = (size["width"], size["height"])
            # Map resample integer to PIL constant
            resampleMap = {
                0: Image.NEAREST,
                1: Image.LANCZOS,
                2: Image.BILINEAR,
                3: Image.BICUBIC,
                4: Image.BOX,
                5: Image.HAMMING,
            }
            # Unknown values fall back to LANCZOS
            resampleMethod = resampleMap.get(self.preprocessorConfig.resample, Image.LANCZOS)
            img = img.resize(targetSize, resampleMethod)

        # Convert to numpy array
        imgArray = np.array(img).astype(np.float32)

        # Rescale if configured (typically from [0, 255] to [0, 1])
        if self.preprocessorConfig.do_rescale:
            imgArray = imgArray * self.preprocessorConfig.rescale_factor

        # Normalize if configured
        if self.preprocessorConfig.do_normalize:
            mean = np.array(self.preprocessorConfig.image_mean, dtype=np.float32)
            std = np.array(self.preprocessorConfig.image_std, dtype=np.float32)
            imgArray = (imgArray - mean) / std

        # Adjust dimensions: (H, W, C) -> (1, C, H, W)
        imgArray = np.transpose(imgArray, (2, 0, 1))
        imgArray = np.expand_dims(imgArray, axis=0)

        return imgArray

    def _encodeImage(self, imageArray: np.ndarray) -> np.ndarray:
        """Encode image using ViT encoder.

        :param imageArray: Preprocessed image array.
        :return: First output of the encoder session (the encoder hidden states).
        """
        # Get encoder input name
        inputName = self.encoderSession.get_inputs()[0].name

        # Run encoder inference
        imageArray = imageArray.astype(np.float32)
        encoderOutputs = self.encoderSession.run(None, {inputName: imageArray})

        return encoderOutputs[0]

    def _decodeTokens(self, tokenIds: list[int]) -> str:
        """Decode token IDs to text.

        IDs missing from the vocabulary and the ``<|endoftext|>`` / ``<|pad|>``
        markers are skipped.

        :param tokenIds: List of token IDs.
        :return: Decoded text string.
        """
        specialTokens = ("<|endoftext|>", "<|pad|>")
        tokens = [
            self.vocab[tokenId]
            for tokenId in tokenIds
            if tokenId in self.vocab and self.vocab[tokenId] not in specialTokens
        ]

        # Ġ (Unicode U+0120) is used by GPT-2 and RoBERTa to indicate space at the beginning
        # of a word in their vocabulary. Tokens without it continue the previous word, so the
        # pieces are concatenated directly and only Ġ becomes a space.
        text = "".join(tokens).replace("Ġ", " ")

        # Basic text cleaning: merge runs of whitespace and trim the ends
        return re.sub(r"\s+", " ", text).strip()

    def _getDecoderInputNames(self) -> list[str]:
        """Get decoder input names for debugging.

        :return: List of decoder input names.
        """
        return [inp.name for inp in self.decoderSession.get_inputs()]

    def _getDecoderOutputNames(self) -> list[str]:
        """Get decoder output names for debugging.

        :return: List of decoder output names.
        """
        return [out.name for out in self.decoderSession.get_outputs()]

    def _initializePastKeyValues(self, batchSize: int = 1) -> dict[str, np.ndarray]:
        """Initialize past_key_values for decoder.

        :param batchSize: Batch size for inference.
        :return: Dictionary with an empty key and value tensor for every decoder layer.
        """
        headDim = self.decoderConfig.n_embd // self.decoderConfig.n_head
        # Shape: (batch_size, num_heads, 0, head_dim); the initial sequence length is 0
        emptyShape = (batchSize, self.decoderConfig.n_head, 0, headDim)

        pastKeyValues = {}
        for layerIdx in range(self.decoderConfig.n_layer):
            pastKeyValues[f"past_key_values.{layerIdx}.key"] = np.zeros(emptyShape, dtype=np.float32)
            pastKeyValues[f"past_key_values.{layerIdx}.value"] = np.zeros(emptyShape, dtype=np.float32)

        return pastKeyValues

    def _generateWithGreedy(
        self,
        encoderHiddenStates: np.ndarray,
        maxLength: int | None = None,
    ) -> str:
        """Generate text using greedy search.

        At every step the token with the highest logit is appended, until the
        end-of-sequence token is produced or a length limit is reached.

        :param encoderHiddenStates: Encoder hidden states.
        :param maxLength: Maximum generation length; defaults to the decoder config's ``max_length``.
        :return: Generated text string.
        """
        if maxLength is None:
            maxLength = self.decoderConfig.max_length

        # Initialize input sequence
        inputIds = np.array([[self.modelConfig.bos_token_id]], dtype=np.int64)
        generatedTokens = []

        # Initialize past_key_values
        pastKeyValues = self._initializePastKeyValues(batchSize=1)

        for step in range(maxLength):
            # The first step feeds the start token; later steps feed only the newest token
            # because earlier positions are carried by past_key_values.
            decoderInputs = {
                "input_ids": inputIds if step == 0 else np.array([[generatedTokens[-1]]], dtype=np.int64),
                "encoder_hidden_states": encoderHiddenStates,
                "use_cache_branch": np.array([1], dtype=np.bool_),
            }

            # Add past_key_values to inputs
            decoderInputs.update(pastKeyValues)

            # Run decoder
            decoderOutputs = self.decoderSession.run(None, decoderInputs)
            logits = decoderOutputs[0]  # Shape: (batch_size, seq_len, vocab_size)

            # Greedy selection of next token
            nextTokenLogits = logits[0, -1, :]  # Logits for last position
            nextTokenId = int(np.argmax(nextTokenLogits))

            # Check if generation should end
            if nextTokenId == self.modelConfig.eos_token_id:
                break

            generatedTokens.append(nextTokenId)

            # Update past_key_values from outputs
            if len(decoderOutputs) > 1:
                for layerIdx in range(self.decoderConfig.n_layer):
                    if len(decoderOutputs) > 1 + layerIdx * 2 + 1:
                        # Output 0 is the logits; layer i's key is read from 1 + 2*i, its value from 2 + 2*i
                        keyIndex = 1 + layerIdx * 2
                        valueIndex = keyIndex + 1
                        pastKeyValues[f"past_key_values.{layerIdx}.key"] = decoderOutputs[keyIndex]
                        pastKeyValues[f"past_key_values.{layerIdx}.value"] = decoderOutputs[valueIndex]

            # Avoid sequences that are too long
            if len(generatedTokens) >= self.decoderConfig.n_ctx - 1:
                break

        # Decode generated text
        return self._decodeTokens(generatedTokens)

    def generateCaption(
        self,
        image: str | bytes,
        maxLength: int | None = None,
    ) -> str:
        """Generate image caption.

        Results are remembered per instance, keyed by ``(image, maxLength)``, so asking
        again for the same input returns immediately. The cache is bounded by
        ``_CAPTION_CACHE_SIZE`` and lives on the instance rather than on the class,
        so it is released together with the captioner.

        :param image: Image file path or binary data.
        :param maxLength: Maximum generation length.
        :return: Generated image caption.
        """
        # setdefault keeps this working for instances created without running __init__.
        cache = vars(self).setdefault("_captionCache", {})
        key = (image, maxLength)

        if key in cache:
            # Removed here and re-inserted below so the entry becomes the most recent one.
            caption = cache.pop(key)
        else:
            imageArray = self._preprocessImage(image)
            encoderHiddenStates = self._encodeImage(imageArray)
            caption = self._generateWithGreedy(encoderHiddenStates, maxLength)
            if len(cache) >= self._CAPTION_CACHE_SIZE:
                # dicts keep insertion order, so the first key is the least recently used
                del cache[next(iter(cache))]

        cache[key] = caption
        return caption
class ImageDescriber:
    """Local image caption functionality.

    Owns the loaded captioner and the worker threads used to load the model
    and to caption a screenshot of the navigator object.
    """

    def __init__(self) -> None:
        self.isModelLoaded = False
        self.captioner: ImageCaptioner | None = None
        self.captionThread: Thread | None = None
        self.loadModelThread: Thread | None = None

        enable = config.conf["automatedImageDescriptions"]["enable"]
        # Load model after startup when enabled in the configuration (may cause high memory usage)
        if enable:
            core.postNvdaStartup.register(self.loadModelInBackground)

    def terminate(self):
        """Wait for running worker threads, then drop the captioner."""
        for t in [self.captionThread, self.loadModelThread]:
            if t is not None and t.is_alive():
                t.join()

        self.captioner = None
        # Keep the flag consistent with the captioner being gone.
        self.isModelLoaded = False

    def runCaption(self, gesture: KeyboardInputGesture) -> None:
        """Script to run image captioning on the current navigator object.

        :param gesture: The input gesture that triggered this script.
        """
        self._doCaption()

    def _doCaption(self) -> None:
        """Real logic to run image captioning on the current navigator object."""
        if not self.isModelLoaded:
            from gui._localCaptioner.messageDialogs import openEnableOnceDialog

            # Ask to enable image desc only in this session, No configuration modifications
            wx.CallAfter(openEnableOnceDialog)
            return

        # Only one caption request is processed at a time.
        if self.captionThread is not None and self.captionThread.is_alive():
            return

        # Captured only once it is certain the screenshot will be used.
        imageData = _screenshotNavigator()

        self.captionThread = threading.Thread(
            target=_messageCaption,
            args=(self.captioner, imageData),
            name="RunCaptionThread",
        )
        # Translators: Message when starting image recognition
        ui.message(pgettext("imageDesc", "getting image description..."))
        self.captionThread.start()

    def _loadModel(self, localModelDirPath: str | None = None) -> None:
        """Load the ONNX model for image captioning.

        On a missing file the download dialog is opened; on any other failure an
        error is announced and logged. ``isModelLoaded`` reflects the outcome.

        :param localModelDirPath: Path of the model directory; defaults to the configured
            default model inside the models directory.
        """

        if not localModelDirPath:
            baseModelsDir = WritePaths.modelsDir
            localModelDirPath = os.path.join(
                baseModelsDir,
                config.conf["automatedImageDescriptions"]["defaultModel"],
            )
        encoderPath = f"{localModelDirPath}/onnx/encoder_model_quantized.onnx"
        decoderPath = f"{localModelDirPath}/onnx/decoder_model_merged_quantized.onnx"
        configPath = f"{localModelDirPath}/config.json"

        try:
            self.captioner = imageCaptionerFactory(
                encoderPath=encoderPath,
                decoderPath=decoderPath,
                configPath=configPath,
            )
        except FileNotFoundError:
            self.isModelLoaded = False
            from gui._localCaptioner.messageDialogs import ImageDescDownloader

            descDownloader = ImageDescDownloader()
            wx.CallAfter(descDownloader.openDownloadDialog)
        except Exception:
            self.isModelLoaded = False
            # Translators: error message when fail to load model
            wx.CallAfter(ui.message, pgettext("imageDesc", "failed to load image captioner"))
            log.exception("Failed to load image captioner model")
        else:
            self.isModelLoaded = True
            # Translators: Message when successfully load the model
            wx.CallAfter(ui.message, pgettext("imageDesc", "image captioning on"))

    def loadModelInBackground(self, localModelDirPath: str | None = None) -> None:
        """Load the model in a child thread.

        A request made while a previous load is still running is ignored, mirroring
        the single-request guard used for captioning.

        :param localModelDirPath: path of model directory
        """
        if self.loadModelThread is not None and self.loadModelThread.is_alive():
            return

        self.loadModelThread = threading.Thread(
            target=self._loadModel,
            args=(localModelDirPath,),
            name="LoadModelThread",
        )
        self.loadModelThread.start()

    def _doReleaseModel(self) -> None:
        """Drop the captioner and announce that captioning is off."""
        hadCaptioner = bool(self.captioner)
        self.captioner = None
        self.isModelLoaded = False
        if hadCaptioner:
            # Translators: Message when image captioning terminates
            ui.message(pgettext("imageDesc", "image captioning off"))

    def toggleSwitch(self) -> None:
        """Load the model when it is unloaded, unload it when it is loaded."""
        if self.isModelLoaded:
            self._doReleaseModel()
        else:
            self.loadModelInBackground()

    def toggleImageCaptioning(self, gesture: KeyboardInputGesture) -> None:
        """Script wrapper around :meth:`toggleSwitch`.

        :param gesture: gesture to toggle this function
        """
        self.toggleSwitch()
+ """ + + image_size: int = 224 + num_channels: int = 3 + patch_size: int = 16 + hidden_size: int = 768 + num_hidden_layers: int = 12 + num_attention_heads: int = 12 + intermediate_size: int = 3072 + hidden_act: str = "gelu" + hidden_dropout_prob: float = 0.0 + attention_probs_dropout_prob: float = 0.0 + initializer_range: float = 0.02 + layer_norm_eps: float = 1e-12 + encoder_stride: int = 16 + qkv_bias: bool = True + model_type: str = "vit" + # Additional fields from HuggingFace config + add_cross_attention: bool = False + is_decoder: bool = False + is_encoder_decoder: bool = False + chunk_size_feed_forward: int = 0 + cross_attention_hidden_size: int | None = None + finetuning_task: str | None = None + output_attentions: bool = False + output_hidden_states: bool = False + return_dict: bool = True + pruned_heads: dict[str, list[int]] | None = None + tie_word_embeddings: bool = True + torch_dtype: str | None = None + torchscript: bool = False + use_bfloat16: bool = False + + +@dataclass(frozen=True) +class _DecoderConfig: + """Configuration for GPT-2 decoder. + + Based on the GPT-2 specification: + https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf + + HuggingFace GPT-2 configuration: + https://huggingface.co/docs/transformers/model_doc/gpt2#transformers.GPT2Config + + Note: Variable names follow the original GPT-2 and HuggingFace conventions + rather than lowerCamelCase to maintain compatibility with pretrained models. 
+ """ + + vocab_size: int = 50257 + n_embd: int = 768 + n_layer: int = 12 + n_head: int = 12 + n_ctx: int = 1024 + n_positions: int = 1024 + n_inner: int | None = None + activation_function: str = "gelu_new" + resid_pdrop: float = 0.1 + embd_pdrop: float = 0.1 + attn_pdrop: float = 0.1 + layer_norm_epsilon: float = 1e-05 + initializer_range: float = 0.02 + model_type: str = "gpt2" + # Generation parameters + max_length: int = 20 + min_length: int = 0 + do_sample: bool = False + early_stopping: bool = False + num_beams: int = 1 + num_beam_groups: int = 1 + diversity_penalty: float = 0.0 + temperature: float = 1.0 + top_k: int = 50 + top_p: float = 1.0 + typical_p: float = 1.0 + repetition_penalty: float = 1.0 + length_penalty: float = 1.0 + no_repeat_ngram_size: int = 0 + encoder_no_repeat_ngram_size: int = 0 + num_return_sequences: int = 1 + # Cross attention + add_cross_attention: bool = True + is_decoder: bool = True + is_encoder_decoder: bool = False + # Token IDs + bos_token_id: int = 50256 + eos_token_id: int = 50256 + pad_token_id: int = 50256 + decoder_start_token_id: int = 50256 + # Additional configuration + chunk_size_feed_forward: int = 0 + cross_attention_hidden_size: int | None = None + bad_words_ids: list[int] | None = None + begin_suppress_tokens: list[int] | None = None + forced_bos_token_id: int | None = None + forced_eos_token_id: int | None = None + suppress_tokens: list[int] | None = None + exponential_decay_length_penalty: float | None = None + remove_invalid_values: bool = False + return_dict_in_generate: bool = False + output_attentions: bool = False + output_hidden_states: bool = False + output_scores: bool = False + use_cache: bool = True + # Labels + id2label: dict[str, str] | None = None + label2id: dict[str, int] | None = None + # Scaling and attention + reorder_and_upcast_attn: bool = False + scale_attn_by_inverse_layer_idx: bool = False + scale_attn_weights: bool = True + # Summary configuration + summary_activation: str | None = None 
@dataclass(frozen=True)
class _GenerationConfig:
	"""Configuration for text generation parameters.

	Based on HuggingFace GenerationConfig:
	https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig

	Note: Variable names follow HuggingFace conventions rather than lowerCamelCase
	to maintain compatibility with the transformers library.

	The dataclass is frozen, so instances are immutable; modified copies are produced with
	``dataclasses.replace`` (this is what ``_createConfigFromDict`` in this module does).
	Every default below equals the default of the same-named generation field on ``_DecoderConfig``.
	Field semantics are those of the HuggingFace attributes of the same name; see the link above.
	"""

	# Decoding strategy.
	# NOTE(review): per the HuggingFace docs, do_sample=False with num_beams=1 selects greedy decoding
	# -- confirm that the captioner's own generation loop honours these flags.
	do_sample: bool = False
	num_beams: int = 1
	# Sampling / logit-shaping knobs (1.0 is the neutral value for the multiplicative ones).
	temperature: float = 1.0
	top_k: int = 50
	top_p: float = 1.0
	repetition_penalty: float = 1.0
	length_penalty: float = 1.0
	# Output length bounds.
	# NOTE(review): HuggingFace counts these in tokens -- confirm against the consumer of this config.
	max_length: int = 20
	min_length: int = 0
	# Beam-search related options.
	early_stopping: bool = False
	diversity_penalty: float = 0.0
	num_beam_groups: int = 1
	# 0 here matches the HuggingFace default; presumably it disables the n-gram repeat check -- verify.
	no_repeat_ngram_size: int = 0
	num_return_sequences: int = 1
+ """ + + model_type: str = "vision-encoder-decoder" + is_encoder_decoder: bool = True + tie_word_embeddings: bool = False + bos_token_id: int = 50256 + eos_token_id: int = 50256 + pad_token_id: int = 50256 + decoder_start_token_id: int = 50256 + transformers_version: str = "4.33.0.dev0" + architectures: list[str] | None = None + + +@dataclass(frozen=True) +class _PreprocessorConfig: + """Configuration for image preprocessing. + + Based on HuggingFace ViTFeatureExtractor/ViTImageProcessor: + https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTFeatureExtractor + https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTImageProcessor + + Note: Variable names follow HuggingFace conventions rather than lowerCamelCase + to maintain compatibility with the transformers library. + """ + + do_normalize: bool = True + do_rescale: bool = True + do_resize: bool = True + feature_extractor_type: str = "ViTFeatureExtractor" + image_processor_type: str = "ViTFeatureExtractor" + image_mean: list[float] | None = None + image_std: list[float] | None = None + resample: int = 2 # PIL.Image.LANCZOS + rescale_factor: float = 0.00392156862745098 # 1/255 + size: dict[str, int] | None = None + + def __post_init__(self): + """Initialize default values for mutable fields.""" + if self.image_mean is None: + object.__setattr__(self, "image_mean", [0.5, 0.5, 0.5]) + if self.image_std is None: + object.__setattr__(self, "image_std", [0.5, 0.5, 0.5]) + if self.size is None: + object.__setattr__(self, "size", {"height": 224, "width": 224}) + + +# Default configuration instances +_DEFAULT_ENCODER_CONFIG: _EncoderConfig | None = None +_DEFAULT_DECODER_CONFIG: _DecoderConfig | None = None +_DEFAULT_GENERATION_CONFIG: _GenerationConfig | None = None +_DEFAULT_MODEL_CONFIG: _ModelConfig | None = None +_DEFAULT_PREPROCESSOR_CONFIG: _PreprocessorConfig | None = None + + +def initialize(): + global \ + _DEFAULT_ENCODER_CONFIG, \ + _DEFAULT_DECODER_CONFIG, \ + 
_DEFAULT_GENERATION_CONFIG, \ + _DEFAULT_MODEL_CONFIG, \ + _DEFAULT_PREPROCESSOR_CONFIG + _DEFAULT_ENCODER_CONFIG = _EncoderConfig() + _DEFAULT_DECODER_CONFIG = _DecoderConfig() + _DEFAULT_GENERATION_CONFIG = _GenerationConfig() + _DEFAULT_MODEL_CONFIG = _ModelConfig() + _DEFAULT_PREPROCESSOR_CONFIG = _PreprocessorConfig() + + +def _createConfigFromDict[T]( + configClass: Type[T], + configdict: dict[str, str | int | float | bool | list | dict | None], + defaultConfig: T, +) -> T: + """Create a dataclass instance from a dictionary with automatic field mapping. + + :param configClass: The dataclass type to create + :param configdict: dictionary containing configuration values + :param defaultConfig: Default configuration instance + :return: New dataclass instance with values from configdict or defaults + """ + # Get all field names from the dataclass + fieldNames = {f.name for f in fields(configClass)} + + # Filter configdict to only include valid field names + validUpdates = {fieldName: value for fieldName, value in configdict.items() if fieldName in fieldNames} + + return replace(defaultConfig, **validUpdates) diff --git a/source/_localCaptioner/modelDownloader.py b/source/_localCaptioner/modelDownloader.py new file mode 100644 index 00000000000..476b91dd926 --- /dev/null +++ b/source/_localCaptioner/modelDownloader.py @@ -0,0 +1,760 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +""" +Multi‑threaded model downloader + +Download ONNX / tokenizer assets from *Hugging Face* (or any HTTP host) +with progress callbacks. Refactored to use requests library. 
class ModelDownloader:
	"""Multi-threaded model downloader with progress tracking and retry logic.

	Files are fetched over a shared ``requests.Session``. Partially downloaded files are resumed
	with HTTP ``Range`` requests when the server supports them. Cancellation is cooperative:
	:meth:`requestCancel` sets a flag that every long-running step polls.
	"""

	def __init__(
		self,
		remoteHost: str = "huggingface.co",
		maxWorkers: int = 4,
		maxRetries: int = MAX_RETRIES,
	):
		"""
		Initialize the ModelDownloader.

		:param remoteHost: Remote host URL (default: huggingface.co).
		:param maxWorkers: Maximum number of worker threads.
		:param maxRetries: Maximum retry attempts per file.
		"""
		self.remoteHost = remoteHost
		self.maxWorkers = maxWorkers
		self.maxRetries = maxRetries

		# Thread control
		self.cancelRequested = False
		self.downloadLock = threading.Lock()
		self.activeFutures = set()

		# Configure requests session with retry strategy and automatic redirects
		self.session = requests.Session()

		# Configure retry strategy
		retryStrategy = Retry(
			# Maximum number of retries before giving up
			total=maxRetries,
			# Base factor for calculating delay between retries
			backoff_factor=BACKOFF_BASE,
			# HTTP status codes that trigger a retry
			status_forcelist=[429, 500, 502, 503, 504],
			# HTTP methods allowed to retry
			allowed_methods=["HEAD", "GET", "OPTIONS"],
		)

		adapter = HTTPAdapter(max_retries=retryStrategy)
		self.session.mount("https://", adapter)
		# constructDownloadUrl accepts explicit "http://" hosts, so give them the same retry policy.
		self.session.mount("http://", adapter)

	def requestCancel(self) -> None:
		"""Request cancellation of all active downloads."""
		log.debug("Cancellation requested")
		self.cancelRequested = True

		# Cancel all active futures.
		# Future.cancel() only prevents futures that have not started yet from running;
		# running downloads stop when they next poll cancelRequested.
		with self.downloadLock:
			for future in self.activeFutures:
				if not future.done():
					future.cancel()
			self.activeFutures.clear()

	def resetCancellation(self) -> None:
		"""Reset cancellation state for new download session."""
		with self.downloadLock:
			self.cancelRequested = False
			self.activeFutures.clear()

	def ensureModelsDirectory(self) -> str:
		"""
		Ensure the directory named by the ``defaultModel`` configuration value exists.

		The value of ``config.conf["automatedImageDescriptions"]["defaultModel"]`` is resolved
		with :func:`os.path.abspath` (i.e. relative to the current working directory) and created
		together with any missing parents.

		:return: Absolute path of the directory.
		:raises OSError: When the directory cannot be created.
		"""
		# NOTE(review): the configSpec default for "defaultModel" is a repository name
		# ("Xenova/vit-gpt2-image-captioning"), not a directory, so this resolves relative to the
		# process working directory. downloadModelsMultithreaded uses WritePaths.modelsDir instead;
		# confirm which location is intended before relying on this method.
		modelsDir = os.path.abspath(config.conf["automatedImageDescriptions"]["defaultModel"])

		try:
			Path(modelsDir).mkdir(parents=True, exist_ok=True)
		except OSError as err:
			raise OSError(f"Failed to create models directory {modelsDir}: {err}") from err
		else:
			log.debug(f"Models directory ensured: {modelsDir}")
			return modelsDir

	def constructDownloadUrl(
		self,
		modelName: str,
		filePath: str,
		resolvePath: str = "/resolve/main",
	) -> str:
		"""
		Construct a full download URL for *Hugging Face‑style* repositories.

		:param modelName: Model repository name, e.g. ``Xenova/vit-gpt2-image-captioning``.
		:param filePath: Path inside the repo.
		:param resolvePath: The branch / ref path, default ``/resolve/main``.
		:return: Complete download URL.
		"""
		remoteHost = self.remoteHost
		if not remoteHost.startswith(("http://", "https://")):
			remoteHost = f"https://{remoteHost}"

		base = remoteHost.rstrip("/")
		model = modelName.strip("/")
		ref = resolvePath.strip("/")
		filePath = filePath.lstrip("/")

		return f"{base}/{model}/{ref}/{filePath}"

	def _getRemoteFileSize(self, url: str) -> int:
		"""
		Get remote file size using HEAD request with automatic redirect handling.

		Falls back to a one-byte ranged GET when HEAD fails or carries no usable ``Content-Length``.
		Network errors and malformed size headers are logged and treated as "size unknown".

		:param url: Remote URL.
		:return: File size in bytes, 0 if unable to determine.
		"""
		if self.cancelRequested:
			return 0

		try:
			# Use HEAD request with automatic redirect following
			response = self.session.head(url, timeout=10, allow_redirects=True)
			try:
				response.raise_for_status()
				contentLength = response.headers.get("Content-Length")
				if contentLength:
					return int(contentLength)
			finally:
				response.close()
		except Exception as e:
			if not self.cancelRequested:
				log.warning(f"Failed to get remote file size (HEAD) for {url}: {e}")

		try:
			# If HEAD doesn't work, try GET with range header to get just 1 byte.
			# stream=True is essential: a server that ignores the Range header answers 200 with
			# the complete file, and without streaming requests would read all of it into memory.
			response = self.session.get(
				url,
				headers={"Range": "bytes=0-0"},
				stream=True,
				timeout=10,
				allow_redirects=True,
			)
			try:
				if response.status_code == 206:  # Partial content
					contentRange = response.headers.get("Content-Range", "")
					if contentRange and "/" in contentRange:
						# The part after "/" is the total length; "*" (unknown) fails int()
						# and is reported as size unknown by the handler below.
						return int(contentRange.split("/")[-1])
			finally:
				response.close()
		except Exception as e:
			if not self.cancelRequested:
				log.warning(f"Failed to get remote file size (GET) for {url}: {e}")

		return 0

	def _reportProgress(
		self,
		callback: ProgressCallback | None,
		fileName: str,
		downloaded: int,
		total: int,
		lastReported: int,
	) -> int:
		"""
		Report download progress if conditions are met.

		:param callback: Progress callback function.
		:param fileName: Name of the file being downloaded.
		:param downloaded: Bytes downloaded so far.
		:param total: Total file size in bytes.
		:param lastReported: Last reported download amount.
		:return: New lastReported value.
		"""
		if not callback or total == 0 or self.cancelRequested:
			return lastReported

		percent = downloaded / total * 100

		# Report progress every 1 MiB or 1% or when complete
		if (
			downloaded - lastReported >= 1_048_576  # 1 MiB
			or abs(percent - lastReported / total * 100) >= 1.0
			or downloaded == total
		):
			callback(fileName, downloaded, total, percent)
			return downloaded

		return lastReported

	def downloadSingleFile(
		self,
		url: str,
		localPath: str,
		progressCallback: ProgressCallback | None = None,
	) -> tuple[bool, str]:
		"""
		Download a single file with resume support and automatic redirect handling.

		Directory-creation and network failures are reported through the returned tuple.
		An ``OSError`` from inspecting an already existing local file can still propagate.

		:param url: Remote URL to download from.
		:param localPath: Local file path to save the downloaded file.
		:param progressCallback: Optional callback function for progress reporting.
		:return: Tuple of (success_flag, status_message).
		"""
		if self.cancelRequested:
			return False, "Download cancelled"

		threadId = threading.current_thread().ident or 0
		fileName = os.path.basename(localPath)

		# Create destination directory
		success, message = self._createDestinationDirectory(localPath)
		if not success:
			return False, message

		# Get remote file size with redirect handling
		remoteSize = self._getRemoteFileSize(url)

		if self.cancelRequested:
			return False, "Download cancelled"

		# Check if file already exists and is complete
		success, message = self._checkExistingFile(
			localPath,
			remoteSize,
			fileName,
			progressCallback,
			threadId,
		)
		if success is not None:
			return success, message

		# Attempt download with retries
		return self._downloadWithRetries(url, localPath, fileName, threadId, progressCallback)

	def _createDestinationDirectory(self, localPath: str) -> tuple[bool, str]:
		"""
		Create the parent directory of ``localPath`` if it doesn't exist.

		:param localPath: Local file path to create directory for.
		:return: Tuple of (success_flag, error_message); the message is empty on success.
		"""
		directory = os.path.dirname(localPath)
		try:
			Path(directory).mkdir(parents=True, exist_ok=True)
		except OSError as err:
			# Name the directory that could not be created, not the file that was to go in it.
			return False, f"Failed to create directory {directory}: {err}"
		return True, ""

	def _checkExistingFile(
		self,
		localPath: str,
		remoteSize: int,
		fileName: str,
		progressCallback: ProgressCallback | None,
		threadId: int,
	) -> tuple[bool | None, str]:
		"""
		Check if file already exists and is complete.

		:param localPath: Local file path to check.
		:param remoteSize: Size of remote file in bytes.
		:param fileName: Base name of the file for progress reporting.
		:param progressCallback: Optional callback function for progress reporting.
		:param threadId: Current thread identifier for logging.
		:return: Tuple of (completion_status, status_message). None status means download should continue.
		:raises OSError: When the size of the existing file cannot be read.
		"""
		if not os.path.exists(localPath):
			return None, ""

		localSize = os.path.getsize(localPath)
		log.debug(f"localSize: {localSize}, remoteSize: {remoteSize}")

		if remoteSize > 0:
			if localSize == remoteSize:
				if progressCallback and not self.cancelRequested:
					progressCallback(fileName, localSize, localSize, 100.0)
				log.debug(f"File already complete: {localPath}")
				return True, f"File already complete: {localPath}"
			elif localSize > remoteSize:
				# Local file is larger than remote, may be corrupted
				log.warning(f"Local file larger than remote, removing: {localPath}")
				try:
					os.remove(localPath)
				except OSError as err:
					# Best effort: the download step overwrites the file when it cannot resume.
					log.debug(f"Could not remove oversized file {localPath}: {err}")
		else:
			# Cannot get remote size, assume file exists if non-empty
			if localSize > 0:
				if progressCallback and not self.cancelRequested:
					progressCallback(fileName, localSize, localSize, 100.0)
				log.debug(f"File already exists (size unknown): {localPath}")
				return True, f"File already exists: {localPath}"

		return None, ""

	def _downloadWithRetries(
		self,
		url: str,
		localPath: str,
		fileName: str,
		threadId: int,
		progressCallback: ProgressCallback | None,
	) -> tuple[bool, str]:
		"""
		Attempt download with retry logic and exponential backoff.

		All exceptions raised by a single attempt are caught here and turned into a retry
		or into the returned failure message.

		:param url: Remote URL to download from.
		:param localPath: Local file path to save the downloaded file.
		:param fileName: Base name of the file for progress reporting.
		:param threadId: Current thread identifier for logging.
		:param progressCallback: Optional callback function for progress reporting.
		:return: Tuple of (success_flag, status_message).
		"""
		for attempt in range(self.maxRetries):
			if self.cancelRequested:
				return False, "Download cancelled"

			log.debug(f"Downloading (attempt {attempt + 1}/{self.maxRetries}): {url}")

			try:
				success, message = self._performSingleDownload(
					url,
					localPath,
					fileName,
					threadId,
					progressCallback,
				)

			except requests.exceptions.HTTPError as e:
				message = self._handleHttpError(e, localPath, fileName, progressCallback, threadId)
				if message.startswith("Download completed"):
					return True, message

			except RequestException as e:
				if self.cancelRequested:
					return False, "Download cancelled"
				message = f"Request error: {str(e)}"

			except Exception as e:
				if self.cancelRequested:
					return False, "Download cancelled"
				message = f"Unexpected error: {str(e)}"
				log.error(message)

			else:
				if success:
					return True, message

			# Reaching this point means the attempt failed; ``message`` holds the reason.
			if not self.cancelRequested:
				log.debug(f"{message} – {url}")
				if attempt < self.maxRetries - 1:
					waitCompleted = self._waitForRetry(attempt, threadId)
					if not waitCompleted:
						return False, "Download cancelled"
			else:
				return False, message

		return False, "Maximum retries exceeded"

	def _performSingleDownload(
		self,
		url: str,
		localPath: str,
		fileName: str,
		threadId: int,
		progressCallback: ProgressCallback | None,
	) -> tuple[bool, str]:
		"""
		Perform a single download attempt with resume support.

		:param url: Remote URL to download from.
		:param localPath: Local file path to save the downloaded file.
		:param fileName: Base name of the file for progress reporting.
		:param threadId: Current thread identifier for logging.
		:param progressCallback: Optional callback function for progress reporting.
		:return: Tuple of (success_flag, status_message).
		:raises requests.exceptions.HTTPError: When HTTP request fails.
		:raises requests.exceptions.RequestException: When network request fails.
		:raises Exception: When file operations or unexpected errors occur.
		"""
		# Check for existing partial file
		resumePos = self._getResumePosition(localPath, threadId)

		# Get response with resume support
		response = self._getDownloadResponse(url, resumePos, localPath, threadId)

		try:
			# Checked inside the try block so that the response is closed on this path too.
			if self.cancelRequested:
				return False, "Download cancelled"

			if response.status_code != 206:
				# The server is sending the whole file, not a continuation. Start from byte 0 so the
				# file is opened for writing (truncating any leftover partial data) and progress is
				# counted from zero instead of from the stale resume offset.
				resumePos = 0

			# Determine total file size
			total = self._calculateTotalSize(response, resumePos)

			if total > 0:
				log.debug(f"Total file size: {total:,} bytes")

			# Download file content
			success, message = self._downloadFileContent(
				response,
				localPath,
				fileName,
				resumePos,
				total,
				progressCallback,
			)

			if not success:
				return False, message

			# Verify download integrity
			return self._verifyDownloadIntegrity(localPath, fileName, total, progressCallback, threadId)

		finally:
			response.close()

	def _getResumePosition(self, localPath: str, threadId: int) -> int:
		"""
		Get resume position for partial download.

		:param localPath: Local file path to check.
		:param threadId: Current thread identifier for logging.
		:return: Byte position to resume from (the current size of the local file, 0 if it is absent).
		:raises OSError: When file operations fail.
		"""
		resumePos = 0
		if os.path.exists(localPath):
			resumePos = os.path.getsize(localPath)
			log.debug(f"Resuming from byte {resumePos}")
		return resumePos

	def _getDownloadResponse(self, url: str, resumePos: int, localPath: str, threadId: int) -> Response:
		"""
		Get download response with resume support and redirect handling.

		The caller owns the returned response and must close it.

		:param url: Remote URL to download from.
		:param resumePos: Byte position to resume from.
		:param localPath: Local file path for cleanup if needed.
		:param threadId: Current thread identifier for logging.
		:return: HTTP response object, opened with ``stream=True``.
		:raises requests.exceptions.HTTPError: When HTTP request fails.
		:raises requests.exceptions.RequestException: When network request fails.
		:raises Exception: When download is cancelled.
		"""
		# Set up headers for resume
		headers = {}
		if resumePos > 0:
			headers["Range"] = f"bytes={resumePos}-"

		# Make request with automatic redirect handling
		response = self.session.get(
			url,
			headers=headers,
			stream=True,
			timeout=10,
			allow_redirects=True,
		)

		# Check if resume is supported.
		# Any non-206 answer to a ranged request (including 416) lands here and triggers a full re-download.
		if resumePos > 0 and response.status_code != 206:
			log.debug("Server doesn't support resume, starting from beginning")
			if os.path.exists(localPath):
				try:
					os.remove(localPath)
				except OSError:
					pass

			if self.cancelRequested:
				response.close()
				raise Exception("Download cancelled")

			# Make new request without range header
			response.close()
			response = self.session.get(url, stream=True, timeout=10, allow_redirects=True)

		try:
			response.raise_for_status()
		except Exception:
			# Nobody else will see this response object, so release its connection here.
			response.close()
			raise
		return response

	def _calculateTotalSize(self, response: Response, resumePos: int) -> int:
		"""
		Calculate total file size from HTTP response headers.

		:param response: HTTP response object.
		:param resumePos: Byte position resumed from.
		:return: Total file size in bytes, 0 when the server did not announce one.
		:raises ValueError: When the Content-Range or Content-Length header is malformed.
		"""
		if response.status_code == 206:
			# Partial content response
			contentRange = response.headers.get("Content-Range", "")
			if contentRange and "/" in contentRange:
				return int(contentRange.split("/")[-1])
			else:
				# Content-Length of a partial response covers only the remaining bytes.
				return int(response.headers.get("Content-Length", "0")) + resumePos
		else:
			return int(response.headers.get("Content-Length", "0"))

	def _downloadFileContent(
		self,
		response: Response,
		localPath: str,
		fileName: str,
		resumePos: int,
		total: int,
		progressCallback: ProgressCallback | None,
	) -> tuple[bool, str]:
		"""
		Download file content with progress reporting and cancellation support.

		Any exception raised while reading the response or writing the file is caught
		and reported through the returned tuple.

		:param response: HTTP response object to read from.
		:param localPath: Local file path to write to.
		:param fileName: Base name of the file for progress reporting.
		:param resumePos: Byte position resumed from; 0 overwrites the file, anything else appends.
		:param total: Total file size in bytes.
		:param progressCallback: Optional callback function for progress reporting.
		:return: Tuple of (success_flag, error_message).
		"""
		downloaded = resumePos
		lastReported = downloaded
		mode = "ab" if resumePos > 0 else "wb"

		try:
			with open(localPath, mode) as fh:
				for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
					if self.cancelRequested:
						return False, "Download cancelled"

					if chunk:  # filter out keep-alive chunks
						fh.write(chunk)
						downloaded += len(chunk)

						if total > 0:
							lastReported = self._reportProgress(
								progressCallback,
								fileName,
								downloaded,
								total,
								lastReported,
							)
		except Exception as e:
			return False, f"Failed to write file: {str(e)}"

		return True, ""

	def _verifyDownloadIntegrity(
		self,
		localPath: str,
		fileName: str,
		total: int,
		progressCallback: ProgressCallback | None,
		threadId: int,
	) -> tuple[bool, str]:
		"""
		Verify download integrity and report final progress.

		Only the size is checked: the file must be non-empty and, when the total is known,
		exactly ``total`` bytes long.

		:param localPath: Local file path to verify.
		:param fileName: Base name of the file for progress reporting.
		:param total: Expected total file size in bytes.
		:param progressCallback: Optional callback function for progress reporting.
		:param threadId: Current thread identifier for logging.
		:return: Tuple of (success_flag, status_message).
		:raises OSError: When file operations fail.
		"""
		if self.cancelRequested:
			return False, "Download cancelled"

		actualSize = os.path.getsize(localPath)

		if actualSize == 0:
			return False, "Downloaded file is empty"

		if total > 0 and actualSize != total:
			return False, f"File incomplete: {actualSize}/{total} bytes downloaded"

		# Final progress callback
		if progressCallback and not self.cancelRequested:
			progressCallback(fileName, actualSize, max(total, actualSize), 100.0)

		log.debug(f"Successfully downloaded: {localPath}")
		return True, "Download completed"

	def _handleHttpError(
		self,
		error: requests.exceptions.HTTPError,
		localPath: str,
		fileName: str,
		progressCallback: ProgressCallback | None,
		threadId: int,
	) -> str:
		"""
		Handle HTTP errors with special handling for range not satisfiable.

		:param error: HTTP error exception.
		:param localPath: Local file path to check for completion.
		:param fileName: Base name of the file for progress reporting.
		:param progressCallback: Optional callback function for progress reporting.
		:param threadId: Current thread identifier for logging.
		:return: ``"Download completed"`` when a 416 arrives for a non-empty local file,
			otherwise an error message containing the HTTP status code.
		:raises OSError: When file operations fail.
		"""
		# A Response is falsy whenever its status is 4xx/5xx, so it must be compared with None
		# rather than tested for truthiness; otherwise the status code is never reported.
		response = error.response
		statusCode = response.status_code if response is not None else None

		if statusCode == 416 and os.path.exists(localPath):  # Range Not Satisfiable
			actualSize = os.path.getsize(localPath)
			if actualSize > 0:
				log.debug(f"File appears to be complete: {localPath}")
				if progressCallback and not self.cancelRequested:
					progressCallback(fileName, actualSize, actualSize, 100.0)
				return "Download completed"

		return f"HTTP {'Error' if statusCode is None else statusCode}: {str(error)}"

	def _waitForRetry(self, attempt: int, threadId: int) -> bool:
		"""
		Wait for retry with exponential backoff and cancellation support.

		:param attempt: Current retry attempt number (0-based); the wait is ``BACKOFF_BASE ** attempt`` seconds.
		:param threadId: Current thread identifier for logging.
		:return: True if wait completed, False if cancelled.
		"""
		wait = BACKOFF_BASE**attempt
		log.debug(f"Waiting {wait}s before retry...")

		# Sleep in one-second slices so that a cancellation is noticed promptly.
		for _ in range(wait):
			if self.cancelRequested:
				return False
			time.sleep(1)

		return True

	def downloadModelsMultithreaded(
		self,
		modelsDir: str = WritePaths.modelsDir,
		modelName: str = "Xenova/vit-gpt2-image-captioning",
		filesToDownload: list[str] | None = None,
		resolvePath: str = "/resolve/main",
		progressCallback: ProgressCallback | None = None,
	) -> tuple[list[str], list[str]]:
		"""
		Download multiple model assets concurrently.

		:param modelsDir: Base *models* directory.
		:param modelName: Repository name.
		:param filesToDownload: Explicit file list; None or an empty list uses the common defaults.
		:param resolvePath: Branch / ref path.
		:param progressCallback: Optional progress callback.
		:return: (successful_paths, failed_paths) tuple of repository-relative paths.
			After a cancellation, files that were not processed appear in neither list.
		:raises ValueError: When ``remoteHost`` or ``modelName`` is empty.
		"""
		if not self.remoteHost or not modelName:
			raise ValueError("remoteHost and modelName cannot be empty")

		# ``or`` means an empty list also selects the defaults, so the list is never empty below.
		filesToDownload = filesToDownload or [
			"onnx/encoder_model_quantized.onnx",
			"onnx/decoder_model_merged_quantized.onnx",
			"config.json",
			"vocab.json",
			"preprocessor_config.json",
		]

		log.debug(
			f"Starting download of {len(filesToDownload)} files for model: {modelName}\n"
			f"Remote host: {self.remoteHost}\nMax workers: {self.maxWorkers}",
		)

		localModelDir = os.path.join(modelsDir, modelName)
		successful: list[str] = []
		failed: list[str] = []

		with ThreadPoolExecutor(max_workers=self.maxWorkers) as executor:
			futures = []

			for path in filesToDownload:
				if self.cancelRequested:
					break

				future = executor.submit(
					self.downloadSingleFile,
					self.constructDownloadUrl(modelName, path, resolvePath),
					os.path.join(localModelDir, path),
					progressCallback,
				)
				futures.append((future, path))

				# Track active futures for cancellation
				with self.downloadLock:
					self.activeFutures.add(future)

			# Process completed futures
			for future, filePath in futures:
				if self.cancelRequested:
					# Cancel futures that have not started and stop collecting results.
					# Leaving the ``with`` block still waits for downloads that are already
					# running; they end quickly because they poll cancelRequested.
					with self.downloadLock:
						for f, _ in futures:
							if not f.done():
								f.cancel()
					break

				# Remove from active futures tracking
				with self.downloadLock:
					self.activeFutures.discard(future)

				try:
					ok, msg = future.result()
					if ok:
						successful.append(filePath)
						log.debug(f"successful {filePath=}")
					else:
						failed.append(filePath)
						log.debug(f"failed: {filePath} - {msg}")
				except Exception as err:
					failed.append(filePath)
					log.debug(f"failed: {filePath} – {err}")

		# Summary
		if not self.cancelRequested:
			log.debug(f"Total: {len(filesToDownload)}")
			log.debug(f"Successful: {len(successful)}")
			log.debug(f"Failed: {len(failed)}")
			log.debug(f"\nLocal model directory: {localModelDir}")
		else:
			log.debug("Download cancelled by user")

		return successful, failed

	def __del__(self):
		"""Clean up the session when the downloader is destroyed."""
		# __init__ may have failed before the session was created, hence the hasattr guard.
		if hasattr(self, "session"):
			self.session.close()
LearningDisability, Blindness, LowVision - impairment = string(default="Blindness") + impairment = string(default="Blindness") # any known language code and sub-code -- could be en-uk, etc - language = string(default="Auto") + language = string(default="Auto") # Any known speech style (falls back to ClearSpeak) - speechStyle = string(default="ClearSpeak") + speechStyle = string(default="ClearSpeak") # Terse, Medium, Verbose - verbosity = string(default="Medium") + verbosity = string(default="Medium") # Change from text speech rate (%) - mathRate = integer(default=100) + mathRate = integer(default=100) # Change from normal pause length (%) - pauseFactor = integer(default=100) + pauseFactor = integer(default=100) # make a sound when starting/ending math speech -- None, Beep - speechSound = string(default="None") + speechSound = string(default="None") # NOTE: not currently working in MathCAT - subjectArea = string(default="General") + subjectArea = string(default="General") # SpellOut (H 2 0), AsCompound (Water) -- not implemented, Off (H sub 2 O) - chemistry = string(default="SpellOut") + chemistry = string(default="SpellOut") # Verbose, Brief, SuperBrief mathSpeak = string(default="Verbose") @@ -529,6 +529,10 @@ # Auto, '.', ',', Custom decimalSeparator = string(default="Auto") +[automatedImageDescriptions] + enable = boolean(default=false) + defaultModel = string(default="Xenova/vit-gpt2-image-captioning") + [screenCurtain] enabled = boolean(default=false) warnOnLoad = boolean(default=true) diff --git a/source/core.py b/source/core.py index a6ff433c74f..113b88ac5bd 100644 --- a/source/core.py +++ b/source/core.py @@ -555,6 +555,7 @@ def _handleNVDAModuleCleanupBeforeGUIExit(): import globalPluginHandler import watchdog import _remoteClient + import _localCaptioner try: import updateCheck @@ -573,6 +574,8 @@ def _handleNVDAModuleCleanupBeforeGUIExit(): # Terminating remoteClient causes it to clean up its menus, so do it here while they still exist 
_terminate(_remoteClient) + _terminate(_localCaptioner) + def _initializeObjectCaches(): """ @@ -916,6 +919,10 @@ def main(): _remoteClient.initialize() + import _localCaptioner + + _localCaptioner.initialize() + if globalVars.appArgs.install or globalVars.appArgs.installSilent: import gui.installerGui diff --git a/source/globalCommands.py b/source/globalCommands.py index 92d009eb518..25984a1ba35 100755 --- a/source/globalCommands.py +++ b/source/globalCommands.py @@ -72,6 +72,7 @@ import synthDriverHandler from utils.displayString import DisplayStringEnum import _remoteClient +import _localCaptioner #: Script category for text review commands. # Translators: The name of a category of NVDA commands. @@ -124,6 +125,9 @@ #: Script category for Remote Access commands. # Translators: The name of a category of NVDA commands. SCRCAT_REMOTE = pgettext("remote", "Remote Access") +#: Script category for image description commands. +# Translators: The name of a category of NVDA commands. +SCRCAT_IMAGE_DESC = pgettext("imageDesc", "Image Descriptions") # Translators: Reported when there are no settings to configure in synth settings ring # (example: when there is no setting for language). @@ -3517,6 +3521,15 @@ def script_activateDocumentFormattingDialog(self, gesture): def script_activateRemoteAccessSettings(self, gesture: "inputCore.InputGesture"): wx.CallAfter(gui.mainFrame.onRemoteAccessSettingsCommand, None) + @script( + # Translators: Input help mode message for go to local captioner settings command. + description=pgettext("imageDesc", "Shows the AI image descriptions settings"), + category=SCRCAT_CONFIG, + ) + @gui.blockAction.when(gui.blockAction.Context.MODAL_DIALOG_OPEN) + def script_activateLocalCaptionerSettings(self, gesture: "inputCore.InputGesture"): + wx.CallAfter(gui.mainFrame.onLocalCaptionerSettingsCommand, None) + @script( # Translators: Input help mode message for go to Add-on Store settings command. 
description=_("Shows NVDA's Add-on Store settings"), @@ -5143,6 +5156,30 @@ def script_repeatLastSpokenInformation(self, gesture: "inputCore.InputGesture") title = _("Last spoken information") ui.browseableMessage(lastSpeechText, title, copyButton=True, closeButton=True) + @script( + description=pgettext( + "imageDesc", + # Translators: Description for the image caption script + "Get an AI-generated image description of the navigator object.", + ), + category=SCRCAT_IMAGE_DESC, + gesture="kb:NVDA+g", + ) + @gui.blockAction.when(gui.blockAction.Context.SCREEN_CURTAIN) + def script_runCaption(self, gesture: "inputCore.InputGesture"): + _localCaptioner._localCaptioner.runCaption(gesture) + + @script( + description=pgettext( + "imageDesc", + # Translators: Description for the toggle image captioning script + "Load or unload the image captioner", + ), + category=SCRCAT_IMAGE_DESC, + ) + def script_toggleImageCaptioning(self, gesture: "inputCore.InputGesture"): + _localCaptioner._localCaptioner.toggleImageCaptioning(gesture) + #: The single global commands instance. 
#: @type: L{GlobalCommands} diff --git a/source/gui/__init__.py b/source/gui/__init__.py index 023e787e177..5391ccce5d3 100644 --- a/source/gui/__init__.py +++ b/source/gui/__init__.py @@ -56,6 +56,7 @@ GeneralSettingsPanel, InputCompositionPanel, KeyboardSettingsPanel, + LocalCaptionerSettingsPanel, MouseSettingsPanel, MultiCategorySettingsDialog, NVDASettingsDialog, @@ -387,6 +388,10 @@ def onUwpOcrCommand(self, evt): def onRemoteAccessSettingsCommand(self, evt): self.popupSettingsDialog(NVDASettingsDialog, RemoteSettingsPanel) + @blockAction.when(blockAction.Context.SECURE_MODE) + def onLocalCaptionerSettingsCommand(self, evt): + self.popupSettingsDialog(NVDASettingsDialog, LocalCaptionerSettingsPanel) + @blockAction.when(blockAction.Context.SECURE_MODE) def onAdvancedSettingsCommand(self, evt: wx.CommandEvent): self.popupSettingsDialog(NVDASettingsDialog, AdvancedPanel) diff --git a/source/gui/_localCaptioner/__init__.py b/source/gui/_localCaptioner/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/source/gui/_localCaptioner/messageDialogs.py b/source/gui/_localCaptioner/messageDialogs.py new file mode 100644 index 00000000000..c7a3e7c32cd --- /dev/null +++ b/source/gui/_localCaptioner/messageDialogs.py @@ -0,0 +1,198 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. 
+# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt + +from gui.message import MessageDialog, DefaultButton, ReturnCode, DialogType +import gui +from _localCaptioner.modelDownloader import ModelDownloader, ProgressCallback +import threading +from threading import Thread +import wx +import ui +import _localCaptioner + + +class ImageDescDownloader: + _downloadThread: Thread | None = None + isOpening: bool = False + + def __init__(self): + self.downloadDict: dict[str, tuple[int, int]] = {} + self.modelDownloader: ModelDownloader | None = None + self._shouldCancel = False + self._progressDialog: wx.ProgressDialog | None = None + self.filesToDownload = [ + "onnx/encoder_model_quantized.onnx", + "onnx/decoder_model_merged_quantized.onnx", + "config.json", + "vocab.json", + "preprocessor_config.json", + ] + + def onDownload(self, progressCallback: ProgressCallback) -> None: + self.modelDownloader = ModelDownloader() + (success, fail) = self.modelDownloader.downloadModelsMultithreaded( + filesToDownload=self.filesToDownload, + progressCallback=progressCallback, + ) + if len(fail) == 0: + wx.CallAfter(self.openSuccessDialog) + else: + wx.CallAfter(self.openFailDialog) + + def openSuccessDialog(self) -> None: + confirmationButton = (DefaultButton.OK.value._replace(defaultFocus=True, fallbackAction=True),) + self._stopped() + + dialog = MessageDialog( + parent=None, + # Translators: title of dialog when download successfully + title=pgettext("imageDesc", "Download successful"), + message=pgettext( + "imageDesc", + # Translators: label of dialog when downloading image captioning + "Image captioning installed successfully.", + ), + dialogType=DialogType.STANDARD, + buttons=confirmationButton, + ) + + if dialog.ShowModal() == ReturnCode.OK: + # load image desc after successful download + if not _localCaptioner.isModelLoaded(): + _localCaptioner.toggleImageCaptioning() + + def openFailDialog(self) 
-> None: + if self._shouldCancel: + return + + confirmationButtons = ( + DefaultButton.YES.value._replace(defaultFocus=True, fallbackAction=False), + DefaultButton.NO.value._replace(defaultFocus=False, fallbackAction=True), + ) + + dialog = MessageDialog( + parent=None, + # Translators: title of dialog when fail to download + title=pgettext("imageDesc", "Download failed"), + message=pgettext( + "imageDesc", + # Translators: label of dialog when fail to download image captioning + "Image captioning download failed. Would you like to retry?", + ), + dialogType=DialogType.WARNING, + buttons=confirmationButtons, + ) + + if dialog.ShowModal() == ReturnCode.YES: + self.doDownload() + else: + self._stopped() + + def openDownloadDialog(self) -> None: + if ImageDescDownloader._downloadThread is not None and ImageDescDownloader._downloadThread.is_alive(): + # Translators: message when image captioning is still downloading + ui.message(pgettext("imageDesc", "image captioning is still downloading, please wait...")) + return + if ImageDescDownloader.isOpening: + return + + confirmationButtons = ( + DefaultButton.YES.value._replace(defaultFocus=True, fallbackAction=False), + DefaultButton.NO.value._replace(defaultFocus=False, fallbackAction=True), + ) + + dialog = MessageDialog( + parent=None, + # Translators: title of dialog when downloading Image captioning + title=pgettext("imageDesc", "Confirm download"), + message=pgettext( + "imageDesc", + # Translators: label of dialog when downloading image captioning + "Image captioning not installed. Would you like to install (235 MB)?", + ), + dialogType=DialogType.WARNING, + buttons=confirmationButtons, + ) + ImageDescDownloader.isOpening = True + + if dialog.ShowModal() == ReturnCode.YES: + self._progressDialog = wx.ProgressDialog( + # Translators: The title of the dialog displayed while downloading image descriptioner. 
+ pgettext("imageDesc", "Downloading Image Descriptioner"), + # Translators: The progress message indicating that a connection is being established. + pgettext("imageDesc", "Connecting"), + style=wx.PD_CAN_ABORT | wx.PD_ELAPSED_TIME | wx.PD_REMAINING_TIME | wx.PD_AUTO_HIDE, + parent=gui.mainFrame, + ) + self.doDownload() + else: + ImageDescDownloader.isOpening = False + + def doDownload(self): + def progressCallback( + fileName: str, + downloadedBytes: int, + totalBytes: int, + _percentage: float, + ) -> None: + """Callback function to capture progress data.""" + self.downloadDict[fileName] = (downloadedBytes, totalBytes) + downloadedSum = sum(d for d, _ in self.downloadDict.values()) + totalSum = sum(t for _, t in self.downloadDict.values()) + ratio = downloadedSum / totalSum if totalSum > 0 else 0.0 + totalProgress = int(ratio * 100) + # update progress when downloading all files to prevent premature stop + if len(self.downloadDict) == len(self.filesToDownload): + # Translators: The progress message indicating that a download is in progress. 
+ cont, skip = self._progressDialog.Update(totalProgress, pgettext("imageDesc", "downloading")) + if not cont: + self._shouldCancel = True + self._stopped() + + ImageDescDownloader._downloadThread = threading.Thread( + target=self.onDownload, + name="ModelDownloadMainThread", + daemon=False, + args=(progressCallback,), + ) + ImageDescDownloader._downloadThread.start() + + def _stopped(self): + self.modelDownloader.requestCancel() + ImageDescDownloader._downloadThread = None + self._progressDialog.Hide() + self._progressDialog.Destroy() + self._progressDialog = None + ImageDescDownloader.isOpening = False + + +def openEnableOnceDialog() -> None: + confirmationButtons = ( + DefaultButton.YES.value._replace(defaultFocus=True, fallbackAction=False), + DefaultButton.NO.value._replace(defaultFocus=False, fallbackAction=True), + ) + + dialog = MessageDialog( + parent=None, + # Translators: title of dialog when enable image desc + title=pgettext("imageDesc", "Enable AI image descriptions"), + message=pgettext( + "imageDesc", + # Translators: label of dialog when enable image desc + "AI image descriptions are currently disabled." + "\n\n" + "Warning: AI image descriptions are experimental. " + "Do not use this feature in circumstances where inaccurate descriptions could cause harm." 
+ "\n\n" + "Would you like to temporarily enable AI image descriptions now?", + ), + dialogType=DialogType.STANDARD, + buttons=confirmationButtons, + ) + + if dialog.ShowModal() == ReturnCode.YES: + # load image desc in this session + if not _localCaptioner.isModelLoaded(): + _localCaptioner.toggleImageCaptioning() diff --git a/source/gui/blockAction.py b/source/gui/blockAction.py index 008ade69abf..21fbe4ab8f3 100644 --- a/source/gui/blockAction.py +++ b/source/gui/blockAction.py @@ -39,6 +39,14 @@ def _isRemoteAccessDisabled() -> bool: return not remoteRunning() +def _isScreenCurtainEnabled() -> bool: + """Whether screen curtain functionality is **enabled**.""" + # Import late to avoid circular import + from screenCurtain import screenCurtain + + return screenCurtain is not None and screenCurtain.enabled + + @dataclass class _Context: blockActionIf: Callable[[], bool] @@ -86,6 +94,11 @@ class Context(_Context, Enum): # Translators: Reported when an action cannot be performed because Remote Access functionality is disabled. pgettext("remote", "Action unavailable when Remote Access is disabled"), ) + SCREEN_CURTAIN = ( + lambda: _isScreenCurtainEnabled(), + # Translators: Reported when an action cannot be performed because screen curtain is enabled. + _("Action unavailable while screen curtain is enabled"), + ) def when(*contexts: Context): diff --git a/source/gui/settingsDialogs.py b/source/gui/settingsDialogs.py index 2370134380a..507504d9ef5 100644 --- a/source/gui/settingsDialogs.py +++ b/source/gui/settingsDialogs.py @@ -4027,6 +4027,53 @@ def onSave(self): _remoteClient.terminate() +class LocalCaptionerSettingsPanel(SettingsPanel): + """Settings panel for Local captioner configuration.""" + + # Translators: This is the label for the local captioner settings panel. 
+ title = pgettext("imageDesc", "AI Image Descriptions") + helpId = "LocalCaptionerSettings" + panelDescription = pgettext( + "imageDesc", + # Translators: This is a label appearing on the AI Image Descriptions settings panel. + "Warning: AI image descriptions are experimental. " + "Do not use this feature in circumstances where inaccurate descriptions could cause harm.", + ) + + def makeSettings(self, settingsSizer: wx.BoxSizer): + """Create the settings controls for the panel. + + :param settingsSizer: The sizer to add settings controls to. + """ + + sHelper = guiHelper.BoxSizerHelper(self, sizer=settingsSizer) + + self.windowText = sHelper.addItem( + wx.StaticText(self, label=self.panelDescription), + ) + self.windowText.Wrap(self.scaleSize(PANEL_DESCRIPTION_WIDTH)) + + self.enable = sHelper.addItem( + # Translators: A configuration in settings dialog. + wx.CheckBox(self, label=pgettext("imageDesc", "Enable image captioner")), + ) + self.enable.SetValue(config.conf["automatedImageDescriptions"]["enable"]) + self.bindHelpEvent("LocalCaptionToggle", self.enable) + + def onSave(self) -> None: + """Save the configuration settings.""" + enabled = self.enable.GetValue() + oldEnabled = config.conf["automatedImageDescriptions"]["enable"] + + if enabled != oldEnabled: + import _localCaptioner + + if enabled != _localCaptioner.isModelLoaded(): + _localCaptioner.toggleImageCaptioning() + + config.conf["automatedImageDescriptions"]["enable"] = enabled + + class TouchInteractionPanel(SettingsPanel): # Translators: This is the label for the touch interaction settings panel. title = _("Touch Interaction") @@ -6089,6 +6136,7 @@ class NVDASettingsDialog(MultiCategorySettingsDialog): DocumentNavigationPanel, MathSettingsPanel, RemoteSettingsPanel, + LocalCaptionerSettingsPanel, ] # In secure mode, add-on update is disabled, so AddonStorePanel should not appear since it only contains # add-on update related controls. 
@@ -6117,6 +6165,7 @@ def _doOnCategoryChange(self): or isinstance(self.currentCategory, GeneralSettingsPanel) or isinstance(self.currentCategory, AddonStorePanel) or isinstance(self.currentCategory, RemoteSettingsPanel) + or isinstance(self.currentCategory, LocalCaptionerSettingsPanel) or isinstance(self.currentCategory, MathSettingsPanel) or isinstance(self.currentCategory, PrivacyAndSecuritySettingsPanel) ): diff --git a/source/setup.py b/source/setup.py index d7cb15b0029..bba5f0dbcc3 100755 --- a/source/setup.py +++ b/source/setup.py @@ -213,8 +213,6 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]: # winxptheme is optionally used by wx.lib.agw.aui. # We don't need this. "winxptheme", - # numpy is an optional dependency of comtypes but we don't require it. - "numpy", # multiprocessing isn't going to work in a frozen environment "multiprocessing", "concurrent.futures.process", @@ -246,6 +244,8 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]: "mdx_truly_sane_lists", "mdx_gh_links", "pymdownx", + # Required for local image captioning + "numpy", ], "includes": [ "nvdaBuiltin", @@ -253,6 +253,9 @@ def _genManifestTemplate(shouldHaveUIAccess: bool) -> tuple[int, int, bytes]: "bisect", # robotremoteserver (for system tests) depends on xmlrpc.server "xmlrpc.server", + # required for import numpy without error + "numpy._core._exceptions", + "numpy._core._multiarray_umath", ], }, data_files=[ diff --git a/tests/system/libraries/SystemTestSpy/configManager.py b/tests/system/libraries/SystemTestSpy/configManager.py index fda1830f67d..ae9286af9ea 100644 --- a/tests/system/libraries/SystemTestSpy/configManager.py +++ b/tests/system/libraries/SystemTestSpy/configManager.py @@ -105,6 +105,9 @@ def setupProfile( _pJoin(repoRoot, "tests", "system", "nvdaSettingsFiles", settingsFileName), _pJoin(stagingDir, "nvdaProfile", "nvda.ini"), ) + if _shouldGenerateMockModel(_pJoin(stagingDir, "nvdaProfile", "nvda.ini")): + 
_configModels(_pJoin(stagingDir, "nvdaProfile", "models", "mock", "vit-gpt2-image-captioning")) + if gesturesFileName is not None: opSys.copy_file( # Despite duplication, specify full paths for clarity. @@ -128,3 +131,26 @@ def teardownProfile(stagingDir: str): _pJoin(stagingDir, "nvdaProfile"), recursive=True, ) + + +def _configModels(modelsDirectory: str) -> None: + from .mockModels import MockVisionEncoderDecoderGenerator + + generator = MockVisionEncoderDecoderGenerator(randomSeed=8) + generator.generateAllFiles(modelsDirectory) + + +def _shouldGenerateMockModel(iniPath: str) -> bool: + # Read original lines + with open(iniPath, "r", encoding="utf-8") as f: + lines = f.readlines() + + for line in lines: + # Detect section headers + stripLine = line.strip() + if stripLine.startswith("[") and stripLine.endswith("]"): + hasCaptionSection = stripLine.lower() == "[automatedimagedescriptions]" + if hasCaptionSection: + return True + else: + continue diff --git a/tests/system/libraries/SystemTestSpy/mockModels.py b/tests/system/libraries/SystemTestSpy/mockModels.py new file mode 100644 index 00000000000..896b3901fb2 --- /dev/null +++ b/tests/system/libraries/SystemTestSpy/mockModels.py @@ -0,0 +1,793 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 or later. +# For more details see: https://www.gnu.org/licenses/gpl-2.0.html +""" +Mock Vision-Encoder-Decoder Model Generator + +This module provides a class to generate mock ONNX models and configuration files +for a Vision-Encoder-Decoder model (ViT-GPT2 style) used for image captioning. +The generated files can be used for testing and development purposes. 
+""" + +import os +import json +from pathlib import Path +from typing import Any + +import numpy as np +import onnx +from onnx import helper, TensorProto, numpy_helper + + +class MockVisionEncoderDecoderGenerator: + """ + A class to generate mock ONNX models and configuration files for a + Vision-Encoder-Decoder model architecture. + + This generator creates: + - onnx/encoder_model_quantized.onnx: Vision Transformer encoder + - onnx/decoder_model_merged_quantized.onnx: GPT-2 style decoder + - config.json: Model configuration + - vocab.json: Vocabulary mapping + """ + + def __init__(self, randomSeed: int = 8): + """ + Initialize the mock model generator. + + :param randomSeed (int): Random seed for reproducible weight generation.Defaults to 8. + """ + self.randomSeed = randomSeed + self._setRandomSeed() + + # Model hyperparameters + self.vocab_size = 100 + self.hidden_size = 64 + self.n_layers = 12 + self.image_size = 224 + self.patch_size = 16 + self.num_channels = 3 + + # Derived parameters + self.num_patches = (self.image_size // self.patch_size) ** 2 + + def _setRandomSeed(self) -> None: + """Set random seed for reproducible results.""" + np.random.seed(self.randomSeed) + + def generateAllFiles(self, outputDir: str) -> None: + """ + Generate all mock model files in the specified directory. + + :param outputDir (str): Target directory to create the model files. Will create the directory if it doesn't exist. 
+ """ + outputPath = Path(outputDir) + outputPath.mkdir(parents=True, exist_ok=True) + + # Create onnx subdirectory + onnxDir = outputPath / "onnx" + onnxDir.mkdir(exist_ok=True) + + # Generate all components + self._generateEncoderModel(os.path.join(onnxDir, "encoder_model_quantized.onnx")) + self._generateDecoderModel(os.path.join(onnxDir, "decoder_model_merged_quantized.onnx")) + self._generateConfigFile(os.path.join(outputPath, "config.json")) + self._generateVocabFile(os.path.join(outputPath, "vocab.json")) + + def _generateEncoderModel(self, outputPath: Path) -> None: + """ + Generate the Vision Transformer encoder ONNX model. + + This creates a simplified ViT encoder that performs patch embedding + using convolution followed by reshaping operations. + + :param outputPath (Path): Output path for the encoder ONNX file. + """ + # Define input and output specifications + pixelValues = helper.make_tensor_value_info( + "pixelValues", + TensorProto.FLOAT, + ["batch", self.num_channels, self.image_size, self.image_size], + ) + + patchEmbeds = helper.make_tensor_value_info( + "patchEmbeds", + TensorProto.FLOAT, + ["batch", self.num_patches, self.hidden_size], + ) + + # Generate random but reproducible weights for patch embedding + convWeights = np.random.randn( + self.hidden_size, + self.num_channels, + self.patch_size, + self.patch_size, + ).astype(np.float32) + + convBias = np.zeros(self.hidden_size, dtype=np.float32) + + # Create initializers + weightInit = numpy_helper.from_array(convWeights, "convWeights") + biasInit = numpy_helper.from_array(convBias, "convBias") + + # Shape constant for reshaping + targetShape = np.array([0, self.num_patches, self.hidden_size], dtype=np.int64) + shapeInit = numpy_helper.from_array(targetShape, "targetShape") + + # Define computation nodes + nodes = [ + # Patch embedding using convolution + helper.make_node( + "Conv", + inputs=["pixelValues", "convWeights", "convBias"], + outputs=["conv_output"], + 
kernel_shape=[self.patch_size, self.patch_size], + strides=[self.patch_size, self.patch_size], + ), + # Transpose to get correct dimension order + # From [batch, hidden_size, patch_h, patch_w] to [batch, patch_h, patch_w, hidden_size] + helper.make_node( + "Transpose", + inputs=["conv_output"], + outputs=["transposed_output"], + perm=[0, 2, 3, 1], + ), + # Reshape to flatten patches + # From [batch, patch_h, patch_w, hidden_size] to [batch, num_patches, hidden_size] + helper.make_node( + "Reshape", + inputs=["transposed_output", "targetShape"], + outputs=["patchEmbeds"], + ), + ] + + # Create and save the model + graph = helper.make_graph( + nodes=nodes, + name="VisionTransformerEncoder", + inputs=[pixelValues], + outputs=[patchEmbeds], + initializer=[weightInit, biasInit, shapeInit], + ) + + model = helper.make_model(graph, producer_name="mock-vit-encoder") + model.opset_import[0].version = 13 + model.ir_version = 10 + + onnx.save(model, str(outputPath)) + + def _generateDecoderModel(self, outputPath: Path) -> None: + """ + Generate the GPT-2 style decoder ONNX model. + + This creates a simplified decoder that accepts multiple inputs including + token IDs, encoder hidden states, cache flags, and past key-value pairs. + + :param outputPath (Path): Output path for the decoder ONNX file. 
+ """ + # Generate fixed random weights for reproducibility + embeddingWeights = np.random.randn( + self.vocab_size, + self.hidden_size, + ).astype(np.float32) + + projectionWeights = np.random.randn( + self.hidden_size, + self.vocab_size, + ).astype(np.float32) + + # Create weight initializers + embInit = numpy_helper.from_array(embeddingWeights, "embeddingWeights") + projInit = numpy_helper.from_array(projectionWeights, "projectionWeights") + + # Define all input specifications + inputs = self._createDecoderInputs() + + # Define output specification + outputs = [ + helper.make_tensor_value_info( + "logits", + TensorProto.FLOAT, + ["batch", "seq", self.vocab_size], + ), + ] + + # Create computation nodes + nodes = self._createDecoderNodes() + + # Create shape and scaling constants + shapeConstants = self._createDecoderConstants() + + # Combine all initializers + initializers = [embInit, projInit] + shapeConstants + + # Create and save the model + graph = helper.make_graph( + nodes=nodes, + name="GPT2DecoderWithCache", + inputs=inputs, + outputs=outputs, + initializer=initializers, + ) + + model = helper.make_model(graph, producer_name="mock-gpt2-decoder") + model.opset_import[0].version = 13 + model.ir_version = 10 + + onnx.save(model, str(outputPath)) + + def _createDecoderInputs(self) -> list: + """ + Create input specifications for the decoder model. + + :return: list: List of tensor value info objects for all decoder inputs. 
+ """ + inputs = [] + + # Primary inputs + inputs.extend( + [ + helper.make_tensor_value_info( + "input_ids", + TensorProto.INT64, + ["batch", "seq"], + ), + helper.make_tensor_value_info( + "encoder_hidden_states", + TensorProto.FLOAT, + ["batch", "enc_seq_len", self.hidden_size], + ), + helper.make_tensor_value_info( + "use_cache_branch", + TensorProto.BOOL, + ["batch"], + ), + ], + ) + + # Past key-value cache inputs for each layer + for layerIdx in range(self.n_layers): + inputs.extend( + [ + helper.make_tensor_value_info( + f"past_key_values.{layerIdx}.key", + TensorProto.FLOAT, + ["batch", "num_heads", "past_seq_len", self.hidden_size], + ), + helper.make_tensor_value_info( + f"past_key_values.{layerIdx}.value", + TensorProto.FLOAT, + ["batch", "num_heads", "past_seq_len", self.hidden_size], + ), + ], + ) + + return inputs + + def _createDecoderNodes(self) -> list: + """ + Create computation nodes for the decoder model. + + :return: list: List of ONNX nodes defining the decoder computation. 
+ """ + nodes = [] + + # Token embedding lookup + nodes.append( + helper.make_node( + "Gather", + inputs=["embeddingWeights", "input_ids"], + outputs=["token_embeddings"], + axis=0, + ), + ) + + # Process encoder hidden states + nodes.extend(self._createEncoderProcessingNodes()) + + # Process cache branch flag + nodes.extend(self._createCacheProcessingNodes()) + + # Process past key-value pairs + cacheFeatures = self._createCacheFeatureNodes(nodes) + + # Combine all auxiliary features + nodes.extend(self._createFeatureCombinationNodes(cacheFeatures)) + + # Apply main computation pipeline + nodes.extend(self._createMainComputationNodes()) + + return nodes + + def _createEncoderProcessingNodes(self) -> list: + """Create nodes to process encoder hidden states.""" + return [ + # Global average pooling over encoder states + helper.make_node( + "ReduceMean", + inputs=["encoder_hidden_states"], + outputs=["encoder_pooled"], + axes=[1, 2], # Pool over sequence length and hidden dimensions + ), + # Reshape for broadcasting + helper.make_node( + "Reshape", + inputs=["encoder_pooled", "shapeBatch1"], + outputs=["encoder_feature"], + ), + ] + + def _createCacheProcessingNodes(self) -> list: + """Create nodes to process the cache branch flag.""" + return [ + # Convert boolean to float + helper.make_node( + "Cast", + inputs=["use_cache_branch"], + outputs=["cache_flag_float"], + to=TensorProto.FLOAT, + ), + # Reshape for broadcasting + helper.make_node( + "Reshape", + inputs=["cache_flag_float", "shapeBatch1"], + outputs=["cache_flag_feature"], + ), + ] + + def _createCacheFeatureNodes(self, nodes: list) -> list: + """ + Create nodes to process past key-value cache inputs. + + :param nodes (list): List to append new nodes to. + :return: list: Names of cache feature tensors. 
+ """ + cacheFeatures = [] + + for layerIdx in range(self.n_layers): + # Process key cache + nodes.extend( + [ + helper.make_node( + "ReduceMean", + inputs=[f"past_key_values.{layerIdx}.key"], + outputs=[f"cache_key_{layerIdx}_pooled"], + axes=[1, 2, 3], # Global pooling, keep only batch dimension + ), + helper.make_node( + "Reshape", + inputs=[f"cache_key_{layerIdx}_pooled", "shapeBatch1"], + outputs=[f"cache_key_{layerIdx}_feature"], + ), + ], + ) + + # Process value cache + nodes.extend( + [ + helper.make_node( + "ReduceMean", + inputs=[f"past_key_values.{layerIdx}.value"], + outputs=[f"cache_value_{layerIdx}_pooled"], + axes=[1, 2, 3], + ), + helper.make_node( + "Reshape", + inputs=[f"cache_value_{layerIdx}_pooled", "shapeBatch1"], + outputs=[f"cache_value_{layerIdx}_feature"], + ), + ], + ) + + cacheFeatures.extend( + [ + f"cache_key_{layerIdx}_feature", + f"cache_value_{layerIdx}_feature", + ], + ) + + return cacheFeatures + + def _createFeatureCombinationNodes(self, cacheFeatures: list) -> list: + """ + Create nodes to combine all auxiliary features. + + :param cacheFeatures (list): List of cache feature tensor names. + :return: list: Nodes for feature combination. 
+ """ + nodes = [] + allFeatures = ["encoder_feature", "cache_flag_feature"] + cacheFeatures + + # Sequentially add all features together + currentSum = allFeatures[0] + for i, feature in enumerate(allFeatures[1:], 1): + nodes.append( + helper.make_node( + "Add", + inputs=[currentSum, feature], + outputs=[f"combined_features_{i}"], + ), + ) + currentSum = f"combined_features_{i}" + + return nodes + + def _createMainComputationNodes(self) -> list: + """Create the main computation pipeline nodes.""" + finalCombined = f"combined_features_{self.n_layers * 2 + 1}" + + return [ + # Flatten token embeddings + helper.make_node( + "Reshape", + inputs=["token_embeddings", "shape2d"], + outputs=["embeddings_flat"], + ), + # Scale embeddings + helper.make_node( + "Mul", + inputs=["embeddings_flat", "featureScale"], + outputs=["scaled_embeddings"], + ), + # Add auxiliary features (broadcasting) + helper.make_node( + "Add", + inputs=["scaled_embeddings", finalCombined], + outputs=["final_features"], + ), + # Project to vocabulary space + helper.make_node( + "MatMul", + inputs=["final_features", "projectionWeights"], + outputs=["logits_flat"], + ), + # Reshape back to 3D + helper.make_node( + "Reshape", + inputs=["logits_flat", "shape3d"], + outputs=["logits"], + ), + ] + + def _createDecoderConstants(self) -> list: + """ + Create constant tensors needed for decoder computation. + + :returns: list: List of constant tensor initializers. 
+ """ + constants = [] + + # Shape constants for reshaping operations + shape2d = numpy_helper.from_array( + np.array([-1, self.hidden_size], dtype=np.int64), + name="shape2d", + ) + + shape3d = numpy_helper.from_array( + np.array([0, -1, self.vocab_size], dtype=np.int64), + name="shape3d", + ) + + shapeBatch1 = numpy_helper.from_array( + np.array([-1, 1], dtype=np.int64), + name="shapeBatch1", + ) + + # Feature scaling factor + featureScale = numpy_helper.from_array( + np.array([[1.1]], dtype=np.float32), + name="featureScale", + ) + + constants.extend([shape2d, shape3d, shapeBatch1, featureScale]) + + return constants + + def _generateConfigFile(self, outputPath: Path) -> None: + """ + Generate the model configuration JSON file. + + :param outputPath (Path): Output path for the config.json file. + """ + config = self._getModelConfig() + + with open(outputPath, "w", encoding="utf-8") as f: + json.dump(config, f, indent=2, ensure_ascii=False) + + def _getModelConfig(self) -> dict[str, Any]: + """ + Get the complete model configuration dictionary. + + :return: dict[str, Any]: Complete model configuration. 
+ """ + return { + "_name_or_path": "nlpconnect/vit-gpt2-image-captioning", + "architectures": ["VisionEncoderDecoderModel"], + "bos_token_id": 99, + "decoder": self._getDecoderConfig(), + "decoder_start_token_id": 99, + "encoder": self._getEncoderConfig(), + "eos_token_id": 99, + "is_encoder_decoder": True, + "model_type": "vision-encoder-decoder", + "pad_token_id": 99, + "tie_word_embeddings": False, + "transformers_version": "4.33.0.dev0", + } + + def _getDecoderConfig(self) -> dict[str, Any]: + """Get decoder-specific configuration.""" + return { + "_name_or_path": "", + "activation_function": "gelu_new", + "add_cross_attention": True, + "architectures": ["GPT2LMHeadModel"], + "attn_pdrop": 0.1, + "bad_words_ids": None, + "begin_suppress_tokens": None, + "bos_token_id": 99, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": None, + "decoder_start_token_id": 99, + "diversity_penalty": 0.0, + "do_sample": False, + "early_stopping": False, + "embd_pdrop": 0.1, + "encoder_no_repeat_ngram_size": 0, + "eos_token_id": 99, + "exponential_decay_length_penalty": None, + "finetuning_task": None, + "forced_bos_token_id": None, + "forced_eos_token_id": None, + "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, + "initializer_range": 0.02, + "is_decoder": True, + "is_encoder_decoder": False, + "label2id": {"LABEL_0": 0, "LABEL_1": 1}, + "layer_norm_epsilon": 1e-05, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "gpt2", + "n_ctx": 1024, + "n_embd": 768, + "n_head": 12, + "n_inner": None, + "n_layer": 12, + "n_positions": 1024, + "no_repeat_ngram_size": 0, + "num_beam_groups": 1, + "num_beams": 1, + "num_return_sequences": 1, + "output_attentions": False, + "output_hidden_states": False, + "output_scores": False, + "pad_token_id": 99, + "prefix": None, + "problem_type": None, + "pruned_heads": {}, + "remove_invalid_values": False, + "reorder_and_upcast_attn": False, + "repetition_penalty": 1.0, + "resid_pdrop": 0.1, + "return_dict": 
True, + "return_dict_in_generate": False, + "scale_attn_by_inverse_layer_idx": False, + "scale_attn_weights": True, + "sep_token_id": None, + "summary_activation": None, + "summary_first_dropout": 0.1, + "summary_proj_to_labels": True, + "summary_type": "cls_index", + "summary_use_proj": True, + "suppress_tokens": None, + "task_specific_params": { + "text-generation": { + "do_sample": True, + "max_length": 50, + }, + }, + "temperature": 1.0, + "tf_legacy_loss": False, + "tie_encoder_decoder": False, + "tie_word_embeddings": True, + "tokenizer_class": None, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": None, + "torchscript": False, + "typical_p": 1.0, + "use_bfloat16": False, + "use_cache": True, + "vocab_size": self.vocab_size, + } + + def _getEncoderConfig(self) -> dict[str, Any]: + """Get encoder-specific configuration.""" + return { + "_name_or_path": "", + "add_cross_attention": False, + "architectures": ["ViTModel"], + "attention_probs_dropout_prob": 0.0, + "bad_words_ids": None, + "begin_suppress_tokens": None, + "bos_token_id": None, + "chunk_size_feed_forward": 0, + "cross_attention_hidden_size": None, + "decoder_start_token_id": None, + "diversity_penalty": 0.0, + "do_sample": False, + "early_stopping": False, + "encoder_no_repeat_ngram_size": 0, + "encoder_stride": 16, + "eos_token_id": None, + "exponential_decay_length_penalty": None, + "finetuning_task": None, + "forced_bos_token_id": None, + "forced_eos_token_id": None, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.0, + "hidden_size": 768, + "id2label": {"0": "LABEL_0", "1": "LABEL_1"}, + "image_size": self.image_size, + "initializer_range": 0.02, + "intermediate_size": 3072, + "is_decoder": False, + "is_encoder_decoder": False, + "label2id": {"LABEL_0": 0, "LABEL_1": 1}, + "layer_norm_eps": 1e-12, + "length_penalty": 1.0, + "max_length": 20, + "min_length": 0, + "model_type": "vit", + "no_repeat_ngram_size": 0, + "num_attention_heads": 12, + "num_beam_groups": 1, + "num_beams": 1, + 
"num_channels": self.num_channels, + "num_hidden_layers": 12, + "num_return_sequences": 1, + "output_attentions": False, + "output_hidden_states": False, + "output_scores": False, + "pad_token_id": None, + "patch_size": self.patch_size, + "prefix": None, + "problem_type": None, + "pruned_heads": {}, + "qkv_bias": True, + "remove_invalid_values": False, + "repetition_penalty": 1.0, + "return_dict": True, + "return_dict_in_generate": False, + "sep_token_id": None, + "suppress_tokens": None, + "task_specific_params": None, + "temperature": 1.0, + "tf_legacy_loss": False, + "tie_encoder_decoder": False, + "tie_word_embeddings": True, + "tokenizer_class": None, + "top_k": 50, + "top_p": 1.0, + "torch_dtype": None, + "torchscript": False, + "typical_p": 1.0, + "use_bfloat16": False, + } + + def _generateVocabFile(self, outputPath: Path) -> None: + """ + Generate the vocabulary JSON file. + + :param outputPath: Output path for the vocab.json file. + """ + vocab = self._getVocabulary() + + with open(outputPath, "w", encoding="utf-8") as f: + json.dump(vocab, f, indent=2, ensure_ascii=False) + + def _getVocabulary(self) -> dict[str, int]: + """ + Get the vocabulary mapping dictionary. + + :returns: dict[str, int]: Token to ID mapping. 
+ """ + return { + "<|endoftext|>": 50256, + "<|pad|>": 50257, + "a": 0, + "an": 1, + "the": 2, + "free": 3, + "or": 4, + "but": 5, + "in": 6, + "on": 7, + "at": 8, + "to": 9, + "and": 10, + "of": 11, + "with": 12, + "by": 13, + "man": 14, + "for": 15, + "desk": 16, + "people": 17, + "visual": 18, + "children": 19, + "software": 20, + "girl": 21, + "dog": 22, + "desktop": 23, + "car": 24, + "truck": 25, + "bus": 26, + "bike": 27, + "non-visual": 28, + "NVDA": 29, + "plane": 30, + "boat": 31, + "house": 32, + "access": 33, + "flower": 35, + "microsoft": 36, + "sky": 37, + "cloud": 38, + "sun": 39, + "moon": 40, + "water": 41, + "river": 42, + "ocean": 43, + "red": 44, + "blue": 45, + "reader": 46, + "yellow": 47, + "black": 48, + "white": 49, + "brown": 50, + "orange": 51, + "purple": 52, + "pink": 53, + "!": 54, + "small": 55, + "tall": 56, + "short": 57, + "old": 58, + "young": 59, + "beautiful": 61, + "ugly": 62, + "good": 63, + "bad": 64, + "sitting": 65, + "standing": 66, + "walking": 67, + "running": 68, + "screen": 69, + "drinking": 70, + "playing": 71, + "working": 72, + "is": 73, + "open": 74, + "was": 75, + "were": 76, + "has": 77, + "Best": 78, + "helping": 79, + "will": 80, + "would": 81, + "could": 82, + "should": 83, + "very": 84, + "quite": 85, + "really": 86, + "too": 87, + "also": 88, + "source": 89, + "only": 90, + "even": 91, + "still": 92, + "already": 93, + "windows": 96, + } diff --git a/tests/system/nvdaSettingsFiles/standard-doLoadMockModel.ini b/tests/system/nvdaSettingsFiles/standard-doLoadMockModel.ini new file mode 100644 index 00000000000..eff6d77689c --- /dev/null +++ b/tests/system/nvdaSettingsFiles/standard-doLoadMockModel.ini @@ -0,0 +1,20 @@ +schemaVersion = 2 +[general] + language = en + showWelcomeDialogAtStartup = False +[update] + askedAllowUsageStats = True + autoCheck = False + startupNotification = False + allowUsageStats = False +[speech] + synth = speechSpySynthDriver + unicodeNormalization = DISABLED +[development] + 
enableScratchpadDir = True +[virtualBuffers] + autoSayAllOnPageLoad = False + passThroughAudioIndication = False +[automatedImageDescriptions] + enable = True + defaultModel = mock/vit-gpt2-image-captioning diff --git a/tests/system/robot/automatedImageDescriptions.py b/tests/system/robot/automatedImageDescriptions.py new file mode 100644 index 00000000000..bfbd6b31a57 --- /dev/null +++ b/tests/system/robot/automatedImageDescriptions.py @@ -0,0 +1,43 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 or later. +# For more details see: https://www.gnu.org/licenses/gpl-2.0.html + +"""Logic for automatedImageDescriptions tests.""" + +import os +import pathlib + +from ChromeLib import ChromeLib as _ChromeLib +from SystemTestSpy import ( + _getLib, +) +import NvdaLib as _nvdaLib + +_chrome: _ChromeLib = _getLib("ChromeLib") + + +def NVDA_Caption(): + spy = _nvdaLib.getSpyLib() + iconPath = os.path.join( + _nvdaLib._locations.repoRoot, + "source", + "images", + "nvda.ico", + ) + url = pathlib.Path(iconPath).as_uri() + + _chrome.prepareChrome( + f""" +
+ +
+ """, + ) + + # locate graph to generate caption + spy.emulateKeyPress("g") + spy.emulateKeyPress("NVDA+g") + spy.wait_for_specific_speech( + "visual desk access non-visual desktop access non-visual desktop access non-visual desktop access non-visual desktop access non-visual desktop access non-visual", + ) diff --git a/tests/system/robot/automatedImageDescriptions.robot b/tests/system/robot/automatedImageDescriptions.robot new file mode 100644 index 00000000000..6f62bdba917 --- /dev/null +++ b/tests/system/robot/automatedImageDescriptions.robot @@ -0,0 +1,26 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 or later. +# For more details see: https://www.gnu.org/licenses/gpl-2.0.html +*** Settings *** +Documentation Local captioner tests +Force Tags NVDA smoke test imageDescriptions + +Library NvdaLib.py +Library automatedImageDescriptions.py +Library ScreenCapLibrary + +Test Setup start NVDA standard-doLoadMockModel.ini +Test Teardown default teardown + +*** Keywords *** +default teardown + ${screenshotName}= create_preserved_test_output_filename failedTest.png + Run Keyword If Test Failed Take Screenshot ${screenshotName} + quit NVDA + +*** Test Cases *** +automatedImageDescriptions + [Documentation] Ensure that local captioner work + NVDA_Caption # run test + diff --git a/tests/unit/test_localCaptioner/__init__.py b/tests/unit/test_localCaptioner/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/unit/test_localCaptioner/test_captioner.py b/tests/unit/test_localCaptioner/test_captioner.py new file mode 100644 index 00000000000..4f84a6914f0 --- /dev/null +++ b/tests/unit/test_localCaptioner/test_captioner.py @@ -0,0 +1,345 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 
or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt +""" +Unit tests for the VitGpt2ImageCaptioner class. + +This test suite includes comprehensive tests for the VitGpt2ImageCaptioner class, including: +- Initialization +- Configuration loading +- Vocabulary loading +- Image preprocessing +- Encoder and decoder execution +- Text generation +- Exception handling +""" + +import unittest +import json +import os +import tempfile +import numpy as np +from unittest.mock import Mock, patch +from PIL import Image +import io +import shutil + +from _localCaptioner.captioner.vitGpt2 import VitGpt2ImageCaptioner +from _localCaptioner import modelConfig + +modelConfig.initialize() + + +class TestVitGpt2ImageCaptioner(unittest.TestCase): + """Unit tests for the VitGpt2ImageCaptioner class.""" + + def setUp(self): + """Set up test environment.""" + # Create temporary directory and test files + self.testDir = tempfile.mkdtemp() + + # Create test configuration + self.configData = { + "encoder": { + "image_size": 224, + "num_channels": 3, + "patch_size": 16, + "hidden_size": 768, + "num_hidden_layers": 12, + "num_attention_heads": 12, + "intermediate_size": 3072, + }, + "decoder": { + "max_length": 20, + "vocab_size": 50257, + "n_embd": 768, + "n_layer": 12, + "n_head": 12, + "n_ctx": 1024, + "n_positions": 1024, + }, + "bos_token_id": 50256, + "eos_token_id": 50256, + "pad_token_id": 50256, + "generation": { + "do_sample": False, + "num_beams": 1, + "temperature": 1.0, + "top_k": 50, + "top_p": 1.0, + "repetition_penalty": 1.0, + "length_penalty": 1.0, + }, + } + + # Create test vocabulary + self.vocabData = { + "<|endoftext|>": 50256, + "a": 0, + "the": 1, + "cat": 2, + "dog": 3, + "is": 4, + "sitting": 5, + } + + # File paths + self.configPath = os.path.join(self.testDir, "config.json") + self.vocabPath = os.path.join(self.testDir, "vocab.json") + 
self.encoderPath = "mockEncoder.onnx" + self.decoderPath = "mockDecoder.onnx" + + # Write config and vocab files + with open(self.configPath, "w", encoding="utf-8") as f: + json.dump(self.configData, f) + with open(self.vocabPath, "w", encoding="utf-8") as f: + json.dump(self.vocabData, f) + + def tearDown(self): + """Clean up temporary files.""" + shutil.rmtree(self.testDir) + + @patch("onnxruntime.InferenceSession") + def test_initSuccess(self, mockSession): + """Test successful initialization.""" + mockEncoder = Mock() + mockDecoder = Mock() + mockSession.side_effect = [mockEncoder, mockDecoder] + + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + + self.assertEqual(captioner.decoderConfig.max_length, 20) + self.assertEqual(captioner.modelConfig.bos_token_id, 50256) + self.assertEqual(captioner.vocabSize, len(self.vocabData)) + self.assertEqual(mockSession.call_count, 2) + + def test_initConfigNotFound(self): + """Test missing config file raises error.""" + with self.assertRaises(FileNotFoundError) as context: + VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath="nonexistentConfig.json", + ) + self.assertIn("config file", str(context.exception)) + + @patch("onnxruntime.InferenceSession") + def test_loadVocabSuccess(self, mockSession): + """Test vocabulary loads successfully.""" + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + expectedVocab = {v: k for k, v in self.vocabData.items()} + self.assertEqual(captioner.vocab, expectedVocab) + + @patch("onnxruntime.InferenceSession") + def test_preprocessImageFromPath(self, mockSession): + """Test preprocessing image from file path.""" + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + testImage = Image.new("RGB", 
(100, 100), color="red") + testImagePath = os.path.join(self.testDir, "testImage.jpg") + testImage.save(testImagePath) + + result = captioner._preprocessImage(testImagePath) + + self.assertEqual(result.shape, (1, 3, 224, 224)) + self.assertEqual(result.dtype, np.float32) + + @patch("onnxruntime.InferenceSession") + def test_preprocessImageFromBytes(self, mockSession): + """Test preprocessing image from byte input.""" + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + testImage = Image.new("RGB", (100, 100), color="blue") + imgBytes = io.BytesIO() + testImage.save(imgBytes, format="PNG") + imgBytes = imgBytes.getvalue() + + result = captioner._preprocessImage(imgBytes) + + self.assertEqual(result.shape, (1, 3, 224, 224)) + self.assertEqual(result.dtype, np.float32) + + @patch("onnxruntime.InferenceSession") + def test_encodeImage(self, mockSession): + """Test image encoding using encoder.""" + mockEncoder = Mock() + mockDecoder = Mock() + mockEncoderOutput = np.random.randn(1, 196, 768).astype(np.float32) + mockEncoder.run.return_value = [mockEncoderOutput] + mockEncoder.get_inputs.return_value = [Mock(name="pixel_values")] + + mockSession.side_effect = [mockEncoder, mockDecoder] + + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + + testInput = np.random.randn(1, 3, 224, 224).astype(np.float32) + result = captioner._encodeImage(testInput) + + np.testing.assert_array_equal(result, mockEncoderOutput) + mockEncoder.run.assert_called_once() + + @patch("onnxruntime.InferenceSession") + def test_decodeTokens(self, mockSession): + """Test decoding tokens to text.""" + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + tokenIds = [1, 2, 4, 5] + result = captioner._decodeTokens(tokenIds) + expected = "the cat is sitting" + 
self.assertEqual(result, expected) + + @patch("onnxruntime.InferenceSession") + def test_decodeTokensWithSpecialTokens(self, mockSession): + """Test decoding tokens with special tokens removed.""" + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + tokenIds = [50256, 1, 2, 50256] + result = captioner._decodeTokens(tokenIds) + expected = "the cat" + self.assertEqual(result, expected) + + @patch("onnxruntime.InferenceSession") + def test_initializePastKeyValues(self, mockSession): + """Test initialization of past key values.""" + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + pastKv = captioner._initializePastKeyValues(batchSize=1) + expectedCount = captioner.decoderConfig.n_layer * 2 + self.assertEqual(len(pastKv), expectedCount) + + for layerIdx in range(captioner.decoderConfig.n_layer): + keyName = f"past_key_values.{layerIdx}.key" + valueName = f"past_key_values.{layerIdx}.value" + self.assertIn(keyName, pastKv) + self.assertIn(valueName, pastKv) + expectedShape = ( + 1, + captioner.decoderConfig.n_head, + 0, + captioner.decoderConfig.n_embd // captioner.decoderConfig.n_head, + ) + self.assertEqual(pastKv[keyName].shape, expectedShape) + self.assertEqual(pastKv[valueName].shape, expectedShape) + + @patch("onnxruntime.InferenceSession") + def test_generateWithGreedyMock(self, mockSession): + """Test greedy generation with mocked outputs.""" + mockEncoder = Mock() + mockDecoder = Mock() + + mockDecoder.get_inputs.return_value = [ + Mock(name="input_ids"), + Mock(name="encoder_hidden_states"), + Mock(name="use_cache_branch"), + ] + + logits_1 = np.zeros((1, 1, 50257)) + logits_1[0, 0, 2] = 10.0 + + logits_2 = np.zeros((1, 1, 50257)) + logits_2[0, 0, 50256] = 10.0 + + mockDecoder.run.side_effect = [[logits_1], [logits_2]] + mockSession.side_effect = [mockEncoder, mockDecoder] + + captioner = 
VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + encoderStates = np.random.randn(1, 196, 768).astype(np.float32) + result = captioner._generateWithGreedy(encoderStates, maxLength=5) + self.assertEqual(result, "cat") + + @patch("onnxruntime.InferenceSession") + def test_getDecoderInfo(self, mockSession): + """Test retrieving decoder input/output names.""" + mockEncoder = Mock() + mockDecoder = Mock() + mockInput = Mock() + mockInput.name = "input_ids" + mockOutput = Mock() + mockOutput.name = "logits" + + mockDecoder.get_inputs.return_value = [mockInput] + mockDecoder.get_outputs.return_value = [mockOutput] + mockSession.side_effect = [mockEncoder, mockDecoder] + + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + + inputNames = captioner._getDecoderInputNames() + self.assertEqual(inputNames, ["input_ids"]) + + outputNames = captioner._getDecoderOutputNames() + self.assertEqual(outputNames, ["logits"]) + + @patch("onnxruntime.InferenceSession") + @patch.object(VitGpt2ImageCaptioner, "_preprocessImage") + @patch.object(VitGpt2ImageCaptioner, "_encodeImage") + @patch.object(VitGpt2ImageCaptioner, "_generateWithGreedy") + def test_generateCaptionIntegration(self, mockGreedy, mockEncode, mockPreprocess, mockSession): + """Test full caption generation pipeline integration.""" + mockPreprocess.return_value = np.random.randn(1, 3, 224, 224) + mockEncode.return_value = np.random.randn(1, 196, 768) + mockGreedy.return_value = "a cat sitting on a table" + + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + + result = captioner.generateCaption("testImage.jpg") + + mockPreprocess.assert_called_once_with("testImage.jpg") + mockEncode.assert_called_once() + mockGreedy.assert_called_once() + self.assertEqual(result, "a cat sitting on a 
table") + + @patch("onnxruntime.InferenceSession") + def test_configParameterLoading(self, mockSession): + """Test full config parameter parsing.""" + captioner = VitGpt2ImageCaptioner( + encoderPath=self.encoderPath, + decoderPath=self.decoderPath, + configPath=self.configPath, + ) + self.assertEqual(captioner.encoderConfig.num_channels, 3) + self.assertEqual(captioner.decoderConfig.max_length, 20) + self.assertEqual(captioner.decoderConfig.n_embd, 768) + self.assertEqual(captioner.decoderConfig.n_layer, 12) + self.assertEqual(captioner.modelConfig.bos_token_id, 50256) + self.assertEqual(captioner.modelConfig.eos_token_id, 50256) + self.assertEqual(captioner.modelConfig.pad_token_id, 50256) diff --git a/tests/unit/test_localCaptioner/test_downloader.py b/tests/unit/test_localCaptioner/test_downloader.py new file mode 100644 index 00000000000..f3022a8c2da --- /dev/null +++ b/tests/unit/test_localCaptioner/test_downloader.py @@ -0,0 +1,108 @@ +# A part of NonVisual Desktop Access (NVDA) +# Copyright (C) 2025 NV Access Limited, Tianze +# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license. +# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt +""" +Unit tests for the ModelDownloader class. 
+ +Covers: +- Directory creation +- URL construction +- Remote file size detection (HEAD and Range requests) +- Progress reporting logic +- File download success/failure +- Multi-threaded download success/failure +- Cancellation handling +- Model file path building +- downloadDefaultModel user prompt flow +""" + +import tempfile +import unittest +from unittest.mock import patch +from typing import Any + +# Import the class and function under test +from _localCaptioner.modelDownloader import ModelDownloader + + +class TestModelDownloader(unittest.TestCase): + """Unit tests for ModelDownloader.""" + + def setUp(self): + # No longer passing basePath during initialization + self.tempDir = tempfile.mkdtemp() + self.downloader = ModelDownloader() + + @patch("pathlib.Path.mkdir") + def test_ensureModelsDirectory_success(self, mockMkdir): + """Ensure directory is created and correct path returned.""" + mockMkdir.return_value = None + modelsDir = self.downloader.ensureModelsDirectory() + self.assertTrue(modelsDir.endswith("vit-gpt2-image-captioning")) + mockMkdir.assert_called_once() + + @patch("pathlib.Path.mkdir", side_effect=OSError("Permission denied")) + def test_ensureModelsDirectory_failure(self, mockMkdir): + """Ensure OSError is raised when models directory cannot be created.""" + with self.assertRaises(OSError): + self.downloader.ensureModelsDirectory() + + def test_constructDownloadUrlDefaultHost(self): + """Construct URL when remoteHost has no scheme.""" + url = self.downloader.constructDownloadUrl("foo/bar", "file.txt") + self.assertTrue(url.startswith("https://huggingface.co/foo/bar")) + + def test_constructDownloadUrlWithHttpHost(self): + """Construct URL when remoteHost already contains http://.""" + self.downloader.remoteHost = "http://example.com" + url = self.downloader.constructDownloadUrl("foo", "bar") + self.assertEqual(url, "http://example.com/foo/resolve/main/bar") + + def test_reportProgressTriggersCallback(self) -> None: + """Test that callback is 
triggered when downloaded bytes exceed threshold.""" + callbackData: dict[str, Any] = {} + + def progressCallback( + fileName: str, + downloadedBytes: int, + totalBytes: int, + percentage: float, + ) -> None: + """Callback function to capture progress data.""" + callbackData["fileName"] = fileName + callbackData["downloadedBytes"] = downloadedBytes + + # Test with download size exceeding 1MB threshold + downloadedSize = 1024 * 1024 + 1 # 1MB + 1 byte + totalSize = 2 * 1024 * 1024 # 2MB + initialTime = 0 + + lastReportedTime = self.downloader._reportProgress( + progressCallback, + "test_file.zip", + downloadedSize, + totalSize, + initialTime, + ) + + # Assertions + self.assertEqual(callbackData["fileName"], "test_file.zip") + self.assertEqual(callbackData["downloadedBytes"], downloadedSize) + self.assertGreater(lastReportedTime, initialTime) + + @patch.object(ModelDownloader, "downloadSingleFile", return_value=(True, "ok")) + def test_downloadModelsMultithreadedAllSuccess(self, mockSingle): + """All files are downloaded successfully.""" + files = ["a.txt", "b.txt"] + success, failed = self.downloader.downloadModelsMultithreaded(self.tempDir, "model", files) + self.assertEqual(len(success), 2) + self.assertEqual(len(failed), 0) + + @patch.object(ModelDownloader, "downloadSingleFile", side_effect=[(True, "ok"), (False, "err")]) + def test_downloadModelsMultithreadedPartialFailure(self, mockSingle): + """One file succeeds and one fails.""" + files = ["a.txt", "b.txt"] + success, failed = self.downloader.downloadModelsMultithreaded(self.tempDir, "model", files) + self.assertEqual(len(success), 1) + self.assertEqual(len(failed), 1) diff --git a/user_docs/en/changes.md b/user_docs/en/changes.md index 50f8755e9ef..06e45d75731 100644 --- a/user_docs/en/changes.md +++ b/user_docs/en/changes.md @@ -32,6 +32,8 @@ Please refer to [the developer guide](https://download.nvaccess.org/documentatio ## 2026.1 This release includes built-in support for reading math content with 
MathCAT. +It also introduces experimental, on-device AI image descriptions. +You can now use `NVDA+g` to get a short, approximate description of images you encounter, without any data leaving your device. There have been several improvements to speech. Spelling errors can now be reported with a sound instead of speech when reading. @@ -85,6 +87,12 @@ Windows 10 on ARM is also no longer supported. An action has been added to view the full scan results on the VirusTotal website. (#18974) * A new action has been added to see the latest changes for the current version of an add-on. (#14041, @josephsl, @nvdaes) * Added built-in support for reading math content by integrating MathCAT. (#18323, #19368, @RyanMcCleary, @codeofdusk) +* NVDA can now use on-device AI to generate image descriptions. (#18475, @tianzeshi-study) + * This feature is experimental, and should not be used in situations where inaccurate descriptions could cause harm. + * To use this feature, NVDA will need to download image description data. + Thereafter, it operates entirely offline. + * Press `NVDA+g` to get an AI-generated image description. + * Unassigned commands are available to quickly open the settings dialog to the "AI Image Descriptions" category, and to toggle image captioning. * Added references (e.g. to footnotes and endnotes) to the elements list in Microsoft Word. Also added unassigned Quick Navigation commands to jump to the next/previous reference. (#19300, @LeonarddeR) * In browse mode, the number of items in a list is now reported in braille. (#7455, @nvdaes) diff --git a/user_docs/en/userGuide.md b/user_docs/en/userGuide.md index b3d375958ae..7538bde95b8 100644 --- a/user_docs/en/userGuide.md +++ b/user_docs/en/userGuide.md @@ -1503,7 +1503,7 @@ You can enable Screen Curtain in the [Privacy and Security category](#PrivacyAnd When Screen Curtain is enabled, features that rely on what is literally on screen will not function. -For example, you cannot [use OCR](#Win10Ocr).
+For example, you cannot [use OCR](#Win10Ocr) or [get AI image descriptions](#LocalCaptioner). Some screenshot utilities also may not work. Please note that while Windows Magnifier is running and inverted screen colors are being used, Screen Curtain cannot be enabled. @@ -3557,6 +3557,15 @@ You will be asked to confirm before all trusted fingerprints are deleted. This option is only available if there are trusted fingerprints stored in your configuration. +#### AI Image Descriptions Settings {#LocalCaptionerSettings} + +This panel provides options to customize the behavior and default settings for the ["Image Captioner"](#LocalCaptioner). + +##### Enable image captioner {#LocalCaptionToggle} + +When this checkbox is checked, NVDA will load the image captioner into memory, enabling the use of the image description command. +Loading the image captioner will increase memory usage, so this is disabled by default. + #### Windows OCR Settings {#Win10OcrSettings} The settings in this category allow you to configure [Windows OCR](#Win10Ocr). @@ -4178,6 +4187,36 @@ Once a Remote Access session is active, you can switch between controlling the r | Send `control+alt+delete` | None | Sends `control+alt+delete` to the controlled computer. | +## Image Captioner {#LocalCaptioner} + +NVDA supports generating image descriptions on your device without connecting to the internet. +This feature allows NVDA to describe images encountered during navigation. + +Warning: AI image descriptions are an experimental feature. +Image descriptions generated with this feature may not be accurate. +You must not use this feature in circumstances where inaccurate results could reasonably be expected to cause harm. +Always exercise caution and skepticism when interpreting AI image descriptions. + +Note: An internet connection is required to enable and install the Image Captioner for the first time. +To reduce the installer size, it is not included with the NVDA installer.
+ +### Getting Started {#LocalCaptionerGettingStarted} + +Enable the "Image Captioner" in the ["AI Image Descriptions" settings panel](#LocalCaptionToggle). +Once the Image Captioner is ready, press the default shortcut `NVDA+g` to recognize the image currently navigated by NVDA. + +### AI Image Descriptions Key Commands Summary {#LocalCaptionerGestures} + + + +| Name |Key |Description| +|---|---|---| +| Get an AI-generated image description of the navigator object. | `NVDA+g` | Get a description of the navigator object provided by a recognition performed on the device locally. | +| Load or unload the image captioner | None | Load or unload the image captioner in memory, enabling the use of the image description command. | +| Shows the AI image descriptions settings | None | Opens the AI image descriptions settings panel. | + + + ## Add-ons and the Add-on Store {#AddonsManager} Add-ons are software packages which provide new or altered functionality for NVDA. diff --git a/uv.lock b/uv.lock index fdba75a9de5..c35c576b914 100644 --- a/uv.lock +++ b/uv.lock @@ -62,11 +62,11 @@ wheels = [ [[package]] name = "cachetools" -version = "6.2.4" +version = "6.2.2" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/bc/1d/ede8680603f6016887c062a2cf4fc8fdba905866a3ab8831aa8aa651320c/cachetools-6.2.4.tar.gz", hash = "sha256:82c5c05585e70b6ba2d3ae09ea60b79548872185d2f24ae1f2709d37299fd607", size = 31731, upload-time = "2025-12-15T18:24:53.744Z" } +sdist = { url = "https://files.pythonhosted.org/packages/fb/44/ca1675be2a83aeee1886ab745b28cda92093066590233cc501890eb8417a/cachetools-6.2.2.tar.gz", hash = "sha256:8e6d266b25e539df852251cfd6f990b4bc3a141db73b939058d809ebd2590fc6", size = 31571, upload-time = "2025-11-13T17:42:51.465Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/fc/1d7b80d0eb7b714984ce40efc78859c022cd930e402f599d8ca9e39c78a4/cachetools-6.2.4-py3-none-any.whl", hash = 
"sha256:69a7a52634fed8b8bf6e24a050fb60bff1c9bd8f6d24572b99c32d4e71e62a51", size = 11551, upload-time = "2025-12-15T18:24:52.332Z" }, + { url = "https://files.pythonhosted.org/packages/e6/46/eb6eca305c77a4489affe1c5d8f4cae82f285d9addd8de4ec084a7184221/cachetools-6.2.2-py3-none-any.whl", hash = "sha256:6c09c98183bf58560c97b2abfcedcbaf6a896a490f534b031b661d3723b45ace", size = 11503, upload-time = "2025-11-13T17:42:50.232Z" }, ] [[package]] @@ -83,11 +83,11 @@ wheels = [ [[package]] name = "certifi" -version = "2026.1.4" +version = "2025.11.12" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a2/8c/58f469717fa48465e4a50c014a0400602d3c437d7c0c468e17ada824da3a/certifi-2025.11.12.tar.gz", hash = "sha256:d8ab5478f2ecd78af242878415affce761ca6bc54a22a27e026d7c25357c3316", size = 160538, upload-time = "2025-11-12T02:54:51.517Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" }, + { url = "https://files.pythonhosted.org/packages/70/7d/9bc192684cea499815ff478dfcdc13835ddf401365057044fb721ec6bddb/certifi-2025.11.12-py3-none-any.whl", hash = "sha256:97de8790030bbd5c2d96b7ec782fc2f7820ef8dba6db909ccf95449f2d062d4b", size = 159438, upload-time = "2025-11-12T02:54:49.735Z" }, ] [[package]] @@ -134,6 +134,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = 
"sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "coloredlogs" +version = "15.0.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "humanfriendly", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/c7/eed8f27100517e8c0e6b923d5f0845d0cb99763da6fdee00478f91db7325/coloredlogs-15.0.1.tar.gz", hash = "sha256:7c991aa71a4577af2f82600d8f8f3a89f936baeaf9b50a9c197da014e5bf16b0", size = 278520, upload-time = "2021-06-11T10:22:45.202Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/06/3d6badcf13db419e25b07041d9c7b4a2c331d3f4e7134445ec5df57714cd/coloredlogs-15.0.1-py2.py3-none-any.whl", hash = "sha256:612ee75c546f53e92e70049c9dbfcc18c935a2b9a53b66085ce9ef6a6e5c0934", size = 46018, upload-time = "2021-06-11T10:22:42.561Z" }, +] + [[package]] name = "comtypes" version = "1.4.13" @@ -229,11 +241,32 @@ wheels = [ [[package]] name = "filelock" -version = "3.20.2" +version = "3.20.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/58/46/0028a82567109b5ef6e4d2a1f04a583fb513e6cf9527fcdd09afd817deeb/filelock-3.20.0.tar.gz", hash = "sha256:711e943b4ec6be42e1d4e6690b48dc175c822967466bb31c0c293f34334c13f4", size = 18922, upload-time = "2025-10-08T18:03:50.056Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/76/91/7216b27286936c16f5b4d0c530087e4a54eead683e6b0b73dd0c64844af6/filelock-3.20.0-py3-none-any.whl", hash = "sha256:339b4732ffda5cd79b13f4e2711a31b0365ce445d95d243bb996273d072546a2", size = 16054, upload-time = "2025-10-08T18:03:48.35Z" }, +] + +[[package]] +name = "flatbuffers" +version = "25.9.23" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9d/1f/3ee70b0a55137442038f2a33469cc5fddd7e0ad2abf83d7497c18a2b6923/flatbuffers-25.9.23.tar.gz", 
hash = "sha256:676f9fa62750bb50cf531b42a0a2a118ad8f7f797a511eda12881c016f093b12", size = 22067, upload-time = "2025-09-24T05:25:30.106Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ee/1b/00a78aa2e8fbd63f9af08c9c19e6deb3d5d66b4dda677a0f61654680ee89/flatbuffers-25.9.23-py2.py3-none-any.whl", hash = "sha256:255538574d6cb6d0a79a17ec8bc0d30985913b87513a01cce8bcdb6b4c44d0e2", size = 30869, upload-time = "2025-09-24T05:25:28.912Z" }, +] + +[[package]] +name = "humanfriendly" +version = "10.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c1/e0/a75dbe4bca1e7d41307323dad5ea2efdd95408f74ab2de8bd7dba9b51a1a/filelock-3.20.2.tar.gz", hash = "sha256:a2241ff4ddde2a7cebddf78e39832509cb045d18ec1a09d7248d6bfc6bfbbe64", size = 19510, upload-time = "2026-01-02T15:33:32.582Z" } +dependencies = [ + { name = "pyreadline3", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/cc/3f/2c29224acb2e2df4d2046e4c73ee2662023c58ff5b113c4c1adac0886c43/humanfriendly-10.0.tar.gz", hash = "sha256:6b0b831ce8f15f7300721aa49829fc4e83921a9a301cc7f606be6686a2288ddc", size = 360702, upload-time = "2021-09-17T21:40:43.31Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/9a/30/ab407e2ec752aa541704ed8f93c11e2a5d92c168b8a755d818b74a3c5c2d/filelock-3.20.2-py3-none-any.whl", hash = "sha256:fbba7237d6ea277175a32c54bb71ef814a8546d8601269e1bfc388de333974e8", size = 16697, upload-time = "2026-01-02T15:33:31.133Z" }, + { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" }, ] [[package]] @@ -449,6 +482,21 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/3a/9e/dcd1027f7fd193aed152e01c6651a197c36b858f2cd1425ad04cb31a34fc/mdx_truly_sane_lists-1.3-py3-none-any.whl", hash = "sha256:b9546a4c40ff8f1ab692f77cee4b6bfe8ddf9cccf23f0a24e71f3716fe290a37", size = 6071, upload-time = "2022-07-19T13:42:43.375Z" }, ] +[[package]] +name = "ml-dtypes" +version = "0.5.4" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "numpy", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e1/8b/200088c6859d8221454825959df35b5244fa9bdf263fd0249ac5fb75e281/ml_dtypes-0.5.4-cp313-cp313-win_amd64.whl", hash = "sha256:f21c9219ef48ca5ee78402d5cc831bd58ea27ce89beda894428bc67a52da5328", size = 212224, upload-time = "2025-11-17T22:32:01.349Z" }, + { url = "https://files.pythonhosted.org/packages/8f/75/dfc3775cb36367816e678f69a7843f6f03bd4e2bcd79941e01ea960a068e/ml_dtypes-0.5.4-cp313-cp313-win_arm64.whl", hash = "sha256:35f29491a3e478407f7047b8a4834e4640a77d2737e0b294d049746507af5175", size = 160798, upload-time = "2025-11-17T22:32:02.864Z" }, + { url = "https://files.pythonhosted.org/packages/8c/27/12607423d0a9c6bbbcc780ad19f1f6baa2b68b18ce4bddcdc122c4c68dc9/ml_dtypes-0.5.4-cp313-cp313t-win_amd64.whl", hash = "sha256:cb73dccfc991691c444acc8c0012bee8f2470da826a92e3a20bb333b1a7894e6", size = 225612, upload-time = "2025-11-17T22:32:08.615Z" }, + { url = "https://files.pythonhosted.org/packages/e5/80/5a5929e92c72936d5b19872c5fb8fc09327c1da67b3b68c6a13139e77e20/ml_dtypes-0.5.4-cp313-cp313t-win_arm64.whl", hash = "sha256:3bbbe120b915090d9dd1375e4684dd17a20a2491ef25d640a908281da85e73f1", size = 164145, upload-time = "2025-11-17T22:32:09.782Z" }, +] + 
[[package]] name = "mouseinfo" version = "0.1.3" @@ -458,6 +506,15 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/28/fa/b2ba8229b9381e8f6381c1dcae6f4159a7f72349e414ed19cfbbd1817173/MouseInfo-0.1.3.tar.gz", hash = "sha256:2c62fb8885062b8e520a3cce0a297c657adcc08c60952eb05bc8256ef6f7f6e7", size = 10850, upload-time = "2020-03-27T21:20:10.136Z" } +[[package]] +name = "mpmath" +version = "1.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106, upload-time = "2023-03-07T16:47:11.061Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198, upload-time = "2023-03-07T16:47:09.197Z" }, +] + [[package]] name = "mss" version = "10.1.0" @@ -480,33 +537,35 @@ wheels = [ [[package]] name = "nodeenv" -version = "1.10.0" +version = "1.9.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/24/bf/d1bda4f6168e0b2e9e5958945e01910052158313224ada5ce1fb2e1113b8/nodeenv-1.10.0.tar.gz", hash = "sha256:996c191ad80897d076bdfba80a41994c2b47c68e224c542b48feba42ba00f8bb", size = 55611, upload-time = "2025-12-20T14:08:54.006Z" } +sdist = { url = "https://files.pythonhosted.org/packages/43/16/fc88b08840de0e0a72a2f9d8c6bae36be573e475a6326ae854bcc549fc45/nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f", size = 47437, upload-time = "2024-06-04T18:44:11.171Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/88/b2/d0896bdcdc8d28a7fc5717c305f1a861c26e18c05047949fb371034d98bd/nodeenv-1.10.0-py2.py3-none-any.whl", hash = "sha256:5bb13e3eed2923615535339b3c620e76779af4cb4c6a90deccc9e36b274d3827", size = 23438, upload-time = "2025-12-20T14:08:52.782Z" }, + { url = "https://files.pythonhosted.org/packages/d2/1d/1b658dbd2b9fa9c4c9f32accbfc0205d532c8c6194dc0f2a4c0428e7128a/nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9", size = 22314, upload-time = "2024-06-04T18:44:08.352Z" }, ] [[package]] name = "nodejs-wheel-binaries" -version = "24.12.0" +version = "24.11.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b9/35/d806c2ca66072e36dc340ccdbeb2af7e4f1b5bcc33f1481f00ceed476708/nodejs_wheel_binaries-24.12.0.tar.gz", hash = "sha256:f1b50aa25375e264697dec04b232474906b997c2630c8f499f4caf3692938435", size = 8058, upload-time = "2025-12-11T21:12:26.856Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/89/da307731fdbb05a5f640b26de5b8ac0dc463fef059162accfc89e32f73bc/nodejs_wheel_binaries-24.11.1.tar.gz", hash = "sha256:413dfffeadfb91edb4d8256545dea797c237bba9b3faefea973cde92d96bb922", size = 8059, upload-time = "2025-11-18T18:21:58.207Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/9d/c6492188ce8de90093c6755a4a63bb6b2b4efb17094cb4f9a9a49c73ed3b/nodejs_wheel_binaries-24.12.0-py2.py3-none-win_amd64.whl", hash = "sha256:2090d59f75a68079fabc9b86b14df8238b9aecb9577966dc142ce2a23a32e9bb", size = 41342076, upload-time = "2025-12-11T21:12:20.618Z" }, - { url = "https://files.pythonhosted.org/packages/df/af/cd3290a647df567645353feed451ef4feaf5844496ced69c4dcb84295ff4/nodejs_wheel_binaries-24.12.0-py2.py3-none-win_arm64.whl", hash = "sha256:d0c2273b667dd7e3f55e369c0085957b702144b1b04bfceb7ce2411e58333757", size = 39048104, upload-time = "2025-12-11T21:12:23.495Z" }, + { url = 
"https://files.pythonhosted.org/packages/4d/1c/2fb05127102a80225cab7a75c0e9edf88a0a1b79f912e1e36c7c1aaa8f4e/nodejs_wheel_binaries-24.11.1-py2.py3-none-win_amd64.whl", hash = "sha256:10197b1c9c04d79403501766f76508b0dac101ab34371ef8a46fcf51773497d0", size = 41322308, upload-time = "2025-11-18T18:21:51.347Z" }, + { url = "https://files.pythonhosted.org/packages/ad/b7/bc0cdbc2cc3a66fcac82c79912e135a0110b37b790a14c477f18e18d90cd/nodejs_wheel_binaries-24.11.1-py2.py3-none-win_arm64.whl", hash = "sha256:376b9ea1c4bc1207878975dfeb604f7aa5668c260c6154dcd2af9d42f7734116", size = 39026497, upload-time = "2025-11-18T18:21:54.634Z" }, ] [[package]] name = "numpy" -version = "2.2.6" +version = "2.3.5" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/76/21/7d2a95e4bba9dc13d043ee156a356c0a8f0c6309dff6b21b4d71a073b8a8/numpy-2.2.6.tar.gz", hash = "sha256:e29554e2bef54a90aa5cc07da6ce955accb83f21ab5de01a62c8478897b264fd", size = 20276440, upload-time = "2025-05-17T22:38:04.611Z" } +sdist = { url = "https://files.pythonhosted.org/packages/76/65/21b3bc86aac7b8f2862db1e808f1ea22b028e30a225a34a5ede9bf8678f2/numpy-2.3.5.tar.gz", hash = "sha256:784db1dcdab56bf0517743e746dfb0f885fc68d948aba86eeec2cba234bdf1c0", size = 20584950, upload-time = "2025-11-16T22:52:42.067Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f0/3b/5cba2b1d88760ef86596ad0f3d484b1cbff7c115ae2429678465057c5155/numpy-2.2.6-cp313-cp313-win32.whl", hash = "sha256:5beb72339d9d4fa36522fc63802f469b13cdbe4fdab4a288f0c441b74272ebfd", size = 6244532, upload-time = "2025-05-17T21:43:46.099Z" }, - { url = "https://files.pythonhosted.org/packages/cb/3b/d58c12eafcb298d4e6d0d40216866ab15f59e55d148a5658bb3132311fcf/numpy-2.2.6-cp313-cp313-win_amd64.whl", hash = "sha256:b0544343a702fa80c95ad5d3d608ea3599dd54d4632df855e4c8d24eb6ecfa1c", size = 12610885, upload-time = "2025-05-17T21:44:05.145Z" }, - { url = 
"https://files.pythonhosted.org/packages/09/04/f2f83279d287407cf36a7a8053a5abe7be3622a4363337338f2585e4afda/numpy-2.2.6-cp313-cp313t-win32.whl", hash = "sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff", size = 6377225, upload-time = "2025-05-17T21:43:16.254Z" }, - { url = "https://files.pythonhosted.org/packages/67/0e/35082d13c09c02c011cf21570543d202ad929d961c02a147493cb0c2bdf5/numpy-2.2.6-cp313-cp313t-win_amd64.whl", hash = "sha256:6031dd6dfecc0cf9f668681a37648373bddd6421fff6c66ec1624eed0180ee06", size = 12771374, upload-time = "2025-05-17T21:43:35.479Z" }, + { url = "https://files.pythonhosted.org/packages/40/56/2932d75b6f13465239e3b7b7e511be27f1b8161ca2510854f0b6e521c395/numpy-2.3.5-cp313-cp313-win32.whl", hash = "sha256:1978155dd49972084bd6ef388d66ab70f0c323ddee6f693d539376498720fb7e", size = 6277637, upload-time = "2025-11-16T22:50:40.11Z" }, + { url = "https://files.pythonhosted.org/packages/0c/88/e2eaa6cffb115b85ed7c7c87775cb8bcf0816816bc98ca8dbfa2ee33fe6e/numpy-2.3.5-cp313-cp313-win_amd64.whl", hash = "sha256:00dc4e846108a382c5869e77c6ed514394bdeb3403461d25a829711041217d5b", size = 12779090, upload-time = "2025-11-16T22:50:42.503Z" }, + { url = "https://files.pythonhosted.org/packages/8f/88/3f41e13a44ebd4034ee17baa384acac29ba6a4fcc2aca95f6f08ca0447d1/numpy-2.3.5-cp313-cp313-win_arm64.whl", hash = "sha256:0472f11f6ec23a74a906a00b48a4dcf3849209696dff7c189714511268d103ae", size = 10194710, upload-time = "2025-11-16T22:50:44.971Z" }, + { url = "https://files.pythonhosted.org/packages/80/e9/aff53abbdd41b0ecca94285f325aff42357c6b5abc482a3fcb4994290b18/numpy-2.3.5-cp313-cp313t-win32.whl", hash = "sha256:70b37199913c1bd300ff6e2693316c6f869c7ee16378faf10e4f5e3275b299c3", size = 6405940, upload-time = "2025-11-16T22:51:11.541Z" }, + { url = "https://files.pythonhosted.org/packages/d5/81/50613fec9d4de5480de18d4f8ef59ad7e344d497edbef3cfd80f24f98461/numpy-2.3.5-cp313-cp313t-win_amd64.whl", hash = 
"sha256:b501b5fa195cc9e24fe102f21ec0a44dffc231d2af79950b451e0d99cea02234", size = 12920341, upload-time = "2025-11-16T22:51:14.312Z" }, + { url = "https://files.pythonhosted.org/packages/bb/ab/08fd63b9a74303947f34f0bd7c5903b9c5532c2d287bead5bdf4c556c486/numpy-2.3.5-cp313-cp313t-win_arm64.whl", hash = "sha256:a80afd79f45f3c4a7d341f13acbe058d1ca8ac017c165d3fa0d3de6bc1a079d7", size = 10262507, upload-time = "2025-11-16T22:51:16.846Z" }, ] [[package]] @@ -525,6 +584,8 @@ dependencies = [ { name = "mdx-gh-links", marker = "sys_platform == 'win32'" }, { name = "mdx-truly-sane-lists", marker = "sys_platform == 'win32'" }, { name = "nh3", marker = "sys_platform == 'win32'" }, + { name = "numpy", marker = "sys_platform == 'win32'" }, + { name = "onnxruntime", marker = "sys_platform == 'win32'" }, { name = "pycaw", marker = "sys_platform == 'win32'" }, { name = "pymdown-extensions", marker = "sys_platform == 'win32'" }, { name = "pyserial", marker = "sys_platform == 'win32'" }, @@ -557,6 +618,7 @@ lint = [ { name = "ruff", marker = "sys_platform == 'win32'" }, ] system-tests = [ + { name = "onnx", marker = "sys_platform == 'win32'" }, { name = "robotframework", marker = "sys_platform == 'win32'" }, { name = "robotframework-screencaplibrary", marker = "sys_platform == 'win32'" }, { name = "robotremoteserver", marker = "sys_platform == 'win32'" }, @@ -580,6 +642,8 @@ requires-dist = [ { name = "mdx-gh-links", specifier = "==0.4" }, { name = "mdx-truly-sane-lists", specifier = "==1.3" }, { name = "nh3", specifier = "==0.3.2" }, + { name = "numpy", specifier = "==2.3.5" }, + { name = "onnxruntime", specifier = "==1.23.2" }, { name = "pycaw", specifier = "==20251023" }, { name = "pymdown-extensions", specifier = "==10.17.1" }, { name = "pyserial", specifier = "==3.5" }, @@ -610,6 +674,7 @@ lint = [ { name = "ruff", specifier = "==0.14.5" }, ] system-tests = [ + { name = "onnx", specifier = "==1.19.1" }, { name = "robotframework", specifier = "==7.3.2" }, { name = 
"robotframework-screencaplibrary", specifier = "==1.6.0" }, { name = "robotremoteserver", specifier = "==1.1.1" }, @@ -629,17 +694,51 @@ name = "nvda-misc-deps" version = "20250925" source = { editable = "miscDeps" } +[[package]] +name = "onnx" +version = "1.19.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ml-dtypes", marker = "sys_platform == 'win32'" }, + { name = "numpy", marker = "sys_platform == 'win32'" }, + { name = "protobuf", marker = "sys_platform == 'win32'" }, + { name = "typing-extensions", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/27/2f/c619eb65769357e9b6de9212c9a821ab39cd484448e5d6b3fb5fb0a64c6d/onnx-1.19.1.tar.gz", hash = "sha256:737524d6eb3907d3499ea459c6f01c5a96278bb3a0f2ff8ae04786fb5d7f1ed5", size = 12033525, upload-time = "2025-10-10T04:01:34.342Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/b5/4201254b8683129db5da3fb55aa1f7e56d0a8d45c66ce875dec21ca1ff25/onnx-1.19.1-cp313-cp313-win32.whl", hash = "sha256:65eee353a51b4e4ca3e797784661e5376e2b209f17557e04921eac9166a8752e", size = 16345330, upload-time = "2025-10-10T04:00:54.858Z" }, + { url = "https://files.pythonhosted.org/packages/69/67/c6d239afbcdbeb6805432969b908b5c9f700c96d332b34e3f99518d76caf/onnx-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:c3bc87e38b53554b1fc9ef7b275c81c6f5c93c90a91935bb0aa8d4d498a6d48e", size = 16465567, upload-time = "2025-10-10T04:00:57.893Z" }, + { url = "https://files.pythonhosted.org/packages/99/fe/89f1e40f5bc54595ff0dcf5391ce19e578b528973ccc74dd99800196d30d/onnx-1.19.1-cp313-cp313-win_arm64.whl", hash = "sha256:e41496f400afb980ec643d80d5164753a88a85234fa5c06afdeebc8b7d1ec252", size = 16437562, upload-time = "2025-10-10T04:01:00.703Z" }, + { url = "https://files.pythonhosted.org/packages/4f/0d/f9d6c2237083f1aac14b37f0b03b0d81f1147a8e2af0c3828165e0a6a67b/onnx-1.19.1-cp313-cp313t-win_amd64.whl", hash = 
"sha256:9807d0e181f6070ee3a6276166acdc571575d1bd522fc7e89dba16fd6e7ffed9", size = 16465560, upload-time = "2025-10-10T04:01:13.212Z" }, +] + +[[package]] +name = "onnxruntime" +version = "1.23.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "coloredlogs", marker = "sys_platform == 'win32'" }, + { name = "flatbuffers", marker = "sys_platform == 'win32'" }, + { name = "numpy", marker = "sys_platform == 'win32'" }, + { name = "packaging", marker = "sys_platform == 'win32'" }, + { name = "protobuf", marker = "sys_platform == 'win32'" }, + { name = "sympy", marker = "sys_platform == 'win32'" }, +] +wheels = [ + { url = "https://files.pythonhosted.org/packages/4a/93/aba75358133b3a941d736816dd392f687e7eab77215a6e429879080b76b6/onnxruntime-1.23.2-cp313-cp313-win_amd64.whl", hash = "sha256:1f9cc0a55349c584f083c1c076e611a7c35d5b867d5d6e6d6c823bf821978088", size = 13470276, upload-time = "2025-10-22T03:47:31.193Z" }, +] + [[package]] name = "opencv-python" -version = "4.12.0.88" +version = "4.11.0.86" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ac/71/25c98e634b6bdeca4727c7f6d6927b056080668c5008ad3c8fc9e7f8f6ec/opencv-python-4.12.0.88.tar.gz", hash = "sha256:8b738389cede219405f6f3880b851efa3415ccd674752219377353f017d2994d", size = 95373294, upload-time = "2025-07-07T09:20:52.389Z" } +sdist = { url = "https://files.pythonhosted.org/packages/17/06/68c27a523103dad5837dc5b87e71285280c4f098c60e4fe8a8db6486ab09/opencv-python-4.11.0.86.tar.gz", hash = "sha256:03d60ccae62304860d232272e4a4fda93c39d595780cb40b161b310244b736a4", size = 95171956, upload-time = "2025-01-16T13:52:24.737Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/96/213fea371d3cb2f1d537612a105792aa0a6659fb2665b22cad709a75bd94/opencv_python-4.12.0.88-cp37-abi3-win32.whl", hash = 
"sha256:ff554d3f725b39878ac6a2e1fa232ec509c36130927afc18a1719ebf4fbf4357", size = 30284131, upload-time = "2025-07-07T09:14:08.819Z" }, - { url = "https://files.pythonhosted.org/packages/fa/80/eb88edc2e2b11cd2dd2e56f1c80b5784d11d6e6b7f04a1145df64df40065/opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl", hash = "sha256:d98edb20aa932fd8ebd276a72627dad9dc097695b3d435a4257557bbb49a79d2", size = 39000307, upload-time = "2025-07-07T09:14:16.641Z" }, + { url = "https://files.pythonhosted.org/packages/fb/d7/1d5941a9dde095468b288d989ff6539dd69cd429dbf1b9e839013d21b6f0/opencv_python-4.11.0.86-cp37-abi3-win32.whl", hash = "sha256:810549cb2a4aedaa84ad9a1c92fbfdfc14090e2749cedf2c1589ad8359aa169b", size = 29384337, upload-time = "2025-01-16T13:52:13.549Z" }, + { url = "https://files.pythonhosted.org/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044, upload-time = "2025-01-16T13:52:21.928Z" }, ] [[package]] @@ -671,28 +770,28 @@ wheels = [ [[package]] name = "pillow" -version = "12.1.0" +version = "12.0.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" } +sdist = { url = "https://files.pythonhosted.org/packages/5a/b0/cace85a1b0c9775a9f8f5d5423c8261c858760e2466c79b2dd184638b056/pillow-12.0.0.tar.gz", hash = "sha256:87d4f8125c9988bfbed67af47dd7a953e2fc7b0cc1e7800ec6d2080d490bb353", size = 47008828, upload-time = "2025-10-15T18:24:14.008Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/c7/2530a4aa28248623e9d7f27316b42e27c32ec410f695929696f2e0e4a778/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash 
= "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1", size = 4062543, upload-time = "2026-01-02T09:11:31.566Z" }, - { url = "https://files.pythonhosted.org/packages/8f/1f/40b8eae823dc1519b87d53c30ed9ef085506b05281d313031755c1705f73/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179", size = 4138373, upload-time = "2026-01-02T09:11:33.367Z" }, - { url = "https://files.pythonhosted.org/packages/d4/77/6fa60634cf06e52139fd0e89e5bbf055e8166c691c42fb162818b7fda31d/pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0", size = 3601241, upload-time = "2026-01-02T09:11:35.011Z" }, - { url = "https://files.pythonhosted.org/packages/19/ce/c17334caea1db789163b5d855a5735e47995b0b5dc8745e9a3605d5f24c0/pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171", size = 6332551, upload-time = "2026-01-02T09:11:52.234Z" }, - { url = "https://files.pythonhosted.org/packages/e5/07/74a9d941fa45c90a0d9465098fe1ec85de3e2afbdc15cc4766622d516056/pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a", size = 7040087, upload-time = "2026-01-02T09:11:54.822Z" }, - { url = "https://files.pythonhosted.org/packages/88/09/c99950c075a0e9053d8e880595926302575bc742b1b47fe1bbcc8d388d50/pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45", size = 2452470, upload-time = "2026-01-02T09:11:56.522Z" }, - { url = "https://files.pythonhosted.org/packages/86/77/eacc62356b4cf81abe99ff9dbc7402750044aed02cfd6a503f7c6fc11f3e/pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65", size = 6336445, upload-time = "2026-01-02T09:12:14.775Z" }, - { url = 
"https://files.pythonhosted.org/packages/e7/3c/57d81d0b74d218706dafccb87a87ea44262c43eef98eb3b164fd000e0491/pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0", size = 7045354, upload-time = "2026-01-02T09:12:16.599Z" }, - { url = "https://files.pythonhosted.org/packages/ac/82/8b9b97bba2e3576a340f93b044a3a3a09841170ab4c1eb0d5c93469fd32f/pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8", size = 2454547, upload-time = "2026-01-02T09:12:18.704Z" }, + { url = "https://files.pythonhosted.org/packages/62/f2/de993bb2d21b33a98d031ecf6a978e4b61da207bef02f7b43093774c480d/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:0869154a2d0546545cde61d1789a6524319fc1897d9ee31218eae7a60ccc5643", size = 4045493, upload-time = "2025-10-15T18:22:25.758Z" }, + { url = "https://files.pythonhosted.org/packages/0e/b6/bc8d0c4c9f6f111a783d045310945deb769b806d7574764234ffd50bc5ea/pillow-12.0.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:a7921c5a6d31b3d756ec980f2f47c0cfdbce0fc48c22a39347a895f41f4a6ea4", size = 4120461, upload-time = "2025-10-15T18:22:27.286Z" }, + { url = "https://files.pythonhosted.org/packages/5d/57/d60d343709366a353dc56adb4ee1e7d8a2cc34e3fbc22905f4167cfec119/pillow-12.0.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:1ee80a59f6ce048ae13cda1abf7fbd2a34ab9ee7d401c46be3ca685d1999a399", size = 3576912, upload-time = "2025-10-15T18:22:28.751Z" }, + { url = "https://files.pythonhosted.org/packages/dd/ca/16c6926cc1c015845745d5c16c9358e24282f1e588237a4c36d2b30f182f/pillow-12.0.0-cp313-cp313-win32.whl", hash = "sha256:4cc6b3b2efff105c6a1656cfe59da4fdde2cda9af1c5e0b58529b24525d0a098", size = 6302391, upload-time = "2025-10-15T18:22:44.753Z" }, + { url = 
"https://files.pythonhosted.org/packages/6d/2a/dd43dcfd6dae9b6a49ee28a8eedb98c7d5ff2de94a5d834565164667b97b/pillow-12.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:4cf7fed4b4580601c4345ceb5d4cbf5a980d030fd5ad07c4d2ec589f95f09905", size = 7007477, upload-time = "2025-10-15T18:22:46.838Z" }, + { url = "https://files.pythonhosted.org/packages/77/f0/72ea067f4b5ae5ead653053212af05ce3705807906ba3f3e8f58ddf617e6/pillow-12.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:9f0b04c6b8584c2c193babcccc908b38ed29524b29dd464bc8801bf10d746a3a", size = 2435918, upload-time = "2025-10-15T18:22:48.399Z" }, + { url = "https://files.pythonhosted.org/packages/61/2b/726235842220ca95fa441ddf55dd2382b52ab5b8d9c0596fe6b3f23dafe8/pillow-12.0.0-cp313-cp313t-win32.whl", hash = "sha256:4078242472387600b2ce8d93ade8899c12bf33fa89e55ec89fe126e9d6d5d9e9", size = 6306201, upload-time = "2025-10-15T18:23:04.709Z" }, + { url = "https://files.pythonhosted.org/packages/c0/3d/2afaf4e840b2df71344ababf2f8edd75a705ce500e5dc1e7227808312ae1/pillow-12.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2c54c1a783d6d60595d3514f0efe9b37c8808746a66920315bfd34a938d7994b", size = 7013165, upload-time = "2025-10-15T18:23:06.46Z" }, + { url = "https://files.pythonhosted.org/packages/6f/75/3fa09aa5cf6ed04bee3fa575798ddf1ce0bace8edb47249c798077a81f7f/pillow-12.0.0-cp313-cp313t-win_arm64.whl", hash = "sha256:26d9f7d2b604cd23aba3e9faf795787456ac25634d82cd060556998e39c6fa47", size = 2437834, upload-time = "2025-10-15T18:23:08.194Z" }, ] [[package]] name = "platformdirs" -version = "4.5.1" +version = "4.5.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cf/86/0248f086a84f01b37aaec0fa567b397df1a119f73c16f6c7a9aac73ea309/platformdirs-4.5.1.tar.gz", hash = "sha256:61d5cdcc6065745cdd94f0f878977f8de9437be93de97c1c12f853c9c0cdcbda", size = 21715, upload-time = "2025-12-05T13:52:58.638Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/61/33/9611380c2bdb1225fdef633e2a9610622310fed35ab11dac9620972ee088/platformdirs-4.5.0.tar.gz", hash = "sha256:70ddccdd7c99fc5942e9fc25636a8b34d04c24b335100223152c2803e4063312", size = 21632, upload-time = "2025-10-08T17:44:48.791Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/cb/28/3bfe2fa5a7b9c46fe7e13c97bda14c895fb10fa2ebf1d0abb90e0cea7ee1/platformdirs-4.5.1-py3-none-any.whl", hash = "sha256:d03afa3963c806a9bed9d5125c8f4cb2fdaf74a55ab60e5d59b3fde758104d31", size = 18731, upload-time = "2025-12-05T13:52:56.823Z" }, + { url = "https://files.pythonhosted.org/packages/73/cb/ac7874b3e5d58441674fb70742e6c374b28b0c7cb988d37d991cde47166c/platformdirs-4.5.0-py3-none-any.whl", hash = "sha256:e578a81bb873cbb89a41fcc904c7ef523cc18284b7e3b3ccf06aca1403b7ebd3", size = 18651, upload-time = "2025-10-08T17:44:47.223Z" }, ] [[package]] @@ -711,16 +810,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/88/74/a88bf1b1efeae488a0c0b7bdf71429c313722d1fc0f377537fbe554e6180/pre_commit-4.2.0-py2.py3-none-any.whl", hash = "sha256:a009ca7205f1eb497d10b845e52c838a98b6cdd2102a6c8e4540e94ee75c58bd", size = 220707, upload-time = "2025-03-18T21:35:19.343Z" }, ] +[[package]] +name = "protobuf" +version = "6.33.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/03/a1440979a3f74f16cab3b75b0da1a1a7f922d56a8ddea96092391998edc0/protobuf-6.33.1.tar.gz", hash = "sha256:97f65757e8d09870de6fd973aeddb92f85435607235d20b2dfed93405d00c85b", size = 443432, upload-time = "2025-11-13T16:44:18.895Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/06/f1/446a9bbd2c60772ca36556bac8bfde40eceb28d9cc7838755bc41e001d8f/protobuf-6.33.1-cp310-abi3-win32.whl", hash = "sha256:f8d3fdbc966aaab1d05046d0240dd94d40f2a8c62856d41eaa141ff64a79de6b", size = 425593, upload-time = "2025-11-13T16:44:06.275Z" }, + { url = 
"https://files.pythonhosted.org/packages/a6/79/8780a378c650e3df849b73de8b13cf5412f521ca2ff9b78a45c247029440/protobuf-6.33.1-cp310-abi3-win_amd64.whl", hash = "sha256:923aa6d27a92bf44394f6abf7ea0500f38769d4b07f4be41cb52bd8b1123b9ed", size = 436883, upload-time = "2025-11-13T16:44:09.222Z" }, + { url = "https://files.pythonhosted.org/packages/08/b4/46310463b4f6ceef310f8348786f3cff181cea671578e3d9743ba61a459e/protobuf-6.33.1-py3-none-any.whl", hash = "sha256:d595a9fd694fdeb061a62fbe10eb039cc1e444df81ec9bb70c7fc59ebcb1eafa", size = 170477, upload-time = "2025-11-13T16:44:17.633Z" }, +] + [[package]] name = "psutil" -version = "7.2.1" +version = "7.1.3" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/73/cb/09e5184fb5fc0358d110fc3ca7f6b1d033800734d34cac10f4136cfac10e/psutil-7.2.1.tar.gz", hash = "sha256:f7583aec590485b43ca601dd9cea0dcd65bd7bb21d30ef4ddbf4ea6b5ed1bdd3", size = 490253, upload-time = "2025-12-29T08:26:00.169Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e1/88/bdd0a41e5857d5d703287598cbf08dad90aed56774ea52ae071bae9071b6/psutil-7.1.3.tar.gz", hash = "sha256:6c86281738d77335af7aec228328e944b30930899ea760ecf33a4dba66be5e74", size = 489059, upload-time = "2025-11-02T12:25:54.619Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fb/23/851cadc9764edcc18f0effe7d0bf69f727d4cf2442deb4a9f78d4e4f30f2/psutil-7.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:923f8653416604e356073e6e0bccbe7c09990acef442def2f5640dd0faa9689f", size = 139081, upload-time = "2025-12-29T08:26:12.483Z" }, - { url = "https://files.pythonhosted.org/packages/59/82/d63e8494ec5758029f31c6cb06d7d161175d8281e91d011a4a441c8a43b5/psutil-7.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:cfbe6b40ca48019a51827f20d830887b3107a74a79b01ceb8cc8de4ccb17b672", size = 134767, upload-time = "2025-12-29T08:26:14.528Z" }, - { url = 
"https://files.pythonhosted.org/packages/34/68/d9317542e3f2b180c4306e3f45d3c922d7e86d8ce39f941bb9e2e9d8599e/psutil-7.2.1-cp37-abi3-win_amd64.whl", hash = "sha256:b1b0671619343aa71c20ff9767eced0483e4fc9e1f489d50923738caf6a03c17", size = 136938, upload-time = "2025-12-29T08:26:41.036Z" }, - { url = "https://files.pythonhosted.org/packages/3e/73/2ce007f4198c80fcf2cb24c169884f833fe93fbc03d55d302627b094ee91/psutil-7.2.1-cp37-abi3-win_arm64.whl", hash = "sha256:0d67c1822c355aa6f7314d92018fb4268a76668a536f133599b91edd48759442", size = 133836, upload-time = "2025-12-29T08:26:43.086Z" }, + { url = "https://files.pythonhosted.org/packages/a6/82/62d68066e13e46a5116df187d319d1724b3f437ddd0f958756fc052677f4/psutil-7.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:18349c5c24b06ac5612c0428ec2a0331c26443d259e2a0144a9b24b4395b58fa", size = 249642, upload-time = "2025-11-02T12:26:07.447Z" }, + { url = "https://files.pythonhosted.org/packages/df/ad/c1cd5fe965c14a0392112f68362cfceb5230819dbb5b1888950d18a11d9f/psutil-7.1.3-cp313-cp313t-win_arm64.whl", hash = "sha256:c525ffa774fe4496282fb0b1187725793de3e7c6b29e41562733cae9ada151ee", size = 245518, upload-time = "2025-11-02T12:26:09.719Z" }, + { url = "https://files.pythonhosted.org/packages/55/4c/c3ed1a622b6ae2fd3c945a366e64eb35247a31e4db16cf5095e269e8eb3c/psutil-7.1.3-cp37-abi3-win_amd64.whl", hash = "sha256:f39c2c19fe824b47484b96f9692932248a54c43799a84282cfe58d05a6449efd", size = 247633, upload-time = "2025-11-02T12:26:33.887Z" }, + { url = "https://files.pythonhosted.org/packages/c9/ad/33b2ccec09bf96c2b2ef3f9a6f66baac8253d7565d8839e024a6b905d45d/psutil-7.1.3-cp37-abi3-win_arm64.whl", hash = "sha256:bd0d69cee829226a761e92f28140bec9a5ee9d5b4fb4b0cc589068dbfff559b1", size = 244608, upload-time = "2025-11-02T12:26:36.136Z" }, ] [[package]] @@ -820,6 +930,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/df/80/fc9d01d5ed37ba4c42ca2b55b4339ae6e200b456be3a1aaddf4a9fa99b8c/pyperclip-1.11.0-py3-none-any.whl", hash = 
"sha256:299403e9ff44581cb9ba2ffeed69c7aa96a008622ad0c46cb575ca75b5b84273", size = 11063, upload-time = "2025-09-26T14:40:36.069Z" }, ] +[[package]] +name = "pyreadline3" +version = "3.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/49/4cea918a08f02817aabae639e3d0ac046fef9f9180518a3ad394e22da148/pyreadline3-3.5.4.tar.gz", hash = "sha256:8d57d53039a1c75adba8e50dd3d992b28143480816187ea5efbd5c78e6c885b7", size = 99839, upload-time = "2024-09-19T02:40:10.062Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5a/dc/491b7661614ab97483abf2056be1deee4dc2490ecbf7bff9ab5cdbac86e1/pyreadline3-3.5.4-py3-none-any.whl", hash = "sha256:eaf8e6cc3c49bcccf145fc6067ba8643d1df34d604a1ec0eccbf7a18e6d3fae6", size = 83178, upload-time = "2024-09-19T02:40:08.598Z" }, +] + [[package]] name = "pyrect" version = "0.2.0" @@ -1119,6 +1238,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/52/a7/d2782e4e3f77c8450f727ba74a8f12756d5ba823d81b941f1b04da9d033a/sphinxcontrib_serializinghtml-2.0.0-py3-none-any.whl", hash = "sha256:6e2cb0eef194e10c27ec0023bfeb25badbbb5868244cf5bc5bdc04e4464bf331", size = 92072, upload-time = "2024-07-29T01:10:08.203Z" }, ] +[[package]] +name = "sympy" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "mpmath", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921, upload-time = "2025-04-27T18:05:01.611Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = 
"2025-04-27T18:04:59.103Z" }, +] + [[package]] name = "tomli" version = "2.3.0" @@ -1165,11 +1296,11 @@ wheels = [ [[package]] name = "urllib3" -version = "2.6.3" +version = "2.5.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } +sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, + { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, ] [[package]] @@ -1185,16 +1316,16 @@ wheels = [ [[package]] name = "virtualenv" -version = "20.36.0" +version = "20.35.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "distlib", marker = "sys_platform == 'win32'" }, { name = "filelock", marker = "sys_platform == 'win32'" }, { name = "platformdirs", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/78/49/87e23d8f742f10f965bce5d6b285fc88a4f436b11daf6b6225d4d66f8492/virtualenv-20.36.0.tar.gz", hash = 
"sha256:a3601f540b515a7983508113f14e78993841adc3d83710fa70f0ac50f43b23ed", size = 6032237, upload-time = "2026-01-07T17:20:04.975Z" } +sdist = { url = "https://files.pythonhosted.org/packages/20/28/e6f1a6f655d620846bd9df527390ecc26b3805a0c5989048c210e22c5ca9/virtualenv-20.35.4.tar.gz", hash = "sha256:643d3914d73d3eeb0c552cbb12d7e82adf0e504dbf86a3182f8771a153a1971c", size = 6028799, upload-time = "2025-10-29T06:57:40.511Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/eb/6a/0af36875e0023a1f2d0b66b4051721fc26740e947696922df1665b75e5d3/virtualenv-20.36.0-py3-none-any.whl", hash = "sha256:e7ded577f3af534fd0886d4ca03277f5542053bedb98a70a989d3c22cfa5c9ac", size = 6008261, upload-time = "2026-01-07T17:20:02.87Z" }, + { url = "https://files.pythonhosted.org/packages/79/0c/c05523fa3181fdf0c9c52a6ba91a23fbf3246cc095f26f6516f9c60e6771/virtualenv-20.35.4-py3-none-any.whl", hash = "sha256:c21c9cede36c9753eeade68ba7d523529f228a403463376cf821eaae2b650f1b", size = 6005095, upload-time = "2025-10-29T06:57:37.598Z" }, ] [[package]] From ea5825b09f9e22bde8563fd94b9a2cca5948fdb1 Mon Sep 17 00:00:00 2001 From: Tianze Date: Fri, 30 Jan 2026 09:00:52 +0800 Subject: [PATCH 2/3] replace the default model with Mozilla's distilvit (#19530) Description of user facing changes: replace the default model with Mozilla's distilvit Description of developer facing changes: None Description of development approach: None --- source/_localCaptioner/modelDownloader.py | 2 +- source/config/configSpec.py | 2 +- source/gui/_localCaptioner/messageDialogs.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/source/_localCaptioner/modelDownloader.py b/source/_localCaptioner/modelDownloader.py index 476b91dd926..1f725b95f87 100644 --- a/source/_localCaptioner/modelDownloader.py +++ b/source/_localCaptioner/modelDownloader.py @@ -660,7 +660,7 @@ def _waitForRetry(self, attempt: int, threadId: int) -> bool: def downloadModelsMultithreaded( self, modelsDir: str = 
WritePaths.modelsDir, - modelName: str = "Xenova/vit-gpt2-image-captioning", + modelName: str = "Mozilla/distilvit", filesToDownload: list[str] | None = None, resolvePath: str = "/resolve/main", progressCallback: ProgressCallback | None = None, diff --git a/source/config/configSpec.py b/source/config/configSpec.py index 3696b6fc169..9790082625a 100644 --- a/source/config/configSpec.py +++ b/source/config/configSpec.py @@ -531,7 +531,7 @@ [automatedImageDescriptions] enable = boolean(default=false) - defaultModel = string(default="Xenova/vit-gpt2-image-captioning") + defaultModel = string(default="Mozilla/distilvit") [screenCurtain] enabled = boolean(default=false) diff --git a/source/gui/_localCaptioner/messageDialogs.py b/source/gui/_localCaptioner/messageDialogs.py index c7a3e7c32cd..27fa32d7b74 100644 --- a/source/gui/_localCaptioner/messageDialogs.py +++ b/source/gui/_localCaptioner/messageDialogs.py @@ -110,7 +110,7 @@ def openDownloadDialog(self) -> None: message=pgettext( "imageDesc", # Translators: label of dialog when downloading image captioning - "Image captioning not installed. Would you like to install (235 MB)?", + "Image captioning not installed. 
Would you like to install (178 MB)?", ), dialogType=DialogType.WARNING, buttons=confirmationButtons, From 6e78df165bbc3c0562f422a0a762adff7833a5f7 Mon Sep 17 00:00:00 2001 From: Tianze Date: Fri, 30 Jan 2026 13:46:56 +0800 Subject: [PATCH 3/3] fix unit test for image description (#19535) --- tests/unit/test_localCaptioner/test_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_localCaptioner/test_downloader.py b/tests/unit/test_localCaptioner/test_downloader.py index f3022a8c2da..946f5322574 100644 --- a/tests/unit/test_localCaptioner/test_downloader.py +++ b/tests/unit/test_localCaptioner/test_downloader.py @@ -39,7 +39,7 @@ def test_ensureModelsDirectory_success(self, mockMkdir): """Ensure directory is created and correct path returned.""" mockMkdir.return_value = None modelsDir = self.downloader.ensureModelsDirectory() - self.assertTrue(modelsDir.endswith("vit-gpt2-image-captioning")) + self.assertTrue(modelsDir.endswith("distilvit")) mockMkdir.assert_called_once() @patch("pathlib.Path.mkdir", side_effect=OSError("Permission denied"))