Skip to content
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/testAndPublish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ jobs:
- startupShutdown
- symbols
- vscode
- imageDescriptions
- chrome_annotations
- chrome_list
- chrome_table
Expand Down
4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,9 @@ dependencies = [
"l2m4m==1.0.4",
"pyyaml==6.0.3",
"pymdown-extensions==10.17.1",
# local image captioning
"onnxruntime==1.23.2",
"numpy==2.3.5",
]

[project.urls]
Expand Down Expand Up @@ -335,6 +338,7 @@ system-tests = [
"robotframework==7.3.2",
"robotremoteserver==1.1.1",
"robotframework-screencaplibrary==1.6.0",
"onnx==1.19.1",
]
unit-tests = [
# Creating XML unit test reports
Expand Down
4 changes: 4 additions & 0 deletions source/NVDAState.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ def voiceDictsBackupDir(self) -> str:
def updatesDir(self) -> str:
    """Return the path of the ``updates`` directory located under ``configDir``."""
    updatesPath = os.path.join(self.configDir, "updates")
    return updatesPath

@property
def modelsDir(self) -> str:
    """Return the path of the ``models`` directory located under ``configDir``."""
    modelsPath = os.path.join(self.configDir, "models")
    return modelsPath

@property
def nvdaConfigFile(self) -> str:
    """Return the path of the ``nvda.ini`` file located under ``configDir``."""
    iniPath = os.path.join(self.configDir, "nvda.ini")
    return iniPath
Expand Down
44 changes: 44 additions & 0 deletions source/_localCaptioner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025 NV Access Limited, Tianze
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

from logHandler import log

from .imageDescriber import ImageDescriber
from . import modelConfig

_localCaptioner: ImageDescriber | None = None


def initialize():
    """Set up the module-level local captioner.

    Initialises ``modelConfig`` and then stores a new :class:`ImageDescriber`
    in the module-level ``_localCaptioner``.
    """
    global _localCaptioner
    log.debug("Initializing local captioner")
    # modelConfig is initialised before the describer is constructed.
    modelConfig.initialize()
    describer = ImageDescriber()
    _localCaptioner = describer


def terminate():
    """Shut down the module-level local captioner.

    Logs an error and does nothing else when no captioner is currently stored.
    Otherwise the captioner's own ``terminate`` is called and the module-level
    reference is cleared.
    """
    global _localCaptioner
    captioner = _localCaptioner
    if captioner is not None:
        log.debug("Terminating local captioner")
        captioner.terminate()
        # Only cleared once the describer's terminate() has returned.
        _localCaptioner = None
    else:
        log.error("local captioner not running")


def isModelLoaded() -> bool:
    """Report whether the local captioner currently has its model loaded.

    :return: ``False`` when no captioner has been initialised, otherwise the
        captioner's ``isModelLoaded`` value.
    """
    if _localCaptioner is None:
        return False
    return _localCaptioner.isModelLoaded


def toggleImageCaptioning() -> None:
    """Toggle the captioning model by delegating to the captioner's ``toggleSwitch``.

    Does nothing when no captioner has been initialised.
    """
    if _localCaptioner is None:
        return
    _localCaptioner.toggleSwitch()
53 changes: 53 additions & 0 deletions source/_localCaptioner/captioner/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025 NV Access Limited, Tianze
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

import json

from logHandler import log
from .base import ImageCaptioner


def imageCaptionerFactory(
    configPath: str,
    encoderPath: str | None = None,
    decoderPath: str | None = None,
    monomericModelPath: str | None = None,
) -> ImageCaptioner:
    """Create the image captioner matching the architecture named in the model config.

    :param configPath: Path to the model's JSON configuration file.
    :param encoderPath: Path to the encoder model file.
    :param decoderPath: Path to the decoder model file.
    :param monomericModelPath: Path to a single merged model file.
        None of the architectures handled here consumes a merged model yet.
    :raises ValueError: If neither a single model nor both encoder and decoder are provided,
        or if the configured architecture needs encoder and decoder files that were not provided.
    :raises FileNotFoundError: If the config file is not found.
    :raises NotImplementedError: If the config names no architecture or an unsupported one.
    :raises Exception: If the config file fails to load for any other reason (logged, then re-raised).
    :return: An instance of ImageCaptioner.
    """
    hasSplitModel = bool(encoderPath and decoderPath)
    if not monomericModelPath and not hasSplitModel:
        raise ValueError(
            "You must provide either 'monomericModelPath' or both 'encoderPath' and 'decoderPath'.",
        )

    try:
        with open(configPath, "r", encoding="utf-8") as f:
            config = json.load(f)
    except FileNotFoundError as e:
        # Re-raised with a user-oriented message; the original error is kept as the cause.
        raise FileNotFoundError(
            f"Caption model config file {configPath} not found, "
            "please download models and config file first!",
        ) from e
    except Exception:
        # A missing file is handled above, so this covers unreadable or malformed config files.
        log.exception(f"Failed to load caption model config file {configPath}")
        raise

    # Read the first declared architecture defensively so that a malformed config
    # is reported as an unsupported architecture rather than a KeyError/IndexError.
    architectures = config.get("architectures") if isinstance(config, dict) else None
    if isinstance(architectures, list) and architectures:
        modelArchitecture = architectures[0]
    else:
        modelArchitecture = None

    if modelArchitecture != "VisionEncoderDecoderModel":
        raise NotImplementedError("Unsupported model architectures")

    # The captioner below is built from separate encoder and decoder files only;
    # reject a merged-model-only request instead of passing None paths on.
    # NOTE(review): assumes VitGpt2ImageCaptioner cannot work without both paths - confirm.
    if not hasSplitModel:
        raise ValueError(
            "The 'VisionEncoderDecoderModel' architecture requires both 'encoderPath' and 'decoderPath'.",
        )

    # Imported lazily, only once this architecture has actually been selected.
    from .vitGpt2 import VitGpt2ImageCaptioner

    return VitGpt2ImageCaptioner(encoderPath, decoderPath, configPath)
24 changes: 24 additions & 0 deletions source/_localCaptioner/captioner/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# A part of NonVisual Desktop Access (NVDA)
# Copyright (C) 2025 NV Access Limited, Tianze
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt

from abc import ABC, abstractmethod


class ImageCaptioner(ABC):
"""Abstract interface for image caption generation.

Supports generate caption for image
"""

@abstractmethod
def generateCaption(self, image: str | bytes, maxLength: int | None = None) -> str:
"""
Generate a caption for the given image.

:param image: Image file path or binary data.
:param maxLength: Optional maximum length for the generated caption.
:return: The generated image caption as a string.
"""
pass
Loading
Loading