Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
cccd359
feat: enable mypy session for documentai-toolbox
chalmerlowe Apr 16, 2026
6abd528
fix(documentai-toolbox): resolve mypy errors
chalmerlowe Apr 17, 2026
cbbacda
chore(documentai-toolbox): fix linting and formatting
chalmerlowe Apr 17, 2026
20d9ae8
fix(documentai-toolbox): move type ignore to correct line in gcs_util…
chalmerlowe Apr 17, 2026
67eb298
feat(documentai-toolbox): use ConfigOrData type alias for Block fields
chalmerlowe Apr 17, 2026
3ca1496
feat(documentai-toolbox): update ConfigOrData to include List and use…
chalmerlowe Apr 17, 2026
c2c4e4e
fix(documentai-toolbox): replace assert with explicit check in gcs_ut…
chalmerlowe Apr 17, 2026
1dea669
fix(documentai-toolbox): replace redundant cast with isinstance check…
chalmerlowe Apr 17, 2026
5c1fe11
style: blacken block.py
chalmerlowe Apr 17, 2026
94a5f3e
fix(documentai-toolbox): use specific types for Block fields and reve…
chalmerlowe Apr 17, 2026
1a972f0
fix(documentai-toolbox): fix mypy errors in block.py and bbox_convers…
chalmerlowe Apr 17, 2026
7126d34
fix(documentai-toolbox): fix remaining mypy errors in bbox_conversion.py
chalmerlowe Apr 17, 2026
f8312eb
style: blacken and format documentai-toolbox files
chalmerlowe Apr 17, 2026
bddf806
fix(documentai-toolbox): expect dict for bounding_box in Type 2 in bb…
chalmerlowe Apr 17, 2026
49a17b1
fix(documentai-toolbox): allow both list and dict for entities in blo…
chalmerlowe Apr 17, 2026
410c1ad
fix(documentai-toolbox): fix mypy error for storage import in gcs_uti…
chalmerlowe Apr 17, 2026
ff8695d
fix(lint): remove unused SimpleNamespace import in bbox_conversion.py
chalmerlowe Apr 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,14 @@
from .utilities import docai_utilities, gcs_utilities
from .wrappers import document, entity, page

__all__ = (document, page, entity, converter, docai_utilities, gcs_utilities)
__all__ = (
"document",
"page",
"entity",
"converter",
"docai_utilities",
"gcs_utilities",
)


class Python37DeprecationWarning(DeprecationWarning): # pragma: NO COVER
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,12 @@ def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly:
y_multiplier = 1.0
normalized_vertices: List[documentai.NormalizedVertex] = []

if block.page_width and block.page_height:
if (
block.page_width
and block.page_height
and block.docproto_width is not None
and block.docproto_height is not None
):
x_multiplier = _get_multiplier(
docproto_coordinate=block.docproto_width,
external_coordinate=block.page_width,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
import dataclasses
import json
from types import SimpleNamespace
from typing import List, Optional, Type
from typing import Any, List, Optional, Type, cast

from google.cloud import documentai


def _get_target_object(json_data: any, target_object: str) -> Optional[SimpleNamespace]:
def _get_target_object(json_data: Any, target_object: str) -> Any:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The return type of _get_target_object has been changed to Any, which significantly reduces type safety. Given that the docstring explicitly states it returns a SimpleNamespace, it would be better to use Optional[SimpleNamespace] or a more specific type if possible. If this is a temporary measure to resolve mypy errors, please consider adding a TODO to refine this type later.

Copy link
Copy Markdown
Contributor Author

@chalmerlowe chalmerlowe Apr 17, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Locally, Gemini and I have gone round and round today trying to find the ideal collection of types so that mypy passes and so that the unit tests pass.

What we have now is not necessarily ideal, but it is likely better than what we had, and further churn may not be cost-effective.

r"""Returns SimpleNamespace of target_object.

Args:
Expand Down Expand Up @@ -72,47 +72,21 @@ class Block:
page_number:
Optional.
"""
type_: SimpleNamespace = dataclasses.field(init=True, repr=False)
text: SimpleNamespace = dataclasses.field(init=True, repr=False)
bounding_box: Optional[SimpleNamespace] = dataclasses.field(
init=True, repr=False, default=None
)
block_references: Optional[SimpleNamespace] = dataclasses.field(
init=True, repr=False, default=None
)
block_id: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
confidence: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
page_number: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
page_width: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
page_height: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_width: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_height: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_type: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_unit: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_x: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
bounding_y: Optional[SimpleNamespace] = dataclasses.field(
init=False, repr=False, default=None
)
type_: Any = dataclasses.field(init=True, repr=False)
text: Any = dataclasses.field(init=True, repr=False)
bounding_box: Any = dataclasses.field(init=True, repr=False, default=None)
block_references: Any = dataclasses.field(init=True, repr=False, default=None)
block_id: Any = dataclasses.field(init=False, repr=False, default=None)
confidence: Any = dataclasses.field(init=False, repr=False, default=None)
page_number: Any = dataclasses.field(init=False, repr=False, default=None)
page_width: Any = dataclasses.field(init=False, repr=False, default=None)
page_height: Any = dataclasses.field(init=False, repr=False, default=None)
bounding_width: Any = dataclasses.field(init=False, repr=False, default=None)
bounding_height: Any = dataclasses.field(init=False, repr=False, default=None)
bounding_type: Any = dataclasses.field(init=False, repr=False, default=None)
bounding_unit: Any = dataclasses.field(init=False, repr=False, default=None)
bounding_x: Any = dataclasses.field(init=False, repr=False, default=None)
bounding_y: Any = dataclasses.field(init=False, repr=False, default=None)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Many fields in the Block class have been changed from SimpleNamespace or Optional[SimpleNamespace] to Any. While this might resolve immediate mypy errors, it bypasses the benefits of type checking for these attributes. It is recommended to use more specific types or maintain the Optional[SimpleNamespace] annotation where appropriate to ensure long-term maintainability.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment is OBE (overtaken by events). Much like the above comment:

Locally, Gemini and I have gone round and round today trying to find the ideal collection of types so that mypy passes and so that the unit tests pass.

What we have now is not necessarily ideal, but it is likely better than what we had, and further churn may not be cost-effective.

docproto_width: Optional[float] = dataclasses.field(
init=False, repr=False, default=None
)
Expand Down Expand Up @@ -180,7 +154,7 @@ def load_blocks_from_schema(

blocks: List[Block] = []
ens = _get_target_object(objects, entities)
for i in ens:
for i in cast(Any, ens):
Comment thread
chalmerlowe marked this conversation as resolved.
Outdated
entity = i

block_text = ""
Expand All @@ -203,11 +177,13 @@ def load_blocks_from_schema(
b = Block(
type_=block_type,
text=block_text,
bounding_box=_get_target_object(entity, normalized_vertices),
bounding_box=_get_target_object(entity, normalized_vertices)
if normalized_vertices is not None
else None,
)

if id_:
b.id_ = _get_target_object(entity, id_)
b.block_id = _get_target_object(entity, id_)
if confidence:
b.confidence = _get_target_object(entity, confidence)
if page_number and page_number in entity:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ def convert_from_config(

print("-------- Converting Started --------")
files, labels, did_not_convert = _get_docproto_files(
futures_list, project_id, location, processor_id
list(futures_list), project_id, location, processor_id
)

print("-------- Finished Converting --------")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,17 +243,17 @@ def _generate_entity_annotations(
"""
entity_annotations: List[EntityAnnotation] = []
for token in page_info.page.tokens:
v: vision.Vertex = []
v: list[vision.Vertex] = []
if token.layout.bounding_poly.vertices:
for vertex in token.layout.bounding_poly.vertices:
v.append({"x": int(vertex.x), "y": int(vertex.y)})
v.append(vision.Vertex(x=int(vertex.x), y=int(vertex.y)))
else:
for normalized_vertex in token.layout.bounding_poly.normalized_vertices:
v.append(
{
"x": int(normalized_vertex.x * page_info.page.dimension.width),
"y": int(normalized_vertex.y * page_info.page.dimension.height),
}
vision.Vertex(
x=int(normalized_vertex.x * page_info.page.dimension.width),
y=int(normalized_vertex.y * page_info.page.dimension.height),
)
)

text_start_index = token.layout.text_anchor.text_segments[0].start_index
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@

from google.api_core.gapic_v1 import client_info

from google.cloud import documentai, documentai_toolbox, storage
from google.cloud import ( # type: ignore[attr-defined]
documentai,
documentai_toolbox,
storage,
)
from google.cloud.documentai_toolbox import constants


Expand Down Expand Up @@ -91,6 +95,7 @@ def get_blobs(
if gcs_uri:
gcs_bucket_name, gcs_prefix = split_gcs_uri(gcs_uri)

assert gcs_prefix is not None
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

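We should avoid using assert outside of tests: assert statements are stripped when Python runs with -O, so the check would silently disappear. Raise an explicit exception instead.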

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@parthea fixed.

if re.match(constants.FILE_CHECK_REGEX, gcs_prefix):
raise ValueError("gcs_prefix cannot contain file types")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
import glob
import os
import re
from typing import Dict, Iterator, List, Optional, Type, Union
from typing import Any, Dict, Iterable, Iterator, List, Optional, Type, Union

from google.api_core.client_options import ClientOptions
from google.api_core.operation import from_gapic as operation_from_gapic
Expand Down Expand Up @@ -51,7 +51,7 @@ def _document_layout_blocks_from_shards(
shards: List[documentai.Document],
) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
def extract_blocks(
blocks: List[documentai.Document.DocumentLayout.DocumentLayoutBlock],
blocks: Iterable[documentai.Document.DocumentLayout.DocumentLayoutBlock],
) -> Iterator[documentai.Document.DocumentLayout.DocumentLayoutBlock]:
queue = collections.deque(blocks)

Expand Down Expand Up @@ -325,8 +325,9 @@ def _dict_to_bigquery(
bq_client = bigquery.Client(
project=project_id, client_info=gcs_utilities._get_client_info()
)
resolved_project_id = project_id or bq_client.project
table_ref = bigquery.DatasetReference(
project=project_id, dataset_id=dataset_name
project=resolved_project_id, dataset_id=dataset_name
).table(table_name)

job_config = bigquery.LoadJobConfig(
Expand All @@ -345,7 +346,7 @@ def _dict_to_bigquery(


def _apply_text_offset(
documentai_object: Union[Dict[str, Dict], List], text_offset: int
documentai_object: Union[Dict[str, Any], List[Any]], text_offset: int
) -> None:
r"""Applies a text offset to all text_segments in `documentai_object`.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,10 @@ class Entity:

_image: Optional[Image.Image] = dataclasses.field(init=False, default=None)

def __post_init__(self, page_offset: int) -> None:
def __post_init__(self, page_offset: Optional[int]) -> None:
if page_offset is None:
page_offset = 0

self.type_ = self.documentai_object.type_

if self.documentai_object.mention_text:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,17 @@
from abc import ABC
import dataclasses
from functools import cached_property
from typing import Iterable, List, Optional, Type
from typing import Iterable, List, Optional, Type, TypeVar


import pandas as pd

from google.cloud import documentai
from google.cloud.documentai_toolbox.constants import ElementWithLayout
from google.cloud.documentai_toolbox.utilities import docai_utilities

T = TypeVar("T", bound="_BasePageElement")


@dataclasses.dataclass
class Table:
Expand Down Expand Up @@ -180,9 +183,7 @@ def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment:
"""
return self.documentai_object.layout.text_anchor.text_segments[0]

def _get_children_of_element(
self, potential_children: List["_BasePageElement"]
) -> List["_BasePageElement"]:
def _get_children_of_element(self, potential_children: List[T]) -> List[T]:
"""
Filters potential child elements to identify only those fully contained within this element.

Expand Down
19 changes: 15 additions & 4 deletions packages/google-cloud-documentai-toolbox/noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,10 +480,21 @@ def prerelease_deps(session, protobuf_implementation):
@nox.session(python=DEFAULT_PYTHON_VERSION)
def mypy(session):
"""Run the type checker."""

# TODO(https://github.com/googleapis/google-cloud-python/issues/16014):
# Enable mypy once this bug is fixed.
session.skip("Temporarily skip mypy. See issue 16014")
session.install(
"mypy<1.16.0",
Comment thread
chalmerlowe marked this conversation as resolved.
"types-requests",
"types-protobuf",
"pandas-stubs",
)
session.install("-e", ".")
session.run(
"mypy",
"-p",
"google.cloud.documentai_toolbox",
"--check-untyped-defs",
"--ignore-missing-imports",
*session.posargs,
)
Comment thread
chalmerlowe marked this conversation as resolved.


@nox.session(python=DEFAULT_PYTHON_VERSION)
Expand Down
Loading