diff --git a/docs/scaling.md b/docs/scaling.md index 3e865979..b0c89097 100644 --- a/docs/scaling.md +++ b/docs/scaling.md @@ -231,8 +231,7 @@ These wrappers compose together in a pipeline: ```python from obstore.store import from_url -from obspec_utils.cache import CachingReadableStore -from obspec_utils.splitting import SplittingReadableStore +from obspec_utils.wrappers import CachingReadableStore, SplittingReadableStore from virtualizarr.registry import ObjectStoreRegistry import virtualizarr as vz diff --git a/examples/V2/goes_with_caching_stores.py b/examples/V2/goes_with_caching_stores.py index 67daa561..a2c59774 100644 --- a/examples/V2/goes_with_caching_stores.py +++ b/examples/V2/goes_with_caching_stores.py @@ -2,9 +2,9 @@ # /// script # requires-python = ">=3.11" # dependencies = [ -# "virtualizarr>=0.2.4", +# "virtualizarr>=2.4.0", # "obstore", -# "obspec-utils", +# "obspec-utils>=0.8.0", # "xarray", # "numpy", # "h5py", @@ -54,10 +54,9 @@ import time -from obspec_utils.cache import CachingReadableStore -from obspec_utils.obspec import BufferedStoreReader +from obspec_utils.readers import BufferedStoreReader from obspec_utils.registry import ObjectStoreRegistry -from obspec_utils.splitting import SplittingReadableStore +from obspec_utils.wrappers import CachingReadableStore, SplittingReadableStore from obstore.store import from_url import virtualizarr as vz diff --git a/mkdocs.yml b/mkdocs.yml index 84781868..f72ffa70 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -132,6 +132,7 @@ plugins: - https://lithops-cloud.github.io/docs/objects.inv - https://docs.dask.org/en/stable/objects.inv - https://virtual-tiff.readthedocs.io/en/latest/objects.inv + - https://developmentseed.org/obspec/latest/objects.inv # https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140 markdown_extensions: - admonition diff --git a/pyproject.toml b/pyproject.toml index 6a9b73be..29c8dd5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,8 +28,9 @@ dependencies = [ "ujson", "packaging", "zarr>=3.1.0", + "obspec>=0.1.0", "obstore>=0.7.0", - "obspec_utils>=0.7.0", + "obspec_utils>=0.8.0", ] # Dependency sets under optional-dependencies are available via PyPI diff --git a/virtualizarr/parsers/dmrpp.py b/virtualizarr/parsers/dmrpp.py index 43c6425c..b03382f3 100644 --- a/virtualizarr/parsers/dmrpp.py +++ b/virtualizarr/parsers/dmrpp.py @@ -1,11 +1,12 @@ import io import warnings from pathlib import Path -from typing import Any, Iterable +from typing import Any, Iterable, Protocol from xml.etree import ElementTree as ET import numpy as np -from obspec_utils.obspec import EagerStoreReader, ReadableStore +from obspec import Get, GetRanges, Head +from obspec_utils.readers import EagerStoreReader from obspec_utils.registry import ObjectStoreRegistry from virtualizarr.manifests import ( @@ -20,6 +21,18 @@ class DMRPPParser: + """Parser for OPeNDAP DMR++ XML files.""" + + class Store(Get, GetRanges, Head, Protocol): + """ + Store protocol required by DMRPPParser. + + DMRPPParser uses EagerStoreReader which requires Get + GetRanges + Head + protocols to fetch the DMR++ XML file efficiently using parallel requests. + """ + + pass + def __init__( self, group: str | None = None, @@ -42,10 +55,10 @@ def __init__( def __call__( self, url: str, - registry: ObjectStoreRegistry, + registry: ObjectStoreRegistry["DMRPPParser.Store"], ) -> ManifestStore: """ - Parse the metadata and byte offsets from a given DMR++ file to product a + Parse the metadata and byte offsets from a given DMR++ file to produce a VirtualiZarr ManifestStore. Parameters @@ -53,7 +66,10 @@ def __call__( url The URL of the input DMR++ file (e.g., "s3://bucket/file.dmrpp"). registry - An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for resolving urls and reading data. + An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for + resolving urls and reading data. The registry must contain stores that + implement the [DMRPPParser.Store][virtualizarr.parsers.dmrpp.DMRPPParser.Store] + protocol (Get + GetRanges + Head). Returns ------- @@ -138,7 +154,7 @@ def __init__( def parse_dataset( self, - object_store: ReadableStore, + object_store: Get, group: str | None = None, ) -> ManifestStore: """ @@ -178,7 +194,7 @@ def parse_dataset( ) manifest_group = self._parse_dataset(dataset_element) - registry = ObjectStoreRegistry() + registry: ObjectStoreRegistry[Get] = ObjectStoreRegistry() registry.register(self.data_filepath, object_store) return ManifestStore(registry=registry, group=manifest_group) diff --git a/virtualizarr/parsers/hdf/hdf.py b/virtualizarr/parsers/hdf/hdf.py index e5e7ba11..be9f04b8 100644 --- a/virtualizarr/parsers/hdf/hdf.py +++ b/virtualizarr/parsers/hdf/hdf.py @@ -5,13 +5,13 @@ from typing import ( TYPE_CHECKING, Iterable, + Protocol, ) import numpy as np -from obspec_utils.obspec import ( - ParallelStoreReader, - ReadableFile, -) +from obspec import Get, GetRanges, Head +from obspec_utils.protocols import ReadableFile +from obspec_utils.readers import ParallelStoreReader from obspec_utils.registry import ObjectStoreRegistry from virtualizarr.codecs import zarr_codec_config_to_v3 @@ -140,6 +140,19 @@ def _construct_manifest_group( class HDFParser: + """Parser for HDF5 and NetCDF4 files.""" + + class Store(Get, GetRanges, Head, Protocol): + """ + Store protocol required by HDFParser (with default ParallelStoreReader). + + HDFParser uses a configurable reader_factory to create file-like readers. + The default ParallelStoreReader requires Get + GetRanges + Head protocols. + If using a different reader_factory, the store requirements may differ. + """ + + pass + def __init__( self, group: str | None = None, @@ -160,8 +173,9 @@ def __init__( reader_factory A callable that creates a file-like reader from a store and path. Must return an object implementing the - [ReadableFile][obspec_utils.obspec.ReadableFile] protocol. - Default is [ParallelStoreReader][obspec_utils.obspec.ParallelStoreReader]. + [ReadableFile][obspec_utils.protocols.ReadableFile] protocol. + Default is [ParallelStoreReader][obspec_utils.readers.ParallelStoreReader], + which requires stores implementing Get + GetRanges + Head. """ self.group = group self.drop_variables = drop_variables @@ -170,7 +184,7 @@ def __init__( def __call__( self, url: str, - registry: ObjectStoreRegistry, + registry: ObjectStoreRegistry[HDFParser.Store], ) -> ManifestStore: """ Parse the metadata and byte offsets from a given HDF5/NetCDF4 file to produce a VirtualiZarr @@ -179,9 +193,12 @@ def __call__( Parameters ---------- url - The URL of the input HDF5/NetCDF4 file (e.g., `"s3://bucket/store.zarr"`). + The URL of the input HDF5/NetCDF4 file (e.g., `"s3://bucket/file.nc"`). registry - An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for resolving urls and reading data. + An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for + resolving urls and reading data. The registry must contain stores that + implement the [HDFParser.Store][virtualizarr.parsers.hdf.hdf.HDFParser.Store] + protocol (Get + GetRanges + Head) when using the default ParallelStoreReader. Returns ------- diff --git a/virtualizarr/parsers/kerchunk/json.py b/virtualizarr/parsers/kerchunk/json.py index 18b57772..c3566b06 100644 --- a/virtualizarr/parsers/kerchunk/json.py +++ b/virtualizarr/parsers/kerchunk/json.py @@ -1,6 +1,8 @@ from collections.abc import Iterable +from typing import Protocol import ujson +from obspec import Get from obspec_utils.registry import ObjectStoreRegistry from virtualizarr.manifests import ManifestStore @@ -8,6 +10,18 @@ class KerchunkJSONParser: + """Parser for Kerchunk JSON reference files.""" + + class Store(Get, Protocol): + """ + Store protocol required by KerchunkJSONParser. + + KerchunkJSONParser only needs to fetch the entire JSON file, so it + requires only the Get protocol from obspec. + """ + + pass + def __init__( self, group: str | None = None, @@ -35,7 +49,7 @@ def __init__( def __call__( self, url: str, - registry: ObjectStoreRegistry, + registry: ObjectStoreRegistry["KerchunkJSONParser.Store"], ) -> ManifestStore: """ Parse the metadata and byte offsets from a given Kerchunk JSON to produce a @@ -46,7 +60,10 @@ def __call__( url The URL of the input Kerchunk JSON (e.g., "s3://bucket/kerchunk.json"). registry - An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for resolving urls and reading data. + An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for + resolving urls and reading data. The registry must contain stores that + implement the [KerchunkJSONParser.Store][virtualizarr.parsers.kerchunk.json.KerchunkJSONParser.Store] + protocol (Get operation). Returns ------- diff --git a/virtualizarr/parsers/typing.py b/virtualizarr/parsers/typing.py index a3bc18bf..d327b09c 100644 --- a/virtualizarr/parsers/typing.py +++ b/virtualizarr/parsers/typing.py @@ -1,14 +1,20 @@ from __future__ import annotations -from typing import Callable, Protocol, runtime_checkable +from typing import Any, Callable, Protocol, runtime_checkable -from obspec_utils.obspec import ReadableFile, ReadableStore +from obspec_utils.protocols import ReadableFile from obspec_utils.registry import ObjectStoreRegistry from virtualizarr.manifests import ManifestStore # Type alias for reader factories -ReaderFactory = Callable[[ReadableStore, str], ReadableFile] +# Store type is Any because different readers have different protocol requirements: +# - BufferedStoreReader needs Get + GetRange +# - EagerStoreReader needs Get + GetRanges + Head +# - ParallelStoreReader needs Get + GetRanges + Head +# Each reader's __init__ declares its specific Store protocol for static type checking. +# At runtime, missing methods will raise AttributeError when called. +ReaderFactory = Callable[[Any, str], ReadableFile] @runtime_checkable diff --git a/virtualizarr/parsers/zarr.py b/virtualizarr/parsers/zarr.py index 90922037..c3f995dd 100644 --- a/virtualizarr/parsers/zarr.py +++ b/virtualizarr/parsers/zarr.py @@ -4,9 +4,10 @@ from abc import ABC, abstractmethod from collections.abc import Iterable from pathlib import Path -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Protocol import zarr +from obspec import Get, Head, HeadAsync, List, ListAsync from obspec_utils.registry import ObjectStoreRegistry from zarr.api.asynchronous import open_group as open_group_async from zarr.core.group import GroupMetadata @@ -445,6 +446,18 @@ class ZarrParser: virtualizarr.open_virtual_dataset : High-level function for opening virtual datasets. """ + class Store(Get, List, ListAsync, Head, HeadAsync, Protocol): + """ + Store protocol required by ZarrParser. + + ZarrParser needs listing capabilities to discover chunks and head/size + operations to determine chunk sizes. This protocol composes the minimal + obspec protocols required. Get is included as it's the base protocol + required by ObjectStoreRegistry. + """ + + pass + def __init__( self, group: str | None = None, @@ -468,7 +481,7 @@ def __init__( def __call__( self, url: str, - registry: ObjectStoreRegistry, + registry: ObjectStoreRegistry[ZarrParser.Store], ) -> ManifestStore: """ Parse the metadata and byte offsets from a given Zarr store to produce a @@ -485,9 +498,11 @@ def __call__( - Azure Blob Storage: "az://container/path/to/store.zarr" - HTTP/HTTPS: "https://example.com/store.zarr" - registry : ObjectStoreRegistry + registry : ObjectStoreRegistry[ZarrParser.Store] An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for - resolving urls and reading data. + resolving urls and reading data. The registry must contain stores that + implement the [ZarrParser.Store][virtualizarr.parsers.zarr.ZarrParser.Store] + protocol (List + Head operations). Returns ------- diff --git a/virtualizarr/tests/utils.py b/virtualizarr/tests/utils.py index 73cd26cd..5751aeb9 100644 --- a/virtualizarr/tests/utils.py +++ b/virtualizarr/tests/utils.py @@ -41,7 +41,7 @@ def obstore_http(url: str) -> ObjectStore: def manifest_store_from_hdf_url(url, group: str | None = None): - registry = ObjectStoreRegistry() + registry: ObjectStoreRegistry[HDFParser.Store] = ObjectStoreRegistry() registry.register(url, obstore_local(url=url)) parser = HDFParser(group=group) return parser(url=url, registry=registry)