Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docs/scaling.md
Original file line number Diff line number Diff line change
Expand Up @@ -231,8 +231,7 @@ These wrappers compose together in a pipeline:

```python
from obstore.store import from_url
from obspec_utils.cache import CachingReadableStore
from obspec_utils.splitting import SplittingReadableStore
from obspec_utils.wrappers import CachingReadableStore, SplittingReadableStore
from virtualizarr.registry import ObjectStoreRegistry
import virtualizarr as vz

Expand Down
9 changes: 4 additions & 5 deletions examples/V2/goes_with_caching_stores.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "virtualizarr>=0.2.4",
# "virtualizarr>=2.4.0",
# "obstore",
# "obspec-utils",
# "obspec-utils>=0.8.0",
# "xarray",
# "numpy",
# "h5py",
Expand Down Expand Up @@ -54,10 +54,9 @@

import time

from obspec_utils.cache import CachingReadableStore
from obspec_utils.obspec import BufferedStoreReader
from obspec_utils.readers import BufferedStoreReader
from obspec_utils.registry import ObjectStoreRegistry
from obspec_utils.splitting import SplittingReadableStore
from obspec_utils.wrappers import CachingReadableStore, SplittingReadableStore
from obstore.store import from_url

import virtualizarr as vz
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@ plugins:
- https://lithops-cloud.github.io/docs/objects.inv
- https://docs.dask.org/en/stable/objects.inv
- https://virtual-tiff.readthedocs.io/en/latest/objects.inv
- https://developmentseed.org/obspec/latest/objects.inv
# https://github.com/developmentseed/titiler/blob/50934c929cca2fa8d3c408d239015f8da429c6a8/docs/mkdocs.yml#L115-L140
markdown_extensions:
- admonition
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,9 @@ dependencies = [
"ujson",
"packaging",
"zarr>=3.1.0",
"obspec>=0.1.0",
"obstore>=0.7.0",
"obspec_utils>=0.7.0",
"obspec_utils>=0.8.0",
]

# Dependency sets under optional-dependencies are available via PyPI
Expand Down
30 changes: 23 additions & 7 deletions virtualizarr/parsers/dmrpp.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import io
import warnings
from pathlib import Path
from typing import Any, Iterable
from typing import Any, Iterable, Protocol
from xml.etree import ElementTree as ET

import numpy as np
from obspec_utils.obspec import EagerStoreReader, ReadableStore
from obspec import Get, GetRanges, Head
from obspec_utils.readers import EagerStoreReader
from obspec_utils.registry import ObjectStoreRegistry

from virtualizarr.manifests import (
Expand All @@ -20,6 +21,18 @@


class DMRPPParser:
"""Parser for OPeNDAP DMR++ XML files."""

class Store(Get, GetRanges, Head, Protocol):
"""
Store protocol required by DMRPPParser.

DMRPPParser uses EagerStoreReader, which requires the Get + GetRanges + Head
protocols to fetch the DMR++ XML file efficiently using parallel requests.
"""

pass

def __init__(
self,
group: str | None = None,
Expand All @@ -42,18 +55,21 @@ def __init__(
def __call__(
self,
url: str,
registry: ObjectStoreRegistry,
registry: ObjectStoreRegistry["DMRPPParser.Store"],
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Put this at the top of the file in order to drop the quotes:

from __future__ import annotations

) -> ManifestStore:
"""
Parse the metadata and byte offsets from a given DMR++ file to product a
Parse the metadata and byte offsets from a given DMR++ file to produce a
VirtualiZarr ManifestStore.

Parameters
----------
url
The URL of the input DMR++ file (e.g., "s3://bucket/file.dmrpp").
registry
An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for resolving urls and reading data.
An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for
resolving urls and reading data. The registry must contain stores that
implement the [DMRPPParser.Store][virtualizarr.parsers.dmrpp.DMRPPParser.Store]
protocol (Get + GetRanges + Head).

Returns
-------
Expand Down Expand Up @@ -138,7 +154,7 @@ def __init__(

def parse_dataset(
self,
object_store: ReadableStore,
object_store: Get,
group: str | None = None,
) -> ManifestStore:
"""
Expand Down Expand Up @@ -178,7 +194,7 @@ def parse_dataset(
)

manifest_group = self._parse_dataset(dataset_element)
registry = ObjectStoreRegistry()
registry: ObjectStoreRegistry[Get] = ObjectStoreRegistry()
registry.register(self.data_filepath, object_store)

return ManifestStore(registry=registry, group=manifest_group)
Expand Down
35 changes: 26 additions & 9 deletions virtualizarr/parsers/hdf/hdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
from typing import (
TYPE_CHECKING,
Iterable,
Protocol,
)

import numpy as np
from obspec_utils.obspec import (
ParallelStoreReader,
ReadableFile,
)
from obspec import Get, GetRanges, Head
from obspec_utils.protocols import ReadableFile
from obspec_utils.readers import ParallelStoreReader
from obspec_utils.registry import ObjectStoreRegistry

from virtualizarr.codecs import zarr_codec_config_to_v3
Expand Down Expand Up @@ -140,6 +140,19 @@ def _construct_manifest_group(


class HDFParser:
"""Parser for HDF5 and NetCDF4 files."""

class Store(Get, GetRanges, Head, Protocol):
"""
Store protocol required by HDFParser (with default ParallelStoreReader).

HDFParser uses a configurable reader_factory to create file-like readers.
The default ParallelStoreReader requires Get + GetRanges + Head protocols.
If using a different reader_factory, the store requirements may differ.
"""

pass

def __init__(
self,
group: str | None = None,
Expand All @@ -160,8 +173,9 @@ def __init__(
reader_factory
A callable that creates a file-like reader from a store and path.
Must return an object implementing the
[ReadableFile][obspec_utils.obspec.ReadableFile] protocol.
Default is [ParallelStoreReader][obspec_utils.obspec.ParallelStoreReader].
[ReadableFile][obspec_utils.protocols.ReadableFile] protocol.
Default is [ParallelStoreReader][obspec_utils.readers.ParallelStoreReader],
which requires stores implementing Get + GetRanges + Head.
"""
self.group = group
self.drop_variables = drop_variables
Expand All @@ -170,7 +184,7 @@ def __init__(
def __call__(
self,
url: str,
registry: ObjectStoreRegistry,
registry: ObjectStoreRegistry[HDFParser.Store],
) -> ManifestStore:
"""
Parse the metadata and byte offsets from a given HDF5/NetCDF4 file to produce a VirtualiZarr
Expand All @@ -179,9 +193,12 @@ def __call__(
Parameters
----------
url
The URL of the input HDF5/NetCDF4 file (e.g., `"s3://bucket/store.zarr"`).
The URL of the input HDF5/NetCDF4 file (e.g., `"s3://bucket/file.nc"`).
registry
An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for resolving urls and reading data.
An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for
resolving urls and reading data. The registry must contain stores that
implement the [HDFParser.Store][virtualizarr.parsers.hdf.hdf.HDFParser.Store]
protocol (Get + GetRanges + Head) when using the default ParallelStoreReader.

Returns
-------
Expand Down
21 changes: 19 additions & 2 deletions virtualizarr/parsers/kerchunk/json.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,27 @@
from collections.abc import Iterable
from typing import Protocol

import ujson
from obspec import Get
from obspec_utils.registry import ObjectStoreRegistry

from virtualizarr.manifests import ManifestStore
from virtualizarr.parsers.kerchunk.translator import manifestgroup_from_kerchunk_refs


class KerchunkJSONParser:
"""Parser for Kerchunk JSON reference files."""

class Store(Get, Protocol):
"""
Store protocol required by KerchunkJSONParser.

KerchunkJSONParser only needs to fetch the entire JSON file, so it
requires only the Get protocol from obspec.
"""

pass

def __init__(
self,
group: str | None = None,
Expand Down Expand Up @@ -35,7 +49,7 @@ def __init__(
def __call__(
self,
url: str,
registry: ObjectStoreRegistry,
registry: ObjectStoreRegistry["KerchunkJSONParser.Store"],
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you put this at the top of the file, you can drop the quotes:

from __future__ import annotations

That's why you didn't need the quotes in hdf.py for HDFParser.Store.

) -> ManifestStore:
"""
Parse the metadata and byte offsets from a given Kerchunk JSON to produce a
Expand All @@ -46,7 +60,10 @@ def __call__(
url
The URL of the input Kerchunk JSON (e.g., "s3://bucket/kerchunk.json").
registry
An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for resolving urls and reading data.
An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for
resolving urls and reading data. The registry must contain stores that
implement the [KerchunkJSONParser.Store][virtualizarr.parsers.kerchunk.json.KerchunkJSONParser.Store]
protocol (Get operation).

Returns
-------
Expand Down
12 changes: 9 additions & 3 deletions virtualizarr/parsers/typing.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
from __future__ import annotations

from typing import Callable, Protocol, runtime_checkable
from typing import Any, Callable, Protocol, runtime_checkable

from obspec_utils.obspec import ReadableFile, ReadableStore
from obspec_utils.protocols import ReadableFile
from obspec_utils.registry import ObjectStoreRegistry

from virtualizarr.manifests import ManifestStore

# Type alias for reader factories
ReaderFactory = Callable[[ReadableStore, str], ReadableFile]
# Store type is Any because different readers have different protocol requirements:
# - BufferedStoreReader needs Get + GetRange
# - EagerStoreReader needs Get + GetRanges + Head
# - ParallelStoreReader needs Get + GetRanges + Head
# Each reader's __init__ declares its specific Store protocol for static type checking.
# At runtime, missing methods will raise AttributeError when called.
ReaderFactory = Callable[[Any, str], ReadableFile]
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider allowing for more precise typing:

T = TypeVar("T", bound=Get)

ReaderFactory = Callable[[T, str], ReadableFile]

See related comment in hdf.py, where this allows more precise typing of the reader_factory parameter.



@runtime_checkable
Expand Down
23 changes: 19 additions & 4 deletions virtualizarr/parsers/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
from abc import ABC, abstractmethod
from collections.abc import Iterable
from pathlib import Path
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Protocol

import zarr
from obspec import Get, Head, HeadAsync, List, ListAsync
from obspec_utils.registry import ObjectStoreRegistry
from zarr.api.asynchronous import open_group as open_group_async
from zarr.core.group import GroupMetadata
Expand Down Expand Up @@ -445,6 +446,18 @@ class ZarrParser:
virtualizarr.open_virtual_dataset : High-level function for opening virtual datasets.
"""

class Store(Get, List, ListAsync, Head, HeadAsync, Protocol):
"""
Store protocol required by ZarrParser.

ZarrParser needs listing capabilities to discover chunks and head/size
operations to determine chunk sizes. This protocol composes the minimal
obspec protocols required. Get is included as it's the base protocol
required by ObjectStoreRegistry.
"""

pass

def __init__(
self,
group: str | None = None,
Expand All @@ -468,7 +481,7 @@ def __init__(
def __call__(
self,
url: str,
registry: ObjectStoreRegistry,
registry: ObjectStoreRegistry[ZarrParser.Store],
) -> ManifestStore:
"""
Parse the metadata and byte offsets from a given Zarr store to produce a
Expand All @@ -485,9 +498,11 @@ def __call__(
- Azure Blob Storage: "az://container/path/to/store.zarr"
- HTTP/HTTPS: "https://example.com/store.zarr"

registry : ObjectStoreRegistry
registry : ObjectStoreRegistry[ZarrParser.Store]
An [ObjectStoreRegistry][obspec_utils.registry.ObjectStoreRegistry] for
resolving urls and reading data.
resolving urls and reading data. The registry must contain stores that
implement the [ZarrParser.Store][virtualizarr.parsers.zarr.ZarrParser.Store]
protocol (Get + List + Head operations, including the async List and Head variants).

Returns
-------
Expand Down
2 changes: 1 addition & 1 deletion virtualizarr/tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def obstore_http(url: str) -> ObjectStore:


def manifest_store_from_hdf_url(url, group: str | None = None):
registry = ObjectStoreRegistry()
registry: ObjectStoreRegistry[HDFParser.Store] = ObjectStoreRegistry()
registry.register(url, obstore_local(url=url))
parser = HDFParser(group=group)
return parser(url=url, registry=registry)
Loading