Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
aa93b8b
updates zarr-parser to use obstore list_async instead of concurrent_map
norlandrhagen Feb 26, 2026
37dff68
removes the zarr vendor code
norlandrhagen Feb 26, 2026
2fa25a7
adds arro3-core to zarr group
norlandrhagen Feb 26, 2026
626d0b9
adds _from_arrow method
norlandrhagen Feb 27, 2026
9d6a312
adds type_checking for pa type hint + import in _from_arrow
norlandrhagen Feb 27, 2026
bab147d
extra import removed
norlandrhagen Feb 27, 2026
17e35cc
adds zarr to test-py31* test group
norlandrhagen Feb 27, 2026
6cbb7c0
Update virtualizarr/manifests/manifest.py
norlandrhagen Feb 27, 2026
b400a34
updates _from_arrow method to have paths, offsets, lengths and opt[sh…
norlandrhagen Feb 27, 2026
19122a7
merge w/ main
norlandrhagen Mar 6, 2026
e22981f
update releases.md
norlandrhagen Mar 6, 2026
fda8ce6
mypy
norlandrhagen Mar 6, 2026
bbd6a1f
mypy-2
norlandrhagen Mar 6, 2026
9cba9e8
update pyproj
norlandrhagen Mar 6, 2026
f50b724
adds new zarr parser deps and fix to accessor
norlandrhagen Mar 6, 2026
1be91cc
Merge branch 'kerchunk_parquet_writer_pyarrow_fx' into zarr-parser-ob…
norlandrhagen Mar 6, 2026
4ed8295
fix double pyproj def
norlandrhagen Mar 6, 2026
9114613
adds requires pyarrow decorator to the test_zarr so mins deps are ok
norlandrhagen Mar 6, 2026
31c8ed0
add strange pyarrow pandas context override to more test_kerchunk.py …
norlandrhagen Mar 6, 2026
e0ddfc2
mypy again
norlandrhagen Mar 6, 2026
d96d5c5
incorporate feedback
norlandrhagen Mar 6, 2026
716a0bb
removed separator normalization and added a method to get chunk separ…
norlandrhagen Mar 9, 2026
7e76088
Merge branch 'main' into zarr-parser-obstore-list
norlandrhagen Mar 9, 2026
5df7705
de-dup pyproj
norlandrhagen Mar 9, 2026
08232a8
mypy
norlandrhagen Mar 9, 2026
3d7ebfc
Merge branch 'main' into zarr-parser-obstore-list
norlandrhagen Mar 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ hdf = [
"imagecodecs-numcodecs==2024.6.1",
]

zarr = ["arro3-core", "pyarrow"]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Feel free to disregard, but ideally you shouldn't need to depend on both dependencies. Pyarrow is very large. I looked recently and it looks like it's gotten even larger

Image

50MB compressed is huge.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah thanks for the feedback @kylebarron. Good to know about pyarrow.


# kerchunk-based parsers
netcdf3 = [
"virtualizarr[remote]",
Expand Down Expand Up @@ -201,14 +203,17 @@ run-tests-html-cov = { cmd = "pytest -n auto --run-network-tests --verbose --cov
min-deps = ["dev", "test", "hdf", "hdf5-lib"] # VirtualiZarr/conftest.py using h5py, so the minimum set of dependencies for testing still includes hdf libs
# Inherit from min-deps to get all the test commands, along with optional dependencies
test = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py312"] # test against python 3.12
test-py311 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py311"] # test against python 3.11
test-py312 = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "zarr", "py312"] # test against python 3.12
minio = ["dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "hdf5-lib", "tiff", "py312", "minio"]
minimum-versions = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "tiff", "hdf5-lib", "minimum-versions"]
upstream = ["dev", "test", "hdf", "hdf5-lib", "netcdf3", "upstream", "icechunk-dev", "py313"]
all = ["dev", "test", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "all_parsers", "all_writers", "py313"]
docs = ["docs", "dev", "remote", "hdf", "netcdf3", "fits", "icechunk", "kerchunk", "kerchunk_parquet", "hdf5-lib", "tiff", "py313"]

[tool.pixi.dependencies]
pytest = "*"

# Define commands to run within the docs environment
[tool.pixi.feature.docs.tasks]
serve-docs = { cmd = "mkdocs serve" }
Expand Down
56 changes: 55 additions & 1 deletion virtualizarr/manifests/manifest.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from __future__ import annotations

import re
from collections.abc import (
Callable,
Expand All @@ -8,13 +10,16 @@
ValuesView,
)
from pathlib import PosixPath
from typing import Any, NewType, TypedDict, cast
from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast

import numpy as np

from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index
from virtualizarr.types import ChunkKey

if TYPE_CHECKING:
import pyarrow as pa

# doesn't guarantee that writers actually handle these
VALID_URI_PREFIXES = {
"s3://",
Expand Down Expand Up @@ -322,6 +327,55 @@ def from_arrays(

return obj

@classmethod
def _from_arrow(
    cls,
    *,
    chunk_keys: "pa.Array",
    paths: "pa.Array",
    sizes: "pa.Array",
    chunk_grid_shape: tuple[int, ...],
) -> "ChunkManifest":
    """
    Create a ChunkManifest directly from PyArrow arrays.

    Parameters
    ----------
    chunk_keys
        Normalized dot-separated chunk keys e.g. "1.2", as a PyArrow Utf8 array.
    paths
        Full paths to chunks, as a PyArrow Utf8 array.
    sizes
        Chunk sizes in bytes, as a PyArrow UInt64 array.
    chunk_grid_shape
        Shape of the chunk grid, used to pre-allocate numpy arrays.

    Raises
    ------
    ValueError
        If the chunk keys do not split into exactly ``len(chunk_grid_shape)``
        dot-separated parts each.
    """
    # Imported lazily so pyarrow remains an optional dependency.
    import pyarrow as pa
    import pyarrow.compute as pc

    paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
    offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
    lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))

    if len(chunk_keys) > 0:
        # split "1.2" into [1, 2], then reshape to (n_chunks, ndim), use for indexing
        ndim = len(chunk_grid_shape)
        indices_flat = pc.cast(
            pc.split_pattern(chunk_keys, pattern=".").flatten(), pa.int64()
        ).to_numpy()
        # Validate before reshape: a malformed key (wrong number of parts)
        # would otherwise raise a cryptic reshape error or silently misalign.
        if ndim == 0 or indices_flat.size != len(chunk_keys) * ndim:
            raise ValueError(
                f"chunk keys must each have {ndim} dot-separated parts to match "
                f"chunk_grid_shape={chunk_grid_shape}; got {indices_flat.size} "
                f"total parts for {len(chunk_keys)} keys"
            )
        indices_2d = indices_flat.reshape(-1, ndim)
        idx_tuple = tuple(indices_2d[:, i] for i in range(ndim))

        # offsets stay zero: each chunk is stored as a whole object,
        # so every entry starts at byte 0 of its file.
        paths_arr[idx_tuple] = paths.to_numpy(zero_copy_only=False)
        lengths_arr[idx_tuple] = sizes.to_numpy()

    return cls.from_arrays(
        paths=paths_arr,
        offsets=offsets_arr,
        lengths=lengths_arr,
    )

@property
def ndim_chunk_grid(self) -> int:
"""
Expand Down
140 changes: 87 additions & 53 deletions virtualizarr/parsers/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast

import numpy as np
Comment thread
norlandrhagen marked this conversation as resolved.
import zarr
from obspec_utils.registry import ObjectStoreRegistry
from zarr.api.asynchronous import open_group as open_group_async
Expand All @@ -19,10 +20,13 @@
ManifestGroup,
ManifestStore,
)
from virtualizarr.manifests.manifest import validate_and_normalize_path_to_uri
from virtualizarr.vendor.zarr.core.common import _concurrent_map
from virtualizarr.manifests.manifest import (
parse_manifest_index,
validate_and_normalize_path_to_uri,
)

if TYPE_CHECKING:
import pyarrow as pa
import zarr

ZarrArrayType = zarr.AsyncArray | zarr.Array
Expand Down Expand Up @@ -90,41 +94,67 @@ async def _handle_scalar_array(


async def _build_chunk_mapping(
    zarr_array: ZarrArrayType, path: str, prefix: str
) -> tuple["pa.Array", "pa.Array", "pa.Array"] | None:
    """
    Build chunk mapping by listing the object store with obstore.

    Uses obstore's list_async with Arrow output to get chunk paths and sizes
    in a single Rust-level call, avoiding per-chunk getsize calls.

    Parameters
    ----------
    zarr_array
        The Zarr array.
    path
        Base path for constructing chunk paths.
    prefix
        Prefix to list and strip from chunk keys.

    Returns
    -------
    Tuple of (normalized_keys, full_paths, sizes) as PyArrow arrays, or None
    if no chunks were found under the prefix.
    """
    # Imported lazily so pyarrow remains an optional dependency.
    import pyarrow as pa
    import pyarrow.compute as pc

    path_batches = []
    size_batches = []
    # NOTE(review): reaches through zarr's ObjectStore wrapper to the
    # underlying obstore store — assumes store.store is an obstore instance.
    stream = zarr_array.store.store.list_async(prefix=prefix, return_arrow=True)
    async for batch in stream:
        pa_path_col = pa.array(batch.column("path"))
        # Drop metadata entries: anything starting with "." (e.g. ".zarray")
        # or containing "/." (dotfiles in subdirectories).
        not_metadata = pc.invert(
            pc.or_(
                pc.match_substring(pa_path_col, pattern="/."),
                pc.starts_with(pa_path_col, "."),
            )
        )

        path_batches.append(pa_path_col.filter(not_metadata))
        size_batches.append(pa.array(batch.column("size")).filter(not_metadata))

    if not path_batches:
        return None

    all_paths = pa.concat_arrays(path_batches)
    all_sizes = pa.concat_arrays(size_batches)

    # Batches may exist but be empty after metadata filtering.
    if len(all_paths) == 0:
        return None

    # normalize: strip prefix, replace / with .
    stripped = pc.utf8_replace_slice(
        all_paths, start=0, stop=len(prefix), replacement=""
    )
    normalized_keys = pc.replace_substring(stripped, pattern="/", replacement=".")

    # construct full paths by joining the base path with each store key
    full_paths = pc.binary_join_element_wise(
        pa.scalar(path.rstrip("/")), all_paths, "/"
    )

    return normalized_keys, full_paths, all_sizes


class ZarrVersionStrategy(ABC):
Expand Down Expand Up @@ -158,24 +188,7 @@ async def get_chunk_mapping(
scalar_key = f"{prefix}0"
return await _handle_scalar_array(zarr_array, path, scalar_key)

# List all keys under the array prefix, filtering out metadata files
prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
if not prefix_keys:
return {}

metadata_files = {".zarray", ".zattrs", ".zgroup", ".zmetadata"}
chunk_keys = []
for key_tuple in prefix_keys:
key = key_tuple[0]
file_name = (
key[len(prefix) :]
if prefix and key.startswith(prefix)
else key.split("/")[-1]
)
if file_name not in metadata_files:
chunk_keys.append(key)

return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
return await _build_chunk_mapping(zarr_array, path, prefix)

def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
"""Convert V2 metadata to V3 format."""
Expand Down Expand Up @@ -272,12 +285,7 @@ async def get_chunk_mapping(

# List chunk keys under the c/ subdirectory
prefix = f"{name}/c/" if name else "c/"
prefix_keys = [(x,) async for x in zarr_array.store.list_prefix(prefix)]
if not prefix_keys:
return {}

chunk_keys = [x[0] for x in prefix_keys]
return await _build_chunk_mapping(chunk_keys, zarr_array, path, prefix)
return await _build_chunk_mapping(zarr_array, path, prefix)

def get_metadata(self, zarr_array: ZarrArrayType) -> ArrayV3Metadata:
"""Return V3 metadata as-is (no conversion needed)."""
Expand Down Expand Up @@ -321,18 +329,44 @@ async def build_chunk_manifest(zarr_array: ZarrArrayType, path: str) -> ChunkMan
missing, Zarr will return the fill_value for those regions when the array is read.
"""
strategy = get_strategy(zarr_array)
chunk_grid_shape = zarr_array._chunk_grid_shape

# For scalar arrays use _from_arrays
chunk_map = await strategy.get_chunk_mapping(zarr_array, path)
if zarr_array.shape == ():
if not chunk_map:
return ChunkManifest(chunk_map, shape=chunk_grid_shape)
paths_arr = np.empty(shape=chunk_grid_shape, dtype=np.dtypes.StringDType())
offsets_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
lengths_arr = np.zeros(shape=chunk_grid_shape, dtype=np.dtype("uint64"))
for key, entry in chunk_map.items():
idx = parse_manifest_index(key)
paths_arr[idx] = entry["path"]
offsets_arr[idx] = entry["offset"]
lengths_arr[idx] = entry["length"]
return ChunkManifest.from_arrays(
paths=paths_arr, offsets=offsets_arr, lengths=lengths_arr
)

if not chunk_map:
import math
# for non scalar arrays, use the new _from_arrow method
name = _get_array_name(zarr_array)
if zarr_array.metadata.zarr_format == 3:
prefix = f"{name}/c/" if name else "c/"
else:
prefix = f"{name}/" if name else ""

if zarr_array.shape and zarr_array.chunks:
chunk_grid_shape = tuple(
math.ceil(s / c) for s, c in zip(zarr_array.shape, zarr_array.chunks)
)
return ChunkManifest(chunk_map, shape=chunk_grid_shape)
result = await _build_chunk_mapping(zarr_array, path, prefix)

if result is None:
return ChunkManifest({}, shape=chunk_grid_shape)

return ChunkManifest(chunk_map)
normalized_keys, full_paths, sizes = result
return ChunkManifest._from_arrow(
chunk_keys=normalized_keys,
paths=full_paths,
sizes=sizes,
chunk_grid_shape=chunk_grid_shape,
)


def get_metadata(zarr_array: ZarrArrayType) -> ArrayV3Metadata:
Expand Down
Loading
Loading