Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
9509c05
add empty release notes
TomNicholas Oct 24, 2025
4214d39
Merge branch 'main' of https://github.com/TomNicholas/VirtualiZarr
TomNicholas Oct 28, 2025
a44956c
Merge branch 'main' of https://github.com/zarr-developers/VirtualiZarr
TomNicholas Jan 24, 2026
f6dc640
Merge branch 'main' of https://github.com/zarr-developers/VirtualiZarr
TomNicholas Jan 24, 2026
3e413e2
Merge branch 'main' of https://github.com/zarr-developers/VirtualiZarr
TomNicholas Jan 25, 2026
eeb4b1e
rewrite to pass pyarrow arrays to icechunk
TomNicholas Jan 25, 2026
e6b3f59
delete generate_chunk_key
TomNicholas Jan 25, 2026
46a8802
linting
TomNicholas Jan 25, 2026
ac85e64
todos about using pyarrow for validation
TomNicholas Jan 25, 2026
e63c460
add temporary performance measuring print statements
TomNicholas Jan 26, 2026
572baaa
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 26, 2026
9e61707
add temporary performance measuring print statements
TomNicholas Jan 26, 2026
703975b
remove tests of generate_chunk_key
TomNicholas Jan 26, 2026
f00a487
turn pyarrow into a python list
TomNicholas Jan 26, 2026
af052a2
Merge branch 'pyarrow-to-icechunk' of https://github.com/TomNicholas/…
TomNicholas Jan 26, 2026
8699fc3
todo about deps
TomNicholas Jan 26, 2026
58858f5
todo about iterators and skipping empty paths
TomNicholas Jan 26, 2026
338d744
remove todo about using an iterator
TomNicholas Jan 26, 2026
3f20e97
use pyarrow for validation check too
TomNicholas Jan 26, 2026
af93787
make pyarrow imports deferred
TomNicholas Jan 26, 2026
51c07d7
refactor into arrow.py
TomNicholas Jan 27, 2026
722a356
Add arrow.py module and fix scalar array handling in test fixtures
TomNicholas Jan 27, 2026
6866ac9
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 27, 2026
dd01697
Merge branch 'main' into pyarrow-to-icechunk
TomNicholas Mar 10, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,8 @@ def _generate_chunk_entries(
)

if chunk_grid_shape == ():
return {"0": entry_generator((0,), (0,), itemsize)}
# Scalar arrays should not use this function - use ChunkManifest.from_arrays instead
raise ValueError("Use ChunkManifest.from_arrays for scalar arrays")

all_possible_combos = itertools.product(
*[range(length) for length in chunk_grid_shape]
Expand Down Expand Up @@ -403,10 +404,24 @@ def _manifest_array(
codecs=codecs,
dimension_names=dimension_names,
)
entries = _generate_chunk_entries(
shape, chunks, data_type.itemsize, _entry_from_chunk_key
)
chunkmanifest = ChunkManifest(entries=entries)

if shape == ():
# For scalar arrays, use from_arrays with 0-dimensional numpy arrays.
# This avoids ambiguity around chunk keys ("c" vs "0") - the dict-based
# ChunkManifest constructor infers shape from chunk keys, but scalar
# chunk key conventions differ between zarr versions. Using from_arrays
# with shape=() arrays is unambiguous, matching what the HDF parser does.
chunkmanifest = ChunkManifest.from_arrays(
paths=np.array("file:///foo.nc", dtype=np.dtypes.StringDType()),
offsets=np.array(0, dtype=np.uint64),
lengths=np.array(data_type.itemsize, dtype=np.uint64),
)
else:
entries = _generate_chunk_entries(
shape, chunks, data_type.itemsize, _entry_from_chunk_key
)
chunkmanifest = ChunkManifest(entries=entries)

return ManifestArray(chunkmanifest=chunkmanifest, metadata=metadata)

return _manifest_array
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,9 @@ all_parsers = [

# writers
icechunk = [
# TODO this will need to become icechunk 2.0 for passing arrow arrays
"icechunk>=1.1.2",
"pyarrow>=23.0.0",
]


Expand Down
5 changes: 3 additions & 2 deletions virtualizarr/tests/test_manifests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,15 +196,16 @@ def test_broadcast_scalar(self, manifest_array):
marr = manifest_array(shape=(), chunks=())
assert marr.shape == ()
assert marr.chunks == ()
# Scalar manifests created with from_arrays have empty string key
assert marr.manifest.dict() == {
"0": {"path": "file:///foo.0.nc", "offset": 0, "length": 0},
"": {"path": "file:///foo.nc", "offset": 0, "length": 4},
}

expanded = np.broadcast_to(marr, shape=(1,))
assert expanded.shape == (1,)
assert expanded.chunks == (1,)
assert expanded.manifest.dict() == {
"0": {"path": "file:///foo.0.nc", "offset": 0, "length": 0},
"0": {"path": "file:///foo.nc", "offset": 0, "length": 4},
}

@pytest.mark.parametrize(
Expand Down
46 changes: 1 addition & 45 deletions virtualizarr/tests/test_writers/test_icechunk.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@

from virtualizarr.manifests import ChunkManifest, ManifestArray
from virtualizarr.tests.utils import PYTEST_TMP_DIRECTORY_URL_PREFIX
from virtualizarr.writers.icechunk import generate_chunk_key

icechunk = pytest.importorskip("icechunk")

Expand Down Expand Up @@ -258,9 +257,7 @@ def test_validate_containers(
)

# assert that an error is raised when attempting to write to icechunk
with pytest.raises(
ValueError, match="No Virtual Chunk Container set which supports prefix"
):
with pytest.raises(ValueError, match="does not start with any supported prefix"):
vds.vz.to_icechunk(icechunk_filestore)

# assert that no uncommitted changes have been written to Icechunk session
Expand Down Expand Up @@ -394,47 +391,6 @@ def test_checksum(
npt.assert_equal(pressure_array, arr)


def test_generate_chunk_key_no_offset():
# Test case without any offset (append_axis and existing_num_chunks are None)
index = (1, 2, 3)
result = generate_chunk_key(index)
assert result == [1, 2, 3], (
"The chunk key should match the index without any offset."
)


def test_generate_chunk_key_with_offset():
# Test case with offset on append_axis 1
index = (1, 2, 3)
append_axis = 1
existing_num_chunks = 5
result = generate_chunk_key(
index, append_axis=append_axis, existing_num_chunks=existing_num_chunks
)
assert result == [1, 7, 3], "The chunk key should offset the second index by 5."


def test_generate_chunk_key_zero_offset():
# Test case where existing_num_chunks is 0 (no offset should be applied)
index = (4, 5, 6)
append_axis = 1
existing_num_chunks = 0
result = generate_chunk_key(
index, append_axis=append_axis, existing_num_chunks=existing_num_chunks
)
assert result == [4, 5, 6], (
"No offset should be applied when existing_num_chunks is 0."
)


def test_generate_chunk_key_append_axis_out_of_bounds():
# Edge case where append_axis is out of bounds
index = (3, 4)
append_axis = 2 # This is out of bounds for a 2D index
with pytest.raises(ValueError):
generate_chunk_key(index, append_axis=append_axis, existing_num_chunks=1)


def test_roundtrip_coords(
manifest_array, icechunk_filestore: "IcechunkStore", icechunk_repo: "Repository"
):
Expand Down
104 changes: 104 additions & 0 deletions virtualizarr/writers/arrow.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Iterable, cast

import xarray as xr

from virtualizarr.manifests import ChunkManifest, ManifestArray

if TYPE_CHECKING:
import pyarrow as pa


@dataclass(frozen=True)
class ArrowChunkManifest:
    """Arrow-backed chunk manifest for efficient validation and writing to icechunk."""

    # Chunk file locations; missing chunks are represented as nulls.
    locations: "pa.StringArray"
    # Byte offset of each chunk within its file (null for missing chunks).
    offsets: "pa.UInt64Array"
    # Byte length of each chunk (null for missing chunks).
    lengths: "pa.UInt64Array"
    # Shape of the chunk grid the flat arrays were ravelled from.
    shape_chunk_grid: tuple[int, ...]

    @classmethod
    def from_manifest(cls, manifest: ChunkManifest) -> "ArrowChunkManifest":
        """Convert a ChunkManifest to Arrow arrays.

        Empty paths (representing missing chunks) are converted to nulls.
        """
        # Deferred so that importing this module does not require pyarrow.
        import pyarrow as pa

        flat_paths = manifest._paths.ravel()
        num_entries = len(manifest)

        # An empty path string marks a missing chunk; those entries become
        # nulls in every arrow column (True in the mask means null).
        missing = flat_paths == ""

        # Build each column separately, applying the mask at construction
        # time so no extra copies are made.
        location_column = pa.array(
            flat_paths.tolist(), type=pa.string(), size=num_entries, mask=missing
        )
        offset_column = pa.array(
            manifest._offsets.ravel(), type=pa.uint64(), size=num_entries, mask=missing
        )
        length_column = pa.array(
            manifest._lengths.ravel(), type=pa.uint64(), size=num_entries, mask=missing
        )

        return cls(
            locations=location_column,
            offsets=offset_column,
            lengths=length_column,
            shape_chunk_grid=manifest.shape_chunk_grid,
        )


def extract_arrow_manifests(vds: xr.Dataset) -> dict[str, ArrowChunkManifest]:
    """Extract all manifests from a dataset and convert to Arrow format.

    Variables whose underlying data is not a ManifestArray are skipped.
    """
    manifests: dict[str, ArrowChunkManifest] = {}
    for name, var in vds.variables.items():
        data = var.data
        if isinstance(data, ManifestArray):
            manifests[name] = ArrowChunkManifest.from_manifest(data.manifest)
    return manifests


def validate_location_prefixes(
    arrow_manifests: Iterable["ArrowChunkManifest"],
    valid_prefixes: list[str],
) -> None:
    """
    Validate that all chunk locations start with one of the valid prefixes.

    Uses PyArrow compute for efficient validation of large manifests.

    Parameters
    ----------
    arrow_manifests
        Manifests to validate.
    valid_prefixes
        List of allowed location prefixes. If empty, validation is skipped.

    Raises
    ------
    ValueError
        If any non-null location does not start with one of ``valid_prefixes``.
    """
    # An empty prefix list means "no restriction", as documented above.
    # Without this guard, valid_prefixes[0] below raises IndexError.
    if not valid_prefixes:
        return

    # Deferred so pyarrow is only required when validation actually runs.
    import pyarrow.compute as pc

    for manifest in arrow_manifests:
        locations = manifest.locations

        # Build a mask of locations that match at least one prefix.
        # Nulls (missing chunks) become null in the result and are skipped.
        matches = pc.starts_with(locations, valid_prefixes[0])
        for prefix in valid_prefixes[1:]:
            matches = pc.or_(matches, pc.starts_with(locations, prefix))

        # True iff every non-null location matched some prefix; a null scalar
        # (is_valid == False) means there were no non-null locations at all,
        # in which case there is nothing to report.
        all_match = pc.all(matches, skip_nulls=True)

        # If any don't match then do more work to find the first offender
        if all_match.is_valid and not all_match.as_py():
            # Find first invalid location to report in error
            invalid = pc.invert(pc.fill_null(matches, True))
            invalid_indices = pc.indices_nonzero(invalid)
            first_invalid_idx = invalid_indices[0].as_py()
            invalid_location = locations[first_invalid_idx].as_py()
            raise ValueError(
                f"Location {invalid_location!r} does not start with any supported prefix: {valid_prefixes}"
            )
Loading
Loading