From bf356feae52a7c1c8196e9dbdfa3f0221160ac49 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 25 Jan 2026 10:52:15 -0500 Subject: [PATCH 01/11] Add set_virtual_refs_from_arrow to core icechunk crate Add a new method that accepts Arrow arrays (StringArray, UInt64Array) for setting virtual chunk references. This avoids creating millions of Python objects when called from the Python bindings. The method: - Takes chunk_grid_shape to compute N-dimensional indices from flat arrays - Validates array lengths match and product equals chunk count - Iterates Arrow arrays efficiently and delegates to existing set_virtual_refs This is gated behind the optional "arrow" feature which adds arrow-array as a dependency. Co-Authored-By: Claude Opus 4.5 --- Cargo.lock | 124 ++++++++++++++++++++++++++++++++++++++++++ icechunk/Cargo.toml | 2 + icechunk/src/store.rs | 82 +++++++++++++++++++++++++++- 3 files changed, 207 insertions(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index 1c69200e8..322464723 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,6 +24,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" dependencies = [ "cfg-if", + "const-random", "getrandom 0.3.4", "once_cell", "version_check", @@ -110,6 +111,55 @@ version = "1.0.100" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +[[package]] +name = "arrow-array" +version = "57.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65ca404ea6191e06bf30956394173337fa9c35f445bd447fe6c21ab944e1a23c" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "57.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "36356383099be0151dacc4245309895f16ba7917d79bdb71a7148659c9206c56" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + +[[package]] +name = "arrow-data" +version = "57.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf87f4ff5fc13290aa47e499a8b669a82c5977c6a1fedce22c7f542c1fd5a597" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-schema" +version = "57.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb63203e8e0e54b288d0d8043ca8fa1013820822a27692ef1b78a977d879f2c" + [[package]] name = "assert_fs" version = "1.1.3" @@ -871,6 +921,26 @@ version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.16", + "once_cell", + "tiny-keccak", +] + [[package]] name = "core-foundation" version = "0.9.4" @@ -967,6 +1037,12 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "crypto-bigint" version = "0.4.9" @@ -1535,6 +1611,18 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" 
+version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1805,6 +1893,7 @@ name = "icechunk" version = "0.3.14" dependencies = [ "anyhow", + "arrow-array", "assert_fs", "async-recursion", "async-stream", @@ -2135,6 +2224,12 @@ version = "0.2.178" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37c93d8daa9d8a012fd8ab92f088405fb202ea0b6ab73ee2482ae66af4f42091" +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + [[package]] name = "libredox" version = "0.1.12" @@ -2296,6 +2391,25 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + [[package]] name = "num-conv" version = "0.1.0" @@ -2318,6 +2432,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" dependencies = [ "autocfg", + "libm", ] [[package]] @@ -3759,6 +3874,15 @@ dependencies = [ "time-core", ] +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + [[package]] 
name = "tinystr" version = "0.8.2" diff --git a/icechunk/Cargo.toml b/icechunk/Cargo.toml index 0048da9ce..a13afabe8 100644 --- a/icechunk/Cargo.toml +++ b/icechunk/Cargo.toml @@ -69,6 +69,7 @@ assert_fs = { version = "1.1.3", optional = true } flexbuffers = "25.12.19" flatbuffers = "25.12.19" urlencoding = "2.1.3" +arrow-array = { version = "57", optional = true } # reqwest's default-features enables default-tls which ultimately leads to an openssl dependencies # which we are not including as it is not included in manylinux wheels. Instead depend on rustls-tls only @@ -101,6 +102,7 @@ workspace = true [features] logs = ["dep:tracing-subscriber"] cli = ["dep:clap", "dep:anyhow", "dep:dialoguer", "dep:dirs", "dep:assert_fs"] +arrow = ["dep:arrow-array"] [[bin]] name = "icechunk" diff --git a/icechunk/src/store.rs b/icechunk/src/store.rs index 1833d06b5..7bf1c1cfb 100644 --- a/icechunk/src/store.rs +++ b/icechunk/src/store.rs @@ -23,7 +23,7 @@ use crate::{ error::ICError, format::{ ByteRange, ChunkIndices, ChunkOffset, Path, PathError, - manifest::{ChunkPayload, VirtualChunkRef}, + manifest::{ChunkPayload, VirtualChunkLocation, VirtualChunkRef}, snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot, NodeType}, }, refs::{RefError, RefErrorKind}, @@ -437,6 +437,71 @@ impl Store { } } + /// Set virtual references from Arrow arrays. + /// + /// This is more efficient than `set_virtual_refs` when called from Python + /// as it avoids creating millions of intermediate Python objects. 
+ /// + /// # Arguments + /// * `array_path` - Path to the array in the store + /// * `chunk_grid_shape` - Shape of the chunk grid (product must equal array lengths) + /// * `locations` - Arrow StringArray of URLs to external files + /// * `offsets` - Arrow UInt64Array of byte offsets + /// * `lengths` - Arrow UInt64Array of byte lengths + /// * `validate_container` - If true, validate locations match registered containers + #[cfg(feature = "arrow")] + #[instrument(skip(self, locations, offsets, lengths))] + pub async fn set_virtual_refs_from_arrow( + &self, + array_path: &Path, + chunk_grid_shape: &[u32], + locations: &arrow_array::StringArray, + offsets: &arrow_array::UInt64Array, + lengths: &arrow_array::UInt64Array, + validate_container: bool, + ) -> StoreResult { + use arrow_array::Array; + + let n_chunks = locations.len(); + + // Validate array lengths match + if offsets.len() != n_chunks || lengths.len() != n_chunks { + return Err(StoreErrorKind::Other( + "Arrow array lengths must match".to_string(), + ) + .into()); + } + + // Validate chunk_grid_shape product equals n_chunks + let expected: usize = chunk_grid_shape.iter().map(|&x| x as usize).product(); + if expected != n_chunks { + return Err(StoreErrorKind::Other(format!( + "chunk_grid_shape product ({}) != array length ({})", + expected, n_chunks + )) + .into()); + } + + // Build refs by iterating Arrow arrays + let refs_iter: Vec<(ChunkIndices, VirtualChunkRef)> = (0..n_chunks) + .map(|i| { + let indices = flat_to_nd_indices(i, chunk_grid_shape); + let location = VirtualChunkLocation::from_absolute_path(locations.value(i)) + .map_err(|e| StoreErrorKind::Other(e.to_string()))?; + let vref = VirtualChunkRef { + location, + offset: offsets.value(i), + length: lengths.value(i), + checksum: None, + }; + Ok((ChunkIndices(indices), vref)) + }) + .collect::, StoreError>>()?; + + // Delegate to existing method + self.set_virtual_refs(array_path, validate_container, refs_iter).await + } + 
#[instrument(skip(self))] pub async fn delete_dir(&self, prefix: &str) -> StoreResult<()> { if self.read_only().await { @@ -837,6 +902,21 @@ impl Store { } } +/// Convert flat index to N-dimensional indices (C-order/row-major). +/// +/// Used by `set_virtual_refs_from_arrow` to compute chunk coordinates +/// from the flat position in the Arrow arrays. +#[cfg(feature = "arrow")] +fn flat_to_nd_indices(flat: usize, shape: &[u32]) -> Vec { + let mut indices = vec![0u32; shape.len()]; + let mut remaining = flat; + for (i, &dim) in shape.iter().enumerate().rev() { + indices[i] = (remaining % dim as usize) as u32; + remaining /= dim as usize; + } + indices +} + async fn set_array_meta( path: Path, user_data: Bytes, From 5a1569fe57f57375138e7e079ad08d449b81329f Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 25 Jan 2026 11:02:54 -0500 Subject: [PATCH 02/11] Add test for set_virtual_refs_from_arrow Tests the Arrow-based virtual refs method with a 2x2 chunk grid using local filesystem virtual chunks. Verifies that chunks can be written using Arrow arrays and read back correctly. 
Co-Authored-By: Claude Opus 4.5 --- icechunk/tests/test_virtual_refs.rs | 74 +++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/icechunk/tests/test_virtual_refs.rs b/icechunk/tests/test_virtual_refs.rs index e0c6c3d9e..5b3b1f0ed 100644 --- a/icechunk/tests/test_virtual_refs.rs +++ b/icechunk/tests/test_virtual_refs.rs @@ -1080,3 +1080,77 @@ async fn test_zarr_store_with_multiple_virtual_chunk_containers() Ok(()) } + +#[cfg(feature = "arrow")] +#[tokio_test] +async fn test_set_virtual_refs_from_arrow() -> Result<(), Box> { + use arrow_array::{StringArray, UInt64Array}; + + let chunk_dir = TempDir::new()?; + let chunk_1 = chunk_dir.path().join("chunk-1").to_str().unwrap().to_owned(); + let chunk_2 = chunk_dir.path().join("chunk-2").to_str().unwrap().to_owned(); + let chunk_3 = chunk_dir.path().join("chunk-3").to_str().unwrap().to_owned(); + let chunk_4 = chunk_dir.path().join("chunk-4").to_str().unwrap().to_owned(); + + let bytes1 = Bytes::copy_from_slice(b"first"); + let bytes2 = Bytes::copy_from_slice(b"second"); + let bytes3 = Bytes::copy_from_slice(b"third!"); + let bytes4 = Bytes::copy_from_slice(b"fourth"); + let chunks = [ + (chunk_1.clone(), bytes1.clone()), + (chunk_2.clone(), bytes2.clone()), + (chunk_3.clone(), bytes3.clone()), + (chunk_4.clone(), bytes4.clone()), + ]; + write_chunks_to_local_fs(chunks.iter().cloned()).await; + + let repo_dir = TempDir::new()?; + let repo = create_local_repository(repo_dir.path(), Some(chunk_dir.path())).await; + let session = repo.writable_session("main").await.unwrap(); + let store = Store::from_session(Arc::new(RwLock::new(session))).await; + + // Set up array metadata + store + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await?; + let zarr_meta = 
Bytes::copy_from_slice(br#"{"zarr_format":3,"node_type":"array","attributes":{},"shape":[2,2],"data_type":"int32","chunk_grid":{"name":"regular","configuration":{"chunk_shape":[1,1]}},"chunk_key_encoding":{"name":"default","configuration":{"separator":"/"}},"fill_value":0,"codecs":[{"name":"bytes","configuration":{"endian":"little"}}],"storage_transformers":[],"dimension_names":["x","y"]}"#); + store.set("array/zarr.json", zarr_meta).await?; + + // Create Arrow arrays for 4 chunks in a 2x2 grid (C-order: [0,0], [0,1], [1,0], [1,1]) + let locations = StringArray::from(vec![ + format!("file://{}", chunk_1), + format!("file://{}", chunk_2), + format!("file://{}", chunk_3), + format!("file://{}", chunk_4), + ]); + let offsets = UInt64Array::from(vec![0, 0, 0, 0]); + let lengths = UInt64Array::from(vec![5, 6, 6, 6]); + + // Call set_virtual_refs_from_arrow + let array_path: Path = "/array".try_into().unwrap(); + let chunk_grid_shape = vec![2u32, 2u32]; + let result = store + .set_virtual_refs_from_arrow( + &array_path, + &chunk_grid_shape, + &locations, + &offsets, + &lengths, + false, // don't validate containers + ) + .await?; + + // Check result is success + assert!(matches!(result, icechunk::store::SetVirtualRefsResult::Done)); + + // Verify we can read the chunks back + assert_eq!(store.get("array/c/0/0", &ByteRange::ALL).await?, bytes1); + assert_eq!(store.get("array/c/0/1", &ByteRange::ALL).await?, bytes2); + assert_eq!(store.get("array/c/1/0", &ByteRange::ALL).await?, bytes3); + assert_eq!(store.get("array/c/1/1", &ByteRange::ALL).await?, bytes4); + + Ok(()) +} From 22df006b9d6156786864e577fca6b4e1f477f3c6 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 25 Jan 2026 11:25:36 -0500 Subject: [PATCH 03/11] Add set_virtual_refs_arr_async to icechunk-python Adds PyO3 bindings for the Arrow-based virtual refs method: - Uses pyo3-arrow for zero-copy FFI from PyArrow arrays - Accepts StringArray for locations, UInt64Array for offsets/lengths - Calls core crate's 
set_virtual_refs_from_arrow method Python API: - IcechunkStore.set_virtual_refs_arr_async() accepts PyArrow arrays - Significantly faster for large numbers of references Co-Authored-By: Claude Opus 4.5 --- Cargo.lock | 285 +++++++++++++++++- icechunk-python/Cargo.toml | 4 +- .../python/icechunk/_icechunk_python.pyi | 11 + icechunk-python/python/icechunk/store.py | 71 +++++ icechunk-python/src/store.rs | 89 ++++++ 5 files changed, 452 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 322464723..096d2fc78 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -122,6 +122,7 @@ dependencies = [ "arrow-data", "arrow-schema", "chrono", + "chrono-tz", "half", "hashbrown 0.16.1", "num-complex", @@ -141,6 +142,28 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-cast" +version = "57.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8e372ed52bd4ee88cc1e6c3859aa7ecea204158ac640b10e187936e7e87074" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + [[package]] name = "arrow-data" version = "57.2.0" @@ -154,11 +177,41 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrow-ord" +version = "57.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c4e0530272ca755d6814218dffd04425c5b7854b87fa741d5ff848bf50aa39" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + [[package]] name = "arrow-schema" version = "57.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6bb63203e8e0e54b288d0d8043ca8fa1013820822a27692ef1b78a977d879f2c" +dependencies = [ + "bitflags 2.10.0", +] + +[[package]] +name = "arrow-select" +version = "57.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"c96d8a1c180b44ecf2e66c9a2f2bbcb8b1b6f14e165ce46ac8bde211a363411b" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num-traits", +] [[package]] name = "assert_fs" @@ -219,6 +272,15 @@ dependencies = [ "syn 2.0.113", ] +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -847,6 +909,16 @@ dependencies = [ "windows-link", ] +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + [[package]] name = "clap" version = "4.5.54" @@ -902,6 +974,16 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "comfy-table" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +dependencies = [ + "unicode-segmentation", + "unicode-width 0.2.2", +] + [[package]] name = "console" version = "0.16.2" @@ -1936,7 +2018,7 @@ dependencies = [ "tempfile", "test-log", "test-strategy", - "thiserror", + "thiserror 2.0.17", "tokio", "tokio-util", "tracing", @@ -1962,6 +2044,7 @@ dependencies = [ name = "icechunk-python" version = "2.0.0-alpha.0" dependencies = [ + "arrow-array", "async-stream", "async-trait", "bytes", @@ -1972,6 +2055,7 @@ dependencies = [ "itertools", "miette", "pyo3", + "pyo3-arrow", "pyo3-async-runtimes", "pyo3-bytes", "rand", @@ -1979,7 +2063,7 @@ dependencies = [ "serde", "serde_json", "strsim", - "thiserror", + "thiserror 2.0.17", "tokio", "typetag", ] @@ -2218,6 +2302,63 @@ 
version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + [[package]] name = "libc" version = "0.2.178" @@ -2291,6 +2432,16 @@ dependencies = [ "regex-automata", ] +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "rawpointer", +] + [[package]] name = "md-5" version = "0.10.6" @@ -2382,6 +2533,21 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "ndarray" +version = "0.17.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520080814a7a6b4a6e9070823bb24b4531daac8c4627e08ba5de8c5ef2f2752d" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -2456,6 +2622,23 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "numpy" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aac2e6a6e4468ffa092ad43c39b81c79196c2bb773b8db4085f695efe3bba17" +dependencies = [ + "half", + "libc", + "ndarray", + "num-complex", + "num-integer", + "num-traits", + "pyo3", + "pyo3-build-config", + "rustc-hash", +] + [[package]] name = "object" version = "0.37.3" @@ -2494,7 +2677,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror", + "thiserror 2.0.17", "tokio", "tracing", "url", @@ -2579,6 +2762,24 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_shared", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project" version = "1.1.10" @@ -2639,6 +2840,15 @@ version = "1.13.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + [[package]] name = "potential_utf" version = "0.1.4" @@ -2754,6 +2964,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab53c047fcd1a1d2a8820fe84f05d6be69e9526be40cb03b73f86b6b03e6d87d" dependencies = [ "chrono", + "chrono-tz", + "indexmap 2.12.1", "indoc", "inventory", "libc", @@ -2766,6 +2978,27 @@ dependencies = [ "unindent", ] +[[package]] +name = "pyo3-arrow" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36b9f03cb749b0326951ebb30e39eda2f32b0b9205dce67e947e65779b8faffc" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", + "chrono", + "chrono-tz", + "half", + "indexmap 2.12.1", + "numpy", + "pyo3", + "thiserror 1.0.69", +] + [[package]] name = "pyo3-async-runtimes" version = "0.27.0" @@ -2875,7 +3108,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.35", "socket2 0.6.1", - "thiserror", + "thiserror 2.0.17", "tokio", "tracing", "web-time", @@ -2896,7 +3129,7 @@ dependencies = [ "rustls 0.23.35", "rustls-pki-types", "slab", - "thiserror", + "thiserror 2.0.17", "tinyvec", "tracing", "web-time", @@ -2978,6 +3211,12 @@ dependencies = [ "rand_core 0.9.3", ] +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + [[package]] name = "redox_syscall" version = "0.5.18" @@ -2995,7 +3234,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 
0.2.16", "libredox", - "thiserror", + "thiserror 2.0.17", ] [[package]] @@ -3561,6 +3800,12 @@ dependencies = [ "rand_core 0.6.4", ] +[[package]] +name = "siphasher" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" + [[package]] name = "slab" version = "0.4.11" @@ -3814,13 +4059,33 @@ dependencies = [ "unicode-width 0.2.2", ] +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl 1.0.69", +] + [[package]] name = "thiserror" version = "2.0.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" dependencies = [ - "thiserror-impl", + "thiserror-impl 2.0.17", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.113", ] [[package]] @@ -4175,6 +4440,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b09c83c3c29d37506a3e260c08c03743a6bb66a9cd432c6934ab501a190571f" +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + [[package]] name = "unicode-width" version = "0.1.14" diff --git a/icechunk-python/Cargo.toml b/icechunk-python/Cargo.toml index 25a0b750c..4989387ce 100644 --- a/icechunk-python/Cargo.toml +++ b/icechunk-python/Cargo.toml @@ -21,8 +21,10 @@ crate-type = ["cdylib"] bytes = "1.11.0" chrono = { version = "0.4.42" } futures = "0.3.31" -icechunk = { path = "../icechunk", 
version = "0.3.14", features = ["logs"] } +icechunk = { path = "../icechunk", version = "0.3.14", features = ["logs", "arrow"] } itertools = "0.14.0" +pyo3-arrow = "0.15" +arrow-array = "57" pyo3 = { version = "0.27.2", features = [ "chrono", "experimental-async", diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 3e8f85ba3..51f5ef900 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -11,6 +11,8 @@ from collections.abc import ( from enum import Enum from typing import Any, TypeAlias +import pyarrow as pa + class S3Options: """Options for accessing an S3-compatible storage backend""" def __init__( @@ -1992,6 +1994,15 @@ class PyStore: chunks: list[VirtualChunkSpec], validate_containers: bool, ) -> list[tuple[int, ...]] | None: ... + async def set_virtual_refs_arr_async( + self, + array_path: str, + chunk_grid_shape: list[int], + locations: pa.StringArray, + offsets: pa.UInt64Array, + lengths: pa.UInt64Array, + validate_containers: bool, + ) -> list[tuple[int, ...]] | None: ... async def delete(self, key: str) -> None: ... async def delete_dir(self, prefix: str) -> None: ... 
@property diff --git a/icechunk-python/python/icechunk/store.py b/icechunk-python/python/icechunk/store.py index a0ab1a958..0c92d4945 100644 --- a/icechunk-python/python/icechunk/store.py +++ b/icechunk-python/python/icechunk/store.py @@ -15,6 +15,8 @@ from zarr.core.sync import SyncMixin if TYPE_CHECKING: + import pyarrow as pa + from icechunk import Session @@ -352,6 +354,75 @@ async def set_virtual_refs_async( array_path, chunks, validate_containers ) + async def set_virtual_refs_arr_async( + self, + array_path: str, + chunk_grid_shape: tuple[int, ...], + locations: "pa.StringArray", + offsets: "pa.UInt64Array", + lengths: "pa.UInt64Array", + *, + validate_containers: bool = True, + ) -> list[tuple[int, ...]] | None: + """Store multiple virtual references using Arrow arrays (async version). + + This method is significantly faster than set_virtual_refs for large + numbers of references as it uses Arrow's zero-copy FFI to avoid + creating Python objects per chunk. + + Parameters + ---------- + array_path : str + The path to the array inside the Zarr store. + Example: "/groupA/groupB/outputs/my-array" + chunk_grid_shape : tuple[int, ...] + Shape of the chunk grid. The product must equal the length of the arrays. + Arrays are assumed to be flattened in C (row-major) order. + locations : pa.StringArray + PyArrow StringArray of URLs to external files containing chunk data + offsets : pa.UInt64Array + PyArrow UInt64Array of byte offsets within each file + lengths : pa.UInt64Array + PyArrow UInt64Array of byte lengths of each chunk + validate_containers : bool + If True, validate that locations match registered virtual chunk containers. + Default is True. + + Returns + ------- + list[tuple[int, ...]] | None + If all virtual references were successfully updated, returns None. + If there were validation errors, returns the chunk indices of all failed references. + + Notes + ----- + This method requires PyArrow to be installed. 
The arrays are passed to + Rust via Arrow's zero-copy FFI, making this much more efficient than + creating millions of VirtualChunkSpec Python objects. + + Example + ------- + >>> import pyarrow as pa + >>> locations = pa.array(["s3://bucket/file1.nc", "s3://bucket/file2.nc"]) + >>> offsets = pa.array([0, 1000], type=pa.uint64()) + >>> lengths = pa.array([1000, 1000], type=pa.uint64()) + >>> await store.set_virtual_refs_arr_async( + ... "/data", + ... chunk_grid_shape=(2,), + ... locations=locations, + ... offsets=offsets, + ... lengths=lengths, + ... ) + """ + return await self._store.set_virtual_refs_arr_async( + array_path, + list(chunk_grid_shape), + locations, + offsets, + lengths, + validate_containers, + ) + async def delete(self, key: str) -> None: """Remove a key from the store diff --git a/icechunk-python/src/store.rs b/icechunk-python/src/store.rs index 1c4b6eabf..07e0708d2 100644 --- a/icechunk-python/src/store.rs +++ b/icechunk-python/src/store.rs @@ -1,5 +1,6 @@ use std::{borrow::Cow, sync::Arc}; +use arrow_array::{Array, StringArray, UInt64Array}; use chrono::Utc; use futures::{StreamExt, TryStreamExt}; use icechunk::{ @@ -11,6 +12,7 @@ use icechunk::{ storage::ETag, store::{SetVirtualRefsResult, StoreError, StoreErrorKind}, }; +use pyo3_arrow::PyArray; use itertools::Itertools as _; use pyo3::{ conversion::IntoPyObjectExt, @@ -488,6 +490,93 @@ impl PyStore { ) } + /// Set virtual references using Arrow arrays (async version). + /// + /// This method is significantly faster than set_virtual_refs for large + /// numbers of references as it uses Arrow's zero-copy FFI to avoid + /// creating Python objects per chunk. 
+ fn set_virtual_refs_arr_async<'py>( + &'py self, + py: Python<'py>, + array_path: String, + chunk_grid_shape: Vec, + locations: PyArray, + offsets: PyArray, + lengths: PyArray, + validate_containers: bool, + ) -> PyResult> { + let store = Arc::clone(&self.0); + + // Extract Arrow arrays from PyArrow via FFI (zero-copy) + // PyArray::into_inner() returns (Arc, Arc), we only need the array + let (locations_arr, _) = locations.into_inner(); + let (offsets_arr, _) = offsets.into_inner(); + let (lengths_arr, _) = lengths.into_inner(); + + // Downcast to concrete types + let locations: Arc = Arc::new( + locations_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| { + PyValueError::new_err("locations must be a StringArray") + })? + .clone(), + ); + let offsets: Arc = Arc::new( + offsets_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| PyValueError::new_err("offsets must be a UInt64Array"))? + .clone(), + ); + let lengths: Arc = Arc::new( + lengths_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| PyValueError::new_err("lengths must be a UInt64Array"))? 
+ .clone(), + ); + + pyo3_async_runtimes::tokio::future_into_py::<_, Option>>>( + py, + async move { + let array_path = if !array_path.starts_with("/") { + format!("/{array_path}") + } else { + array_path + }; + + let path = Path::try_from(array_path).map_err(|e| { + PyValueError::new_err(format!("Invalid array path: {e}")) + })?; + + let res = store + .set_virtual_refs_from_arrow( + &path, + &chunk_grid_shape, + &locations, + &offsets, + &lengths, + validate_containers, + ) + .await + .map_err(PyIcechunkStoreError::from)?; + + match res { + SetVirtualRefsResult::Done => Ok(None), + SetVirtualRefsResult::FailedRefs(vec) => Python::attach(|py| { + let res = vec + .into_iter() + .map(|ci| PyTuple::new(py, ci.0).map(|tup| tup.unbind())) + .try_collect()?; + Ok(Some(res)) + }), + } + }, + ) + } + fn delete<'py>( &'py self, py: Python<'py>, From 289f8b3cd2db585d48dd30a30f8f05df3ee25566 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 25 Jan 2026 11:36:00 -0500 Subject: [PATCH 04/11] Add sync version set_virtual_refs_arr to icechunk-python Adds synchronous counterpart to set_virtual_refs_arr_async for cases where async is not needed or desired. Co-Authored-By: Claude Opus 4.5 --- .../python/icechunk/_icechunk_python.pyi | 9 ++ icechunk-python/python/icechunk/store.py | 69 +++++++++++++++ icechunk-python/src/store.rs | 88 ++++++++++++++++++- 3 files changed, 162 insertions(+), 4 deletions(-) diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 51f5ef900..52a12dafa 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -1994,6 +1994,15 @@ class PyStore: chunks: list[VirtualChunkSpec], validate_containers: bool, ) -> list[tuple[int, ...]] | None: ... 
+ def set_virtual_refs_arr( + self, + array_path: str, + chunk_grid_shape: list[int], + locations: pa.StringArray, + offsets: pa.UInt64Array, + lengths: pa.UInt64Array, + validate_containers: bool, + ) -> list[tuple[int, ...]] | None: ... async def set_virtual_refs_arr_async( self, array_path: str, diff --git a/icechunk-python/python/icechunk/store.py b/icechunk-python/python/icechunk/store.py index 0c92d4945..304e5ee03 100644 --- a/icechunk-python/python/icechunk/store.py +++ b/icechunk-python/python/icechunk/store.py @@ -354,6 +354,75 @@ async def set_virtual_refs_async( array_path, chunks, validate_containers ) + def set_virtual_refs_arr( + self, + array_path: str, + chunk_grid_shape: tuple[int, ...], + locations: "pa.StringArray", + offsets: "pa.UInt64Array", + lengths: "pa.UInt64Array", + *, + validate_containers: bool = True, + ) -> list[tuple[int, ...]] | None: + """Store multiple virtual references using Arrow arrays. + + This method is significantly faster than set_virtual_refs for large + numbers of references as it uses Arrow's zero-copy FFI to avoid + creating Python objects per chunk. + + Parameters + ---------- + array_path : str + The path to the array inside the Zarr store. + Example: "/groupA/groupB/outputs/my-array" + chunk_grid_shape : tuple[int, ...] + Shape of the chunk grid. The product must equal the length of the arrays. + Arrays are assumed to be flattened in C (row-major) order. + locations : pa.StringArray + PyArrow StringArray of URLs to external files containing chunk data + offsets : pa.UInt64Array + PyArrow UInt64Array of byte offsets within each file + lengths : pa.UInt64Array + PyArrow UInt64Array of byte lengths of each chunk + validate_containers : bool + If True, validate that locations match registered virtual chunk containers. + Default is True. + + Returns + ------- + list[tuple[int, ...]] | None + If all virtual references were successfully updated, returns None. 
+ If there were validation errors, returns the chunk indices of all failed references. + + Notes + ----- + This method requires PyArrow to be installed. The arrays are passed to + Rust via Arrow's zero-copy FFI, making this much more efficient than + creating millions of VirtualChunkSpec Python objects. + + Example + ------- + >>> import pyarrow as pa + >>> locations = pa.array(["s3://bucket/file1.nc", "s3://bucket/file2.nc"]) + >>> offsets = pa.array([0, 1000], type=pa.uint64()) + >>> lengths = pa.array([1000, 1000], type=pa.uint64()) + >>> store.set_virtual_refs_arr( + ... "/data", + ... chunk_grid_shape=(2,), + ... locations=locations, + ... offsets=offsets, + ... lengths=lengths, + ... ) + """ + return self._store.set_virtual_refs_arr( + array_path, + list(chunk_grid_shape), + locations, + offsets, + lengths, + validate_containers, + ) + async def set_virtual_refs_arr_async( self, array_path: str, diff --git a/icechunk-python/src/store.rs b/icechunk-python/src/store.rs index 07e0708d2..2592c7f66 100644 --- a/icechunk-python/src/store.rs +++ b/icechunk-python/src/store.rs @@ -490,6 +490,89 @@ impl PyStore { ) } + /// Set virtual references using Arrow arrays (sync version). + /// + /// This method is significantly faster than set_virtual_refs for large + /// numbers of references as it uses Arrow's zero-copy FFI to avoid + /// creating Python objects per chunk. 
+ fn set_virtual_refs_arr( + &self, + py: Python<'_>, + array_path: String, + chunk_grid_shape: Vec, + locations: PyArray, + offsets: PyArray, + lengths: PyArray, + validate_containers: bool, + ) -> PyIcechunkStoreResult>>> { + let store = Arc::clone(&self.0); + + // Extract Arrow arrays from PyArrow via FFI (zero-copy) + let (locations_arr, _) = locations.into_inner(); + let (offsets_arr, _) = offsets.into_inner(); + let (lengths_arr, _) = lengths.into_inner(); + + // Downcast to concrete types + let locations: Arc = Arc::new( + locations_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| PyValueError::new_err("locations must be a StringArray"))? + .clone(), + ); + let offsets: Arc = Arc::new( + offsets_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| PyValueError::new_err("offsets must be a UInt64Array"))? + .clone(), + ); + let lengths: Arc = Arc::new( + lengths_arr + .as_any() + .downcast_ref::() + .ok_or_else(|| PyValueError::new_err("lengths must be a UInt64Array"))? + .clone(), + ); + + py.detach(move || { + pyo3_async_runtimes::tokio::get_runtime().block_on(async move { + let array_path = if !array_path.starts_with("/") { + format!("/{array_path}") + } else { + array_path + }; + + let path = Path::try_from(array_path).map_err(|e| { + PyValueError::new_err(format!("Invalid array path: {e}")) + })?; + + let res = store + .set_virtual_refs_from_arrow( + &path, + &chunk_grid_shape, + &locations, + &offsets, + &lengths, + validate_containers, + ) + .await + .map_err(PyIcechunkStoreError::from)?; + + match res { + SetVirtualRefsResult::Done => Ok(None), + SetVirtualRefsResult::FailedRefs(vec) => Python::attach(|py| { + let res = vec + .into_iter() + .map(|ci| PyTuple::new(py, ci.0).map(|tup| tup.unbind())) + .try_collect()?; + Ok(Some(res)) + }), + } + }) + }) + } + /// Set virtual references using Arrow arrays (async version). 
/// /// This method is significantly faster than set_virtual_refs for large @@ -508,7 +591,6 @@ impl PyStore { let store = Arc::clone(&self.0); // Extract Arrow arrays from PyArrow via FFI (zero-copy) - // PyArray::into_inner() returns (Arc, Arc), we only need the array let (locations_arr, _) = locations.into_inner(); let (offsets_arr, _) = offsets.into_inner(); let (lengths_arr, _) = lengths.into_inner(); @@ -518,9 +600,7 @@ impl PyStore { locations_arr .as_any() .downcast_ref::() - .ok_or_else(|| { - PyValueError::new_err("locations must be a StringArray") - })? + .ok_or_else(|| PyValueError::new_err("locations must be a StringArray"))? .clone(), ); let offsets: Arc = Arc::new( From e956391385580fb3a7462ff3727e03aab2821160 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 25 Jan 2026 11:40:40 -0600 Subject: [PATCH 05/11] Add checksum and arr_offset params to set_virtual_refs_arr - Add checksum parameter to apply a single checksum (datetime or ETag) to all chunks in the array - Add arr_offset parameter to offset computed chunk indices, enabling append operations where new chunks start at non-zero indices - Refactor flat_to_nd_indices to compute indices forward using strides rather than reverse iteration Co-Authored-By: Claude Opus 4.5 --- .../python/icechunk/_icechunk_python.pyi | 8 +++- icechunk-python/python/icechunk/store.py | 22 ++++++++++ icechunk-python/src/store.rs | 16 +++++++ icechunk/src/store.rs | 43 ++++++++++++++----- 4 files changed, 76 insertions(+), 13 deletions(-) diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 52a12dafa..bf984ca82 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -2001,7 +2001,9 @@ class PyStore: locations: pa.StringArray, offsets: pa.UInt64Array, lengths: pa.UInt64Array, - validate_containers: bool, + arr_offset: tuple[int, ...] 
| None = None, + checksum: datetime.datetime | str | None = None, + validate_containers: bool = True, ) -> list[tuple[int, ...]] | None: ... async def set_virtual_refs_arr_async( self, @@ -2010,7 +2012,9 @@ class PyStore: locations: pa.StringArray, offsets: pa.UInt64Array, lengths: pa.UInt64Array, - validate_containers: bool, + arr_offset: tuple[int, ...] | None = None, + checksum: datetime.datetime | str | None = None, + validate_containers: bool = True, ) -> list[tuple[int, ...]] | None: ... async def delete(self, key: str) -> None: ... async def delete_dir(self, prefix: str) -> None: ... diff --git a/icechunk-python/python/icechunk/store.py b/icechunk-python/python/icechunk/store.py index 304e5ee03..c1f522bd8 100644 --- a/icechunk-python/python/icechunk/store.py +++ b/icechunk-python/python/icechunk/store.py @@ -362,6 +362,8 @@ def set_virtual_refs_arr( offsets: "pa.UInt64Array", lengths: "pa.UInt64Array", *, + arr_offset: tuple[int, ...] | None = None, + checksum: datetime | str | None = None, validate_containers: bool = True, ) -> list[tuple[int, ...]] | None: """Store multiple virtual references using Arrow arrays. @@ -384,6 +386,13 @@ def set_virtual_refs_arr( PyArrow UInt64Array of byte offsets within each file lengths : pa.UInt64Array PyArrow UInt64Array of byte lengths of each chunk + arr_offset : tuple[int, ...] | None + Optional offset to add to computed chunk indices. Useful for append + operations where new chunks should be written at an offset from (0,0,...). + Must have the same length as chunk_grid_shape. Default is None. + checksum : datetime | str | None + Optional checksum for all chunks. Can be a datetime (last modified time) + or a string (ETag). Default is None. validate_containers : bool If True, validate that locations match registered virtual chunk containers. Default is True. 
@@ -420,6 +429,8 @@ def set_virtual_refs_arr( locations, offsets, lengths, + list(arr_offset) if arr_offset is not None else None, + checksum, validate_containers, ) @@ -431,6 +442,8 @@ async def set_virtual_refs_arr_async( offsets: "pa.UInt64Array", lengths: "pa.UInt64Array", *, + arr_offset: tuple[int, ...] | None = None, + checksum: datetime | str | None = None, validate_containers: bool = True, ) -> list[tuple[int, ...]] | None: """Store multiple virtual references using Arrow arrays (async version). @@ -453,6 +466,13 @@ async def set_virtual_refs_arr_async( PyArrow UInt64Array of byte offsets within each file lengths : pa.UInt64Array PyArrow UInt64Array of byte lengths of each chunk + arr_offset : tuple[int, ...] | None + Optional offset to add to computed chunk indices. Useful for append + operations where new chunks should be written at an offset from (0,0,...). + Must have the same length as chunk_grid_shape. Default is None. + checksum : datetime | str | None + Optional checksum for all chunks. Can be a datetime (last modified time) + or a string (ETag). Default is None. validate_containers : bool If True, validate that locations match registered virtual chunk containers. Default is True. @@ -489,6 +509,8 @@ async def set_virtual_refs_arr_async( locations, offsets, lengths, + list(arr_offset) if arr_offset is not None else None, + checksum, validate_containers, ) diff --git a/icechunk-python/src/store.rs b/icechunk-python/src/store.rs index 2592c7f66..8401077cc 100644 --- a/icechunk-python/src/store.rs +++ b/icechunk-python/src/store.rs @@ -495,6 +495,7 @@ impl PyStore { /// This method is significantly faster than set_virtual_refs for large /// numbers of references as it uses Arrow's zero-copy FFI to avoid /// creating Python objects per chunk. 
+ #[pyo3(signature = (array_path, chunk_grid_shape, locations, offsets, lengths, arr_offset=None, checksum=None, validate_containers=true))] fn set_virtual_refs_arr( &self, py: Python<'_>, @@ -503,6 +504,8 @@ impl PyStore { locations: PyArray, offsets: PyArray, lengths: PyArray, + arr_offset: Option>, + checksum: Option, validate_containers: bool, ) -> PyIcechunkStoreResult>>> { let store = Arc::clone(&self.0); @@ -535,6 +538,9 @@ impl PyStore { .clone(), ); + // Convert checksum argument to Checksum type + let checksum: Option = checksum.map(|c| c.into()); + py.detach(move || { pyo3_async_runtimes::tokio::get_runtime().block_on(async move { let array_path = if !array_path.starts_with("/") { @@ -554,6 +560,8 @@ impl PyStore { &locations, &offsets, &lengths, + checksum, + arr_offset.as_deref(), validate_containers, ) .await @@ -578,6 +586,7 @@ impl PyStore { /// This method is significantly faster than set_virtual_refs for large /// numbers of references as it uses Arrow's zero-copy FFI to avoid /// creating Python objects per chunk. 
+ #[pyo3(signature = (array_path, chunk_grid_shape, locations, offsets, lengths, arr_offset=None, checksum=None, validate_containers=true))] fn set_virtual_refs_arr_async<'py>( &'py self, py: Python<'py>, @@ -586,6 +595,8 @@ impl PyStore { locations: PyArray, offsets: PyArray, lengths: PyArray, + arr_offset: Option>, + checksum: Option, validate_containers: bool, ) -> PyResult> { let store = Arc::clone(&self.0); @@ -618,6 +629,9 @@ impl PyStore { .clone(), ); + // Convert checksum argument to Checksum type + let checksum: Option = checksum.map(|c| c.into()); + pyo3_async_runtimes::tokio::future_into_py::<_, Option>>>( py, async move { @@ -638,6 +652,8 @@ impl PyStore { &locations, &offsets, &lengths, + checksum, + arr_offset.as_deref(), validate_containers, ) .await diff --git a/icechunk/src/store.rs b/icechunk/src/store.rs index 7bf1c1cfb..fe8de58a7 100644 --- a/icechunk/src/store.rs +++ b/icechunk/src/store.rs @@ -23,7 +23,7 @@ use crate::{ error::ICError, format::{ ByteRange, ChunkIndices, ChunkOffset, Path, PathError, - manifest::{ChunkPayload, VirtualChunkLocation, VirtualChunkRef}, + manifest::{Checksum, ChunkPayload, VirtualChunkLocation, VirtualChunkRef}, snapshot::{ArrayShape, DimensionName, NodeData, NodeSnapshot, NodeType}, }, refs::{RefError, RefErrorKind}, @@ -448,6 +448,8 @@ impl Store { /// * `locations` - Arrow StringArray of URLs to external files /// * `offsets` - Arrow UInt64Array of byte offsets /// * `lengths` - Arrow UInt64Array of byte lengths + /// * `checksum` - Optional checksum applied to all chunks (e.g., LastModified timestamp) + /// * `arr_offset` - Optional offset to add to computed chunk indices (for append operations) /// * `validate_container` - If true, validate locations match registered containers #[cfg(feature = "arrow")] #[instrument(skip(self, locations, offsets, lengths))] @@ -458,6 +460,8 @@ impl Store { locations: &arrow_array::StringArray, offsets: &arrow_array::UInt64Array, lengths: &arrow_array::UInt64Array, + checksum: 
Option, + arr_offset: Option<&[u32]>, validate_container: bool, ) -> StoreResult { use arrow_array::Array; @@ -482,17 +486,29 @@ impl Store { .into()); } + // Validate arr_offset length if provided + if let Some(offset) = arr_offset { + if offset.len() != chunk_grid_shape.len() { + return Err(StoreErrorKind::Other(format!( + "arr_offset length ({}) != chunk_grid_shape length ({})", + offset.len(), + chunk_grid_shape.len() + )) + .into()); + } + } + // Build refs by iterating Arrow arrays let refs_iter: Vec<(ChunkIndices, VirtualChunkRef)> = (0..n_chunks) .map(|i| { - let indices = flat_to_nd_indices(i, chunk_grid_shape); + let indices = flat_to_nd_indices(i, chunk_grid_shape, arr_offset); let location = VirtualChunkLocation::from_absolute_path(locations.value(i)) .map_err(|e| StoreErrorKind::Other(e.to_string()))?; let vref = VirtualChunkRef { location, offset: offsets.value(i), length: lengths.value(i), - checksum: None, + checksum: checksum.clone(), }; Ok((ChunkIndices(indices), vref)) }) @@ -906,15 +922,20 @@ impl Store { /// /// Used by `set_virtual_refs_from_arrow` to compute chunk coordinates /// from the flat position in the Arrow arrays. +/// +/// If `offset` is provided, it is added element-wise to the computed indices. 
#[cfg(feature = "arrow")] -fn flat_to_nd_indices(flat: usize, shape: &[u32]) -> Vec<u32> { - let mut indices = vec![0u32; shape.len()]; - let mut remaining = flat; - for (i, &dim) in shape.iter().enumerate().rev() { - indices[i] = (remaining % dim as usize) as u32; - remaining /= dim as usize; - } - indices +fn flat_to_nd_indices(flat: usize, shape: &[u32], offset: Option<&[u32]>) -> Vec<u32> { + let total: usize = shape.iter().map(|&d| d as usize).product(); + shape + .iter() + .enumerate() + .scan(total, |stride, (i, &dim)| { + *stride /= dim as usize; + let idx = ((flat / *stride) % dim as usize) as u32; + Some(idx + offset.map_or(0, |o| o[i])) + }) + .collect() } async fn set_array_meta( From ee5e07e352ef877e11b4026094339db9e4969f79 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Sun, 25 Jan 2026 11:42:16 -0600 Subject: [PATCH 06/11] add todo --- icechunk/src/store.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/icechunk/src/store.rs b/icechunk/src/store.rs index fe8de58a7..46c5465ae 100644 --- a/icechunk/src/store.rs +++ b/icechunk/src/store.rs @@ -926,6 +926,7 @@ impl Store { /// If `offset` is provided, it is added element-wise to the computed indices. #[cfg(feature = "arrow")] fn flat_to_nd_indices(flat: usize, shape: &[u32], offset: Option<&[u32]>) -> Vec<u32> { + // TODO: check that this vibe-coded function makes any sense at all...
let total: usize = shape.iter().map(|&d| d as usize).product(); shape .iter() From 33f213f5b89e8496f243e45ae68249588519299f Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 26 Jan 2026 12:28:23 -0600 Subject: [PATCH 07/11] use set_node_chunk_ref --- icechunk/src/session.rs | 18 +++++++++--------- icechunk/src/store.rs | 6 ++++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index cf4955a8d..6ee2b74fb 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -622,7 +622,7 @@ impl Session { ) -> SessionResult<()> { let node = self.get_array(node_path).await?; for coord in coords { - self.set_node_chunk_ref(node.clone(), coord, None).await? + self.set_node_chunk_ref(&node, coord, None).await? } Ok(()) } @@ -638,7 +638,7 @@ impl Session { data: Option, ) -> SessionResult<()> { let node_snapshot = self.get_array(&path).await?; - self.set_node_chunk_ref(node_snapshot, coord, data).await + self.set_node_chunk_ref(&node_snapshot, coord, data).await } pub fn lookup_splits(&self, node_id: &NodeId) -> Option<&ManifestSplits> { @@ -690,21 +690,21 @@ impl Session { }) } - // Helper function that accepts a NodeSnapshot instead of a path, - // this lets us do bulk sets (and deletes) without repeatedly grabbing the node. + /// Helper function that accepts a NodeSnapshot instead of a path, + /// this lets us do bulk sets (and deletes) without repeatedly grabbing the node. #[instrument(skip(self))] - async fn set_node_chunk_ref( + pub async fn set_node_chunk_ref( &mut self, - node: NodeSnapshot, + node: &NodeSnapshot, coord: ChunkIndices, data: Option, ) -> SessionResult<()> { - if let NodeData::Array { shape, dimension_names, .. } = node.node_data { + if let NodeData::Array { shape, dimension_names, .. 
} = &node.node_data { if shape.valid_chunk_coord(&coord) { let splits = self - .get_splits(&node.id, &node.path, &shape, &dimension_names) + .get_splits(&node.id, &node.path, shape, dimension_names) .clone(); - self.change_set_mut()?.set_chunk_ref(node.id, coord, data, &splits)?; + self.change_set_mut()?.set_chunk_ref(node.id.clone(), coord, data, &splits)?; Ok(()) } else { Err(SessionErrorKind::InvalidIndex { diff --git a/icechunk/src/store.rs b/icechunk/src/store.rs index 46c5465ae..26ac92dbf 100644 --- a/icechunk/src/store.rs +++ b/icechunk/src/store.rs @@ -414,6 +414,8 @@ impl Store { } let mut session = self.session.write().await; + // Look up the node once, not for every chunk + let node = session.get_array(array_path).await?; let mut failed = Vec::new(); for (index, reference) in references.into_iter() { if validate_container @@ -422,8 +424,8 @@ impl Store { failed.push(index); } else { session - .set_chunk_ref( - array_path.clone(), + .set_node_chunk_ref( + &node, index, Some(ChunkPayload::Virtual(reference)), ) From f02dba099490ac7e7b5f20c87a317f2c021825f5 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 26 Jan 2026 12:35:31 -0600 Subject: [PATCH 08/11] Optimize set_virtual_refs with bulk operations - Add set_chunk_refs bulk method to ChangeSet that does one hash lookup for all chunks instead of one per chunk - Add set_node_chunk_refs bulk method to Session - Change set_node_chunk_ref to take &NodeSnapshot instead of owned - Update set_virtual_refs to use bulk method - Make set_node_chunk_ref public for external use This reduces the complexity from O(n) hash lookups to O(1) for setting n chunk refs to the same array. 
Co-Authored-By: Claude Opus 4.5 --- icechunk/src/change_set.rs | 48 ++++++++++++++++++++++++++++++++++++++ icechunk/src/session.rs | 26 +++++++++++++++++++++ icechunk/src/store.rs | 35 ++++++++++++++------------- 3 files changed, 93 insertions(+), 16 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index 58994534f..c0f010104 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -494,6 +494,54 @@ impl ChangeSet { Ok(()) } + /// Bulk version of set_chunk_ref - sets multiple chunk refs for the same node. + /// More efficient than calling set_chunk_ref repeatedly as it only does one hash lookup. + pub fn set_chunk_refs( + &mut self, + node_id: NodeId, + chunks: I, + splits: &ManifestSplits, + ) -> SessionResult<()> + where + I: IntoIterator)>, + { + let edits = self.edits_mut()?; + + let node_chunks = edits + .set_chunks + .entry(node_id) + .or_insert_with(|| { + HashMap::< + ManifestExtents, + BTreeMap>, + >::with_capacity(splits.len()) + }); + + for (coord, data) in chunks { + #[allow(clippy::expect_used)] + let extent = splits.find(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); + + let old = node_chunks + .entry(extent.clone()) + .or_default() + .insert(coord, data); + + if old.is_none() { + edits.num_chunks += 1; + } + } + + if edits.num_chunks > NUM_CHUNKS_LIMIT && !edits.excessive_num_chunks_warned { + warn!( + "There are more than {NUM_CHUNKS_LIMIT} chunk references being loaded into this commit. This is close to the maximum number of chunk modifications Icechunk supports in a single commit, we recommend to split into smaller commits." 
+ ); + + edits.excessive_num_chunks_warned = true; + } + + Ok(()) + } + pub fn get_chunk_ref( &self, node_id: &NodeId, diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index 6ee2b74fb..e664c5409 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -722,6 +722,32 @@ impl Session { } } + /// Bulk version - sets multiple chunk refs for a single node. + /// More efficient than calling set_node_chunk_ref repeatedly. + #[instrument(skip(self, chunks))] + pub fn set_node_chunk_refs( + &mut self, + node: &NodeSnapshot, + chunks: I, + ) -> SessionResult<()> + where + I: IntoIterator)>, + { + if let NodeData::Array { shape, dimension_names, .. } = &node.node_data { + let splits = self + .get_splits(&node.id, &node.path, shape, dimension_names) + .clone(); + self.change_set_mut()?.set_chunk_refs(node.id.clone(), chunks, &splits)?; + Ok(()) + } else { + Err(SessionErrorKind::NotAnArray { + node: Box::new(node.clone()), + message: "setting chunk refs".to_string(), + } + .into()) + } + } + #[instrument(skip(self))] pub async fn get_closest_ancestor_node( &self, diff --git a/icechunk/src/store.rs b/icechunk/src/store.rs index 26ac92dbf..ef888373a 100644 --- a/icechunk/src/store.rs +++ b/icechunk/src/store.rs @@ -414,24 +414,27 @@ impl Store { } let mut session = self.session.write().await; - // Look up the node once, not for every chunk let node = session.get_array(array_path).await?; + + // Separate valid refs from failed ones let mut failed = Vec::new(); - for (index, reference) in references.into_iter() { - if validate_container - && session.matching_container(&reference.location).is_none() - { - failed.push(index); - } else { - session - .set_node_chunk_ref( - &node, - index, - Some(ChunkPayload::Virtual(reference)), - ) - .await?; - } - } + let valid_refs: Vec<_> = references + .into_iter() + .filter_map(|(index, reference)| { + if validate_container + && session.matching_container(&reference.location).is_none() + { + failed.push(index); + None 
+ } else { + Some((index, Some(ChunkPayload::Virtual(reference)))) + } + }) + .collect(); + + // Use bulk method - one hash lookup for all refs + session.set_node_chunk_refs(&node, valid_refs)?; + if failed.is_empty() { Ok(SetVirtualRefsResult::Done) } else { From aa84a069d19cfd51c9c0761f3bdee40332f65c82 Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 26 Jan 2026 14:50:33 -0600 Subject: [PATCH 09/11] fix bug with writing empty refs --- icechunk/src/store.rs | 69 ++++++++++++++++++++++++++++- icechunk/tests/test_virtual_refs.rs | 2 + 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/icechunk/src/store.rs b/icechunk/src/store.rs index ef888373a..f08b10e5f 100644 --- a/icechunk/src/store.rs +++ b/icechunk/src/store.rs @@ -503,8 +503,9 @@ impl Store { } } - // Build refs by iterating Arrow arrays + // Build refs by iterating Arrow arrays, skipping empty paths let refs_iter: Vec<(ChunkIndices, VirtualChunkRef)> = (0..n_chunks) + .filter(|&i| !locations.value(i).is_empty()) .map(|i| { let indices = flat_to_nd_indices(i, chunk_grid_shape, arr_offset); let location = VirtualChunkLocation::from_absolute_path(locations.value(i)) @@ -2530,4 +2531,70 @@ mod tests { Bytes::copy_from_slice(br#"{"zarr_format":3,"node_type":"group"}"#) ); } + + #[cfg(feature = "arrow")] + #[tokio::test] + async fn test_set_virtual_refs_from_arrow_skips_empty_paths() { + use arrow_array::{StringArray, UInt64Array}; + + let repo = create_memory_store_repository().await; + let session = Arc::new(RwLock::new(repo.writable_session("main").await.unwrap())); + let store = Store::from_session(Arc::clone(&session)).await; + + // Create a group and array + store + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3,"node_type":"group"}"#), + ) + .await + .unwrap(); + + store + .set( + "array/zarr.json", + Bytes::copy_from_slice( + 
br#"{"zarr_format":3,"node_type":"array","shape":[3],"data_type":"int32","chunk_grid":{"name":"regular","configuration":{"chunk_shape":[1]}},"chunk_key_encoding":{"name":"default","configuration":{"separator":"/"}},"fill_value":0,"codecs":[{"name":"bytes","configuration":{"endian":"little"}}]}"#, + ), + ) + .await + .unwrap(); + + // Create Arrow arrays with one empty path (should be skipped) + let locations = StringArray::from(vec!["s3://bucket/file1.dat", "", "s3://bucket/file3.dat"]); + let offsets = UInt64Array::from(vec![0, 100, 200]); + let lengths = UInt64Array::from(vec![50, 50, 50]); + + let result = store + .set_virtual_refs_from_arrow( + &"/array".try_into().unwrap(), + &[3], + &locations, + &offsets, + &lengths, + None, + None, + false, + ) + .await + .unwrap(); + + // Should succeed - empty path at index 1 should be skipped + assert!(matches!(result, SetVirtualRefsResult::Done)); + + // Verify only 2 chunks were written (indices 0 and 2, not 1) + let keys: Vec = store + .list_prefix("array") + .await + .unwrap() + .try_collect() + .await + .unwrap(); + // Should have zarr.json + 2 chunks (c/0 and c/2) + assert_eq!(keys.len(), 3); + assert!(keys.contains(&"array/zarr.json".to_string())); + assert!(keys.contains(&"array/c/0".to_string())); + assert!(keys.contains(&"array/c/2".to_string())); + assert!(!keys.contains(&"array/c/1".to_string())); + } } diff --git a/icechunk/tests/test_virtual_refs.rs b/icechunk/tests/test_virtual_refs.rs index 5b3b1f0ed..b83d9a10a 100644 --- a/icechunk/tests/test_virtual_refs.rs +++ b/icechunk/tests/test_virtual_refs.rs @@ -1139,6 +1139,8 @@ async fn test_set_virtual_refs_from_arrow() -> Result<(), Box Date: Mon, 26 Jan 2026 23:15:49 -0600 Subject: [PATCH 10/11] deal with empty chunks being represented by nulls --- icechunk/src/store.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/icechunk/src/store.rs b/icechunk/src/store.rs index f08b10e5f..a2a791772 100644 --- a/icechunk/src/store.rs 
+++ b/icechunk/src/store.rs @@ -503,9 +503,9 @@ impl Store { } } - // Build refs by iterating Arrow arrays, skipping empty paths + // Build refs by iterating Arrow arrays, skipping null entries (missing chunks) let refs_iter: Vec<(ChunkIndices, VirtualChunkRef)> = (0..n_chunks) - .filter(|&i| !locations.value(i).is_empty()) + .filter(|&i| !locations.is_null(i)) .map(|i| { let indices = flat_to_nd_indices(i, chunk_grid_shape, arr_offset); let location = VirtualChunkLocation::from_absolute_path(locations.value(i)) @@ -2534,7 +2534,7 @@ mod tests { #[cfg(feature = "arrow")] #[tokio::test] - async fn test_set_virtual_refs_from_arrow_skips_empty_paths() { + async fn test_set_virtual_refs_from_arrow_skips_nulls() { use arrow_array::{StringArray, UInt64Array}; let repo = create_memory_store_repository().await; @@ -2560,10 +2560,10 @@ mod tests { .await .unwrap(); - // Create Arrow arrays with one empty path (should be skipped) - let locations = StringArray::from(vec!["s3://bucket/file1.dat", "", "s3://bucket/file3.dat"]); - let offsets = UInt64Array::from(vec![0, 100, 200]); - let lengths = UInt64Array::from(vec![50, 50, 50]); + // Create Arrow arrays with one null entry (should be skipped) + let locations = StringArray::from(vec![Some("s3://bucket/file1.dat"), None, Some("s3://bucket/file3.dat")]); + let offsets = UInt64Array::from(vec![Some(0), None, Some(200)]); + let lengths = UInt64Array::from(vec![Some(50), None, Some(50)]); let result = store .set_virtual_refs_from_arrow( @@ -2579,7 +2579,7 @@ mod tests { .await .unwrap(); - // Should succeed - empty path at index 1 should be skipped + // Should succeed - null entry at index 1 should be skipped assert!(matches!(result, SetVirtualRefsResult::Done)); // Verify only 2 chunks were written (indices 0 and 2, not 1) From eb30703485daf4b67fc04e846744b08d6ba6563f Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Tue, 10 Mar 2026 13:57:02 -0400 Subject: [PATCH 11/11] Fix stale APIs after merge from main - change_set.rs: 
remove ManifestSplits/ManifestExtents from set_chunk_refs (splits removed in main) - session.rs: fix borrow issues in set_node_chunk_ref, remove get_splits from set_node_chunk_refs - store.rs: use from_url instead of private from_absolute_path Co-Authored-By: Claude Opus 4.6 --- icechunk/src/change_set.rs | 16 ++-------------- icechunk/src/session.rs | 11 ++++------- icechunk/src/store.rs | 2 +- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs index b9d62ccc8..63cee1ad0 100644 --- a/icechunk/src/change_set.rs +++ b/icechunk/src/change_set.rs @@ -463,7 +463,6 @@ impl ChangeSet { &mut self, node_id: NodeId, chunks: I, - splits: &ManifestSplits, ) -> SessionResult<()> where I: IntoIterator)>, @@ -473,21 +472,10 @@ impl ChangeSet { let node_chunks = edits .set_chunks .entry(node_id) - .or_insert_with(|| { - HashMap::< - ManifestExtents, - BTreeMap>, - >::with_capacity(splits.len()) - }); + .or_default(); for (coord, data) in chunks { - #[allow(clippy::expect_used)] - let extent = splits.find(&coord).expect("logic bug. Trying to set chunk ref but can't find the appropriate split manifest."); - - let old = node_chunks - .entry(extent.clone()) - .or_default() - .insert(coord, data); + let old = node_chunks.insert(coord, data); if old.is_none() { edits.num_chunks += 1; diff --git a/icechunk/src/session.rs b/icechunk/src/session.rs index eb93431b6..ea91acb3f 100644 --- a/icechunk/src/session.rs +++ b/icechunk/src/session.rs @@ -732,9 +732,9 @@ impl Session { coord: ChunkIndices, data: Option, ) -> SessionResult<()> { - if let NodeData::Array { shape, .. } = node.node_data { + if let NodeData::Array { ref shape, .. 
} = node.node_data { if shape.valid_chunk_coord(&coord) { - self.change_set_mut()?.set_chunk_ref(node.id, coord, data)?; + self.change_set_mut()?.set_chunk_ref(node.id.clone(), coord, data)?; Ok(()) } else { Err(SessionErrorKind::InvalidIndex { @@ -763,11 +763,8 @@ impl Session { where I: IntoIterator)>, { - if let NodeData::Array { shape, dimension_names, .. } = &node.node_data { - let splits = self - .get_splits(&node.id, &node.path, shape, dimension_names) - .clone(); - self.change_set_mut()?.set_chunk_refs(node.id.clone(), chunks, &splits)?; + if let NodeData::Array { .. } = &node.node_data { + self.change_set_mut()?.set_chunk_refs(node.id.clone(), chunks)?; Ok(()) } else { Err(SessionErrorKind::NotAnArray { diff --git a/icechunk/src/store.rs b/icechunk/src/store.rs index 02e462c94..bbe87bd05 100644 --- a/icechunk/src/store.rs +++ b/icechunk/src/store.rs @@ -517,7 +517,7 @@ impl Store { .filter(|&i| !locations.is_null(i)) .map(|i| { let indices = flat_to_nd_indices(i, chunk_grid_shape, arr_offset); - let location = VirtualChunkLocation::from_absolute_path(locations.value(i)) + let location = VirtualChunkLocation::from_url(locations.value(i)) .map_err(|e| StoreErrorKind::Other(e.to_string()))?; let vref = VirtualChunkRef { location,