diff --git a/src/imageops/sample.rs b/src/imageops/sample.rs index 3e95dd68e6..2cda00fb8d 100644 --- a/src/imageops/sample.rs +++ b/src/imageops/sample.rs @@ -507,53 +507,137 @@ where let mut out = ImageBuffer::new(width, new_height); out.copy_color_space_from(&image.buffer_with_dimensions(0, 0)); - let mut ws = Vec::new(); let ratio = height as f32 / new_height as f32; let sratio = if ratio < 1.0 { 1.0 } else { ratio }; let src_support = filter.support * sratio; - for outy in 0..new_height { - // For an explanation of this algorithm, see the comments - // in horizontal_sample. - let inputy = (outy as f32 + 0.5) * ratio; - - let left = (inputy - src_support).floor() as i64; - let left = clamp(left, 0, >::from(height) - 1) as u32; - - let right = (inputy + src_support).ceil() as i64; - let right = clamp( - right, - >::from(left) + 1, - >::from(height), - ) as u32; - - let inputy = inputy - 0.5; - - ws.clear(); - let mut sum = 0.0; - for i in left..right { - let w = (filter.kernel)((i as f32 - inputy) / sratio); - ws.push(w); - sum += w; - } - for w in ws.iter_mut() { - *w /= sum; - } + let row_count = new_height as usize; + let max_ks = (2.0 * src_support).ceil() as usize + 2; - for x in 0..width { - let mut pix = crate::Rgba([1.0; 4]); + // Max memory usage for weights + const MAX_WEIGHT_FLOATS: usize = 1 << 20; // 4MiB f32 - for (i, w) in ws.iter().enumerate() { - let p = image.get_pixel(x, left + i as u32); + // Number of output rows whose weights fit in the budget + let batch_size = (MAX_WEIGHT_FLOATS / max_ks.max(1)).max(1).min(row_count); - for (tc, &c) in pix.channels_mut().iter_mut().zip(p.channels()) { - *tc += ::from(c).unwrap() * w; - } + let src_stride = width as usize * MAX_CHANNEL; + + // Sliding window cache for pre-converted source rows. + // Disable cache when the filter is trivial (e.g. Nearest) + // or the image is too wide for the memory budget. + let src_row_budget = MAX_WEIGHT_FLOATS / src_stride.max(1); + let cache_capacity = if max_ks > sratio as usize && src_row_budget > max_ks { + max_ks + 1 + } else { + 0 + }; + let mut cache_buf = vec![0.0f32; cache_capacity * src_stride]; + let mut cache_base: usize = 0; // source-row index of the oldest cached slot + let mut cache_size: usize = 0; // number of rows currently valid in the cache + + let out_stride = width as usize * MAX_CHANNEL; + let out_raw = out.as_mut(); + + // Reusable weight buffers shared across batches + let mut batch_ws: Vec = Vec::new(); + let mut batch_lefts: Vec = + vec_try_with_capacity(batch_size).expect("capacity overflow in vertical_sample"); + let mut batch_starts: Vec = + vec_try_with_capacity(batch_size + 1).expect("capacity overflow in vertical_sample"); + + let mut batch_start = 0; + while batch_start < row_count { + let batch_end = batch_start.saturating_add(batch_size).min(row_count); + + // Precompute weights for every output row in this batch. + batch_ws.clear(); + batch_lefts.clear(); + batch_starts.clear(); + batch_starts.push(0); + + for outy in batch_start..batch_end { + // Use f64 to avoid precision loss for large dimensions + let inputy = (outy as f64 + 0.5) * ratio as f64; + + // Invariant: 0 <= left < right <= height + let left = clamp((inputy - src_support as f64) as u32, 0, height - 1); + + let right = (inputy + src_support as f64).ceil() as i64; + let right = clamp(right, left as i64 + 1, height as i64) as u32; + + // Go back to left boundary of pixel, to properly compare with i + // below, as the kernel treats the centre of a pixel as 0. + let inputy = inputy as f32 - 0.5; + + batch_lefts.push(left as usize); + let ws_start = batch_ws.len(); + let mut sum = 0.0; + for i in left..right { + let w = (filter.kernel)((i as f32 - inputy) / sratio); + batch_ws.push(w); + sum += w; + } + for w in batch_ws[ws_start..].iter_mut() { + *w /= sum; } + batch_starts.push(batch_ws.len()); + } - out.put_pixel(x, outy, pix); + for (b, outy) in (batch_start..batch_end).enumerate() { + let left = batch_lefts[b]; + let ws = &batch_ws[batch_starts[b]..batch_starts[b + 1]]; + let out_row = &mut out_raw[outy * out_stride..(outy + 1) * out_stride]; + + out_row.fill(0.0); + + if cache_capacity > 0 { + // Sliding window: evict rows before `left`, then fill forward + // until all taps [left, right) are cached. + let right = left + ws.len(); + let evict = left.saturating_sub(cache_base).min(cache_size); + cache_base += evict; + cache_size -= evict; + + while cache_base + cache_size < right { + let row = cache_base + cache_size; + let slot = row % cache_capacity; + let row_buf = &mut cache_buf[slot * src_stride..(slot + 1) * src_stride]; + for x in 0..width { + let p = image.get_pixel(x, row as u32); + let base = x as usize * MAX_CHANNEL; + for (j, &c) in p.channels().iter().enumerate() { + row_buf[base + j] = ::from(c).unwrap(); + } + } + cache_size += 1; + } + + // All taps guaranteed cached — direct access, no miss check. + for (k, &w) in ws.iter().enumerate() { + let src_row = left + k; + let slot = src_row % cache_capacity; + let src = &cache_buf[slot * src_stride..(slot + 1) * src_stride]; + for (d, &s) in out_row.iter_mut().zip(src.iter()) { + *d += s * w; + } + } + } else { + // No cache (e.g. Nearest): convert source rows on the fly. + for (k, &w) in ws.iter().enumerate() { + let src_row = (left + k) as u32; + for x in 0..width { + let p = image.get_pixel(x, src_row); + let base = x as usize * MAX_CHANNEL; + for (j, &c) in p.channels().iter().enumerate() { + out_row[base + j] += ::from(c).unwrap() * w; + } + } + } + } } + + batch_start = batch_end; } out