Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 120 additions & 36 deletions src/imageops/sample.rs
Original file line number Diff line number Diff line change
Expand Up @@ -507,53 +507,137 @@ where

let mut out = ImageBuffer::new(width, new_height);
out.copy_color_space_from(&image.buffer_with_dimensions(0, 0));
let mut ws = Vec::new();

let ratio = height as f32 / new_height as f32;
let sratio = if ratio < 1.0 { 1.0 } else { ratio };
let src_support = filter.support * sratio;

for outy in 0..new_height {
// For an explanation of this algorithm, see the comments
// in horizontal_sample.
let inputy = (outy as f32 + 0.5) * ratio;

let left = (inputy - src_support).floor() as i64;
let left = clamp(left, 0, <i64 as From<_>>::from(height) - 1) as u32;

let right = (inputy + src_support).ceil() as i64;
let right = clamp(
right,
<i64 as From<_>>::from(left) + 1,
<i64 as From<_>>::from(height),
) as u32;

let inputy = inputy - 0.5;

ws.clear();
let mut sum = 0.0;
for i in left..right {
let w = (filter.kernel)((i as f32 - inputy) / sratio);
ws.push(w);
sum += w;
}
for w in ws.iter_mut() {
*w /= sum;
}
let row_count = new_height as usize;
let max_ks = (2.0 * src_support).ceil() as usize + 2;

for x in 0..width {
let mut pix = crate::Rgba([1.0; 4]);
// Max memory usage for weights
const MAX_WEIGHT_FLOATS: usize = 1 << 20; // 4MiB f32

for (i, w) in ws.iter().enumerate() {
let p = image.get_pixel(x, left + i as u32);
// Number of output rows whose weights fit in the budget
let batch_size = (MAX_WEIGHT_FLOATS / max_ks.max(1)).max(1).min(row_count);

for (tc, &c) in pix.channels_mut().iter_mut().zip(p.channels()) {
*tc += <f32 as NumCast>::from(c).unwrap() * w;
}
let src_stride = width as usize * MAX_CHANNEL;

// Sliding window cache for pre-converted source rows.
// Disable cache when the filter is trivial (e.g. Nearest)
// or the image is too wide for the memory budget.
let src_row_budget = MAX_WEIGHT_FLOATS / src_stride.max(1);
let cache_capacity = if max_ks > sratio as usize && src_row_budget > max_ks {
max_ks + 1
} else {
0
};
let mut cache_buf = vec![0.0f32; cache_capacity * src_stride];
let mut cache_base: usize = 0; // source-row index of the oldest cached slot
let mut cache_size: usize = 0; // number of rows currently valid in the cache

let out_stride = width as usize * MAX_CHANNEL;
let out_raw = out.as_mut();

// Reusable weight buffers shared across batches
let mut batch_ws: Vec<f32> = Vec::new();
let mut batch_lefts: Vec<usize> =
vec_try_with_capacity(batch_size).expect("capacity overflow in vertical_sample");
let mut batch_starts: Vec<usize> =
vec_try_with_capacity(batch_size + 1).expect("capacity overflow in vertical_sample");

let mut batch_start = 0;
while batch_start < row_count {
let batch_end = batch_start.saturating_add(batch_size).min(row_count);

// Precompute weights for every output row in this batch.
batch_ws.clear();
batch_lefts.clear();
batch_starts.clear();
batch_starts.push(0);

for outy in batch_start..batch_end {
// Use f64 to avoid precision loss for large dimensions
let inputy = (outy as f64 + 0.5) * ratio as f64;

// Invariant: 0 <= left < right <= height
let left = clamp((inputy - src_support as f64) as u32, 0, height - 1);

let right = (inputy + src_support as f64).ceil() as i64;
let right = clamp(right, left as i64 + 1, height as i64) as u32;

// Go back to left boundary of pixel, to properly compare with i
// below, as the kernel treats the centre of a pixel as 0.
let inputy = inputy as f32 - 0.5;

batch_lefts.push(left as usize);
let ws_start = batch_ws.len();
let mut sum = 0.0;
for i in left..right {
let w = (filter.kernel)((i as f32 - inputy) / sratio);
batch_ws.push(w);
sum += w;
}
for w in batch_ws[ws_start..].iter_mut() {
*w /= sum;
}
batch_starts.push(batch_ws.len());
}

out.put_pixel(x, outy, pix);
for (b, outy) in (batch_start..batch_end).enumerate() {
let left = batch_lefts[b];
let ws = &batch_ws[batch_starts[b]..batch_starts[b + 1]];
let out_row = &mut out_raw[outy * out_stride..(outy + 1) * out_stride];

out_row.fill(0.0);

if cache_capacity > 0 {
// Sliding window: evict rows before `left`, then fill forward
// until all taps [left, right) are cached.
let right = left + ws.len();
let evict = left.saturating_sub(cache_base).min(cache_size);
cache_base += evict;
cache_size -= evict;

while cache_base + cache_size < right {
let row = cache_base + cache_size;
let slot = row % cache_capacity;
let row_buf = &mut cache_buf[slot * src_stride..(slot + 1) * src_stride];
for x in 0..width {
let p = image.get_pixel(x, row as u32);
let base = x as usize * MAX_CHANNEL;
for (j, &c) in p.channels().iter().enumerate() {
row_buf[base + j] = <f32 as NumCast>::from(c).unwrap();
}
}
cache_size += 1;
}

// All taps guaranteed cached — direct access, no miss check.
for (k, &w) in ws.iter().enumerate() {
let src_row = left + k;
let slot = src_row % cache_capacity;
let src = &cache_buf[slot * src_stride..(slot + 1) * src_stride];
for (d, &s) in out_row.iter_mut().zip(src.iter()) {
*d += s * w;
}
}
} else {
// No cache (e.g. Nearest): convert source rows on the fly.
for (k, &w) in ws.iter().enumerate() {
let src_row = (left + k) as u32;
for x in 0..width {
let p = image.get_pixel(x, src_row);
let base = x as usize * MAX_CHANNEL;
for (j, &c) in p.channels().iter().enumerate() {
out_row[base + j] += <f32 as NumCast>::from(c).unwrap() * w;
}
}
}
}
}

batch_start = batch_end;
}

out
Expand Down
Loading