From 8a89fb8922de59925141c6069cd5cda23e248703 Mon Sep 17 00:00:00 2001 From: Yonghye Kwon Date: Sun, 15 Mar 2026 23:41:34 +0900 Subject: [PATCH 1/2] perf: optimize CPU deform_conv2d forward pass Three changes to the CPU deformable convolution forward kernel: 1. Replace at::zeros with at::empty for columns and out_buf buffers. The deformable_im2col_kernel writes every element of the columns buffer, and out_buf is fully written by addmm_, so zero-initialization is wasted work. 2. Use addmm_ with beta=0 instead of the default beta=1. This avoids accumulating into uninitialized memory while preserving in-place operation (no extra allocation unlike at::mm). 3. Parallelize deformable_im2col_kernel with at::parallel_for. The im2col loop was the only single-threaded phase in the forward pass (GEMM is already parallelized by BLAS). Each loop iteration writes to a non-overlapping region of the columns buffer, so parallelization is safe. Benchmark results on Apple M2 (CPU, float32): Config Before (ms) After (ms) Change small-b1 9.76 2.44 -75% small-b8 91.77 33.88 -63% medium-b1 216.70 75.80 -65% medium-b8 1152.09 650.00 -44% large-b1 348.86 302.70 -13% large-b4 1342.75 1289.96 -4% Signed-off-by: Yonghye Kwon Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Yonghye Kwon --- .../csrc/ops/cpu/deform_conv2d_kernel.cpp | 100 +++++++++--------- 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp b/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp index f89e6cc3030..cd4b21a32b2 100644 --- a/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp +++ b/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp @@ -68,6 +68,7 @@ // https://github.com/open-mmlab/mmdetection/blob/master/mmdet/ops/dcn/src/deform_conv_cuda.cpp #include +#include #include namespace vision { @@ -139,58 +140,60 @@ void deformable_im2col_kernel( int out_w, bool use_mask, scalar_t* columns) { - for (int index = 0; index != n; ++index) { - const int out_x = index % out_w; - const int out_y = (index / out_w) % out_h; - const int out_b = (index / (out_w * out_h)) % batch_sz; - const int in_c = index / (out_w * out_h * batch_sz); - const int out_c = in_c * weight_h * weight_w; + at::parallel_for(0, n, 0, [&](int64_t begin, int64_t end) { + for (int64_t index = begin; index != end; ++index) { + const int out_x = index % out_w; + const int out_y = (index / out_w) % out_h; + const int out_b = (index / (out_w * out_h)) % batch_sz; + const int in_c = index / (out_w * out_h * batch_sz); + const int out_c = in_c * weight_h * weight_w; - int c_per_offset_grp = n_in_channels / n_offset_grps; - const int grp_idx = in_c / c_per_offset_grp; + int c_per_offset_grp = n_in_channels / n_offset_grps; + const int grp_idx = in_c / c_per_offset_grp; - auto columns_ptr = columns + - (out_c * (batch_sz * out_h * out_w) + out_b * (out_h * out_w) + - out_y * out_w + out_x); + auto columns_ptr = columns + + (out_c * (batch_sz * out_h * out_w) + out_b * (out_h * out_w) + + out_y * out_w + out_x); - auto input_ptr = input + - (out_b * (n_in_channels * height * width) + in_c * (height * width)); + auto input_ptr = input + + (out_b * (n_in_channels * height * width) + in_c * (height * width)); - auto offset_ptr = offset + - (out_b * n_offset_grps + grp_idx) * 2 * weight_h * weight_w * out_h * - out_w; + auto offset_ptr = offset + + (out_b * n_offset_grps + grp_idx) * 2 * weight_h * weight_w * out_h * + out_w; - auto mask_ptr = mask; - if (use_mask) { - mask_ptr += (out_b * n_offset_grps + grp_idx) * weight_h * weight_w * - out_h * out_w; - } - - for (int i = 0; i < weight_h; ++i) { - for (int j = 0; j < weight_w; ++j) { - const int mask_idx = i * weight_w + j; - const int offset_idx = 2 * mask_idx; + auto mask_ptr = mask; + if (use_mask) { + mask_ptr += (out_b * n_offset_grps + grp_idx) * weight_h * weight_w * + out_h * out_w; + } - scalar_t mask_value = 1; - if (use_mask) { - mask_value = - mask_ptr[mask_idx * (out_h * out_w) + out_y * out_w + out_x]; + for (int i = 0; i < weight_h; ++i) { + for (int j = 0; j < weight_w; ++j) { + const int mask_idx = i * weight_w + j; + const int offset_idx = 2 * mask_idx; + + scalar_t mask_value = 1; + if (use_mask) { + mask_value = + mask_ptr[mask_idx * (out_h * out_w) + out_y * out_w + out_x]; + } + + const scalar_t offset_h = + offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x]; + const scalar_t offset_w = offset_ptr + [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x]; + const scalar_t y = + (out_y * stride_h - pad_h) + i * dilation_h + offset_h; + const scalar_t x = + (out_x * stride_w - pad_w) + j * dilation_w + offset_w; + *columns_ptr = + mask_value * bilinear_interpolate(input_ptr, height, width, y, x); + columns_ptr += batch_sz * out_h * out_w; } - - const scalar_t offset_h = - offset_ptr[offset_idx * (out_h * out_w) + out_y * out_w + out_x]; - const scalar_t offset_w = offset_ptr - [(offset_idx + 1) * (out_h * out_w) + out_y * out_w + out_x]; - const scalar_t y = - (out_y * stride_h - pad_h) + i * dilation_h + offset_h; - const scalar_t x = - (out_x * stride_w - pad_w) + j * dilation_w + offset_w; - *columns_ptr = - mask_value * bilinear_interpolate(input_ptr, height, width, y, x); - columns_ptr += batch_sz * out_h * out_w; } } - } + }); } void deformable_im2col( @@ -1013,7 +1016,7 @@ at::Tensor deform_conv2d_forward_kernel( out_w}); } - at::Tensor out_buf = at::zeros( + at::Tensor out_buf = at::empty( {batch_sz / n_parallel_imgs, out_channels, n_parallel_imgs * out_h, @@ -1035,7 +1038,7 @@ at::Tensor deform_conv2d_forward_kernel( weight_c.size(3)}); // Sample points and perform convolution - auto columns = at::zeros( + auto columns = at::empty( {n_in_channels * weight_h * weight_w, n_parallel_imgs * out_h * out_w}, input_c.options()); for (int b = 0; b < batch_sz / n_parallel_imgs; b++) { @@ -1064,10 +1067,9 @@ at::Tensor deform_conv2d_forward_kernel( columns = columns.view( {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); for (int g = 0; g < n_weight_grps; g++) { - out_buf[b][g] = out_buf[b][g] - .flatten(1) - .addmm_(weight_c[g].flatten(1), columns[g]) - .view_as(out_buf[b][g]); + out_buf[b][g] + .flatten(1) + .addmm_(weight_c[g].flatten(1), columns[g], 0, 1); } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)}); From 137d2b79afd04944fcebd52c87d85b3aad2a1f93 Mon Sep 17 00:00:00 2001 From: Yonghye Kwon Date: Tue, 17 Mar 2026 12:41:11 +0900 Subject: [PATCH 2/2] style: fix clang-format lint for method chain in deform_conv2d_kernel Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Yonghye Kwon --- torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp b/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp index cd4b21a32b2..b6389189b80 100644 --- a/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp +++ b/torchvision/csrc/ops/cpu/deform_conv2d_kernel.cpp @@ -1067,9 +1067,7 @@ at::Tensor deform_conv2d_forward_kernel( columns = columns.view( {n_weight_grps, columns.size(0) / n_weight_grps, columns.size(1)}); for (int g = 0; g < n_weight_grps; g++) { - out_buf[b][g] - .flatten(1) - .addmm_(weight_c[g].flatten(1), columns[g], 0, 1); + out_buf[b][g].flatten(1).addmm_(weight_c[g].flatten(1), columns[g], 0, 1); } columns = columns.view({columns.size(0) * columns.size(1), columns.size(2)});