diff --git a/.gitignore b/.gitignore index c44a8393..77f2eee9 100644 --- a/.gitignore +++ b/.gitignore @@ -47,3 +47,5 @@ oxford # Downloaded archives for tests. *.tgz + +build-hip/ diff --git a/CMakeLists.txt b/CMakeLists.txt index a6a7ad64..07bda8ac 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,26 +1,38 @@ # First-order language CUDA requires at least CMake 3.18 cmake_minimum_required(VERSION 3.24) -# Default seletion of CUDA Compute Capabilities. -# This must be called before project() or cmake sets it to the oldest non-deprecated CC -# "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson. -if(EXISTS "/etc/nv_tegra_release") - # The CC list for Tegras and Jetson will require manual updates - set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87" - CACHE - STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons") -else() - # The CC list for discrete GPUs will require CMake updates - set(CMAKE_CUDA_ARCHITECTURES "all-major" - CACHE - STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons") +# USE_HIP routes the .cu translation units through the HIP toolchain for AMD +# GPUs. Everything else keeps its plain +# CUDA spelling; cuda_to_hip.h (force-included for HIP in src/CMakeLists.txt) +# does the symbol translation. The NVIDIA path (USE_HIP=OFF) is unchanged. +option(USE_HIP "Build with HIP for AMD GPUs instead of CUDA" OFF) + +if(NOT USE_HIP) + # Default seletion of CUDA Compute Capabilities. + # This must be called before project() or cmake sets it to the oldest non-deprecated CC + # "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson. + if(EXISTS "/etc/nv_tegra_release") + # The CC list for Tegras and Jetson will require manual updates + set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87" + CACHE + STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons") + else() + # The CC list for discrete GPUs will require CMake updates + set(CMAKE_CUDA_ARCHITECTURES "all-major" + CACHE + STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons") + endif() endif() -project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA) +if(USE_HIP) + project(PopSift VERSION 0.10.1 LANGUAGES CXX HIP) +else() + project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA) -# Policy to support CUDA as a first-order language for CMake. -# Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html -cmake_policy(SET CMP0104 NEW) + # Policy to support CUDA as a first-order language for CMake. + # Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html + cmake_policy(SET CMP0104 NEW) +endif() # Set build path as a folder named as the platform (linux, windows, darwin...) plus the processor type set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}") @@ -75,70 +87,94 @@ if(BUILD_SHARED_LIBS) # Auto-build dll exports on Windows set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) - set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) + if(NOT USE_HIP) + set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) + endif() else() message(STATUS "BUILD_SHARED_LIBS OFF") - set(CMAKE_CUDA_RUNTIME_LIBRARY Static) + if(NOT USE_HIP) + set(CMAKE_CUDA_RUNTIME_LIBRARY Static) + endif() endif() # Require threads because of std::thread. find_package(Threads REQUIRED) -################### -# CUDA -################### -include(CheckLanguage) -check_language(CUDA) - -# Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path" -# target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart) -find_package(CUDAToolkit) - -message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}") -set(CUDA_VERSION ${CUDAToolkit_VERSION}) - -if(PopSift_ERRCHK_AFTER_KERNEL) - message(STATUS "Synchronizing and checking errors after every kernel call") - list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL") -endif() - -# This may not be required any more. -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - -# default stream per-thread implies that each host thread has one non-synchronizing 0-stream -# currently, the code requires legacy mode -list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy") -# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--default-stream;per-thread") +if(USE_HIP) + ################### + # HIP (AMD) + ################### + # rocThrust supplies the thrust::cuda::par compatibility namespace, so the + # Thrust code in s_filtergrid.cu compiles unchanged on ROCm. + find_package(rocthrust REQUIRED CONFIG) + + set(PopSift_CXX_STANDARD 17) + set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD}) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_HIP_STANDARD ${PopSift_CXX_STANDARD}) + set(CMAKE_HIP_STANDARD_REQUIRED ON) + set(CMAKE_HIP_SEPARABLE_COMPILATION ON) + + # HIP has __shfl_down etc. unconditionally; assist.h selects the _sync + # spelling on this flag and cuda_to_hip.h maps _sync to the mask-free builtin. + set(PopSift_HAVE_SHFL_DOWN_SYNC 1) +else() + ################### + # CUDA + ################### + include(CheckLanguage) + check_language(CUDA) + + # Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path" + # target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart) + find_package(CUDAToolkit) + + message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}") + set(CUDA_VERSION ${CUDAToolkit_VERSION}) + + if(PopSift_ERRCHK_AFTER_KERNEL) + message(STATUS "Synchronizing and checking errors after every kernel call") + list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL") + endif() -if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5") - if(PopSift_NVCC_WARNINGS) - list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage") - list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills") - list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage") - list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills") + # This may not be required any more. + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + + # default stream per-thread implies that each host thread has one non-synchronizing 0-stream + # currently, the code requires legacy mode + list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy") + # set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--default-stream;per-thread") + + if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5") + if(PopSift_NVCC_WARNINGS) + list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage") + list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills") + list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage") + list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills") + endif() endif() -endif() -set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11 -if(CUDA_VERSION_MAJOR LESS_EQUAL 8) - set(PopSift_CXX_STANDARD 11) -endif() + set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11 + if(CUDA_VERSION_MAJOR LESS_EQUAL 8) + set(PopSift_CXX_STANDARD 11) + endif() -if(NOT MSVC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}") - list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}") -endif() -set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD}) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD}) -set(CMAKE_CUDA_STANDARD_REQUIRED ON) + if(NOT MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}") + list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}") + endif() + set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD}) + set(CMAKE_CXX_STANDARD_REQUIRED ON) + set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD}) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) -if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0") - set(PopSift_HAVE_SHFL_DOWN_SYNC 1) -else() - set(PopSift_HAVE_SHFL_DOWN_SYNC 0) + if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0") + set(PopSift_HAVE_SHFL_DOWN_SYNC 1) + else() + set(PopSift_HAVE_SHFL_DOWN_SYNC 0) + endif() endif() if(NOT PopSift_USE_GRID_FILTER) diff --git a/README.md b/README.md index bcab5120..c368710e 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,8 @@ PopSift compiles and works with NVidia cards of compute capability >= 3.0 (inclu CUDA SDK 11 does no longer support compute capability 3.0. 3.5 is still supported with deprecation warnings. +PopSift also runs on AMD GPUs through ROCm/HIP (validated on gfx90a and gfx1100, and on Windows gfx1151); see the `USE_HIP` build option below. + ## Dependencies PopSift depends on: @@ -46,6 +48,42 @@ Some build options are available: * `BUILD_SHARED_LIBS` (default: `ON`) controls the type of library to build (`ON` for shared libraries, `OFF` for static) +* `USE_HIP` (default: `OFF`) builds for AMD GPUs through ROCm/HIP instead of CUDA (requires a ROCm installation); see "Building for AMD GPUs (ROCm/HIP)" below. + +### Building for AMD GPUs (ROCm/HIP) + +With a ROCm installation, configure with `-DUSE_HIP=ON` to build the GPU code through HIP instead of CUDA. The CUDA build path is unchanged when `USE_HIP=OFF` (the default). + +```shell +cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a \ + -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \ + -DCMAKE_PREFIX_PATH=/opt/rocm +make +``` + +Point `-DCMAKE_PREFIX_PATH` at the ROCm install prefix (`/opt/rocm` for a default install) so CMake can locate the `hip` and `rocThrust` packages when ROCm is not already on the search path. + +Set the target architecture with `-DCMAKE_HIP_ARCHITECTURES` (e.g. `gfx90a`, `gfx1100`); pass a semicolon-separated list such as `"gfx90a;gfx1100"` to emit a fat binary. + +#### Windows (gfx1151 / TheRock ROCm) + +Tested with the ROCm wheels from [TheRock](https://github.com/ROCm/TheRock) (see its README for installing the Windows wheels). The steps below assume `` is that install prefix and that `clang` from it is on `PATH`: + +```shell +cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1151 ^ + -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang -DCMAKE_HIP_COMPILER=clang ^ + -DCMAKE_PREFIX_PATH= -DPopSift_BUILD_EXAMPLES=OFF ^ + -DCMAKE_LINKER_TYPE=LLDFIX ^ + -DCMAKE_C_USING_LINKER_LLDFIX=-fuse-ld=lld ^ + -DCMAKE_CXX_USING_LINKER_LLDFIX=-fuse-ld=lld ^ + -DCMAKE_HIP_USING_LINKER_LLDFIX=-fuse-ld=lld +cmake --build . --config Release +``` + +The `CMAKE_LINKER_TYPE=LLDFIX` block works around the `-fgpu-rdc` device link: CMake otherwise emits `-fuse-ld=lld-link`, which AMD clang rejects under `--hip-link`. At runtime, copy a matching `amdhip64` and `amd_comgr` from `` next to the executable (a System32 Adrenalin-driver runtime can be device-library mismatched). + +On the gfx1151 APU, `maxTexture2DLayered` is smaller than on gfx90a, so the default upscale can overflow for large images; use native-resolution downsampling (`Config::setDownsampling(0)`) or cap the input size. + ## Usage The main artifact created is `libpopsift`. diff --git a/doc/sphinx/source/install/install.rst b/doc/sphinx/source/install/install.rst index da6d1dde..0f2d2dc7 100644 --- a/doc/sphinx/source/install/install.rst +++ b/doc/sphinx/source/install/install.rst @@ -33,6 +33,8 @@ Software The core library depends only on Cuda >= 7.0 +PopSift also builds for AMD GPUs through ROCm/HIP instead of CUDA, requiring ROCm 7.2 or newer; see the ``USE_HIP`` CMake option below. + The library includes a few sample applications that show how to use the library. They require @@ -173,6 +175,8 @@ CMake configuration can be controlled by changing the values of the following va * :code:`PopSift_BUILD_DOC:BOOL=OFF` to enable/disable building this documentation and the Doxygen one. +* :code:`USE_HIP:BOOL=OFF` to build for AMD GPUs with ROCm/HIP instead of CUDA. Set the target GPU architecture with :code:`CMAKE_HIP_ARCHITECTURES` (for example :code:`gfx90a` for CDNA2 / MI200, or :code:`gfx1100` for RDNA3). Requires a ROCm installation (7.2 or newer) providing HIP. + For example, if you do not want to build the applications, you have to pass :code:`-DPopSift_BUILD_EXAMPLES:BOOL=OFF` and so on. diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 11cb5a6a..5bf4620f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -41,14 +41,71 @@ add_library(popsift popsift/common/vec_macros.h popsift/common/clamp.h) +if(USE_HIP) + # Collect the .cu translation units so the HIP build can mark them LANGUAGE HIP. + # Under USE_HIP, .cu would otherwise default to CUDA (a language not enabled in + # the HIP build), so the language must be set explicitly. This is only needed on + # the HIP path, so the collection lives inside the guard. + get_target_property(_popsift_sources popsift SOURCES) + set(_popsift_cu_sources "") + foreach(_src ${_popsift_sources}) + if(_src MATCHES "\\.cu$") + list(APPEND _popsift_cu_sources "${_src}") + endif() + endforeach() + set_source_files_properties(${_popsift_cu_sources} PROPERTIES LANGUAGE HIP) +endif() + target_link_libraries(popsift PUBLIC - CUDA::cudart Threads::Threads) +if(USE_HIP) + # rocThrust transitively links hip::device, whose INTERFACE_COMPILE_OPTIONS inject + # "-x hip" for COMPILE_LANGUAGE:CXX. Keep that device-compile interface PRIVATE so + # it does not leak "-x hip" onto plain-C++ consumers of popsift (the example apps). + # Consumers still need the HIP runtime to link and its headers to satisfy the + # public include, so expose hip::host PUBLIC (link + headers, no + # "-x hip"). The library .cu sources are already LANGUAGE HIP, so they get the HIP + # compiler regardless of the link visibility. + target_link_libraries(popsift PRIVATE roc::rocthrust) + target_link_libraries(popsift PUBLIC hip::host) +else() + target_link_libraries(popsift PUBLIC CUDA::cudart) +endif() set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION}) set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d") -set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +if(USE_HIP) + # CMAKE_HIP_ARCHITECTURES is already resolved by project(... LANGUAGES CXX HIP) + # in the top-level CMakeLists: it auto-detects the host GPU arch, honors an + # explicit -DCMAKE_HIP_ARCHITECTURES, and errors on a host with no GPU. Pass + # -DCMAKE_HIP_ARCHITECTURES to override. + set_target_properties(popsift PROPERTIES + HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}") + # USE_HIP selects the HIP branch of cuda_to_hip.h. __HIP_PLATFORM_AMD__ is + # only defined once the HIP runtime header is included, so it cannot gate the + # include of that very header; the explicit define breaks the cycle. + target_compile_definitions(popsift PRIVATE USE_HIP) + # Force-include the compat header into every HIP translation unit so no + # per-file #include edits are needed (minimal footprint). + # -fgpu-rdc keeps device code relocatable so the cross-TU __device__/__constant__ + # globals (d_consts, d_gauss, dct/dbuf/dobuf -- declared extern in headers, + # defined in one .cu, used in many) are resolved by the final --hip-link step. + # Without it each .o is device-linked standalone and those references are + # undefined. The matching -fgpu-rdc on the link rule (below) performs the + # device link. + target_compile_options(popsift PRIVATE + $<$:-include${CMAKE_CURRENT_SOURCE_DIR}/popsift/cuda_to_hip.h> + $<$:-fgpu-rdc>) + target_link_options(popsift PRIVATE -fgpu-rdc --hip-link) + # hip_compat/ supplies a shim for the sources that include it + # directly; HIP build only. PRIVATE so the public popsift.h + # include still resolves to the real CUDA header for downstream NVIDIA users. + target_include_directories(popsift PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/popsift/hip_compat) +else() + set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +endif() if(MSVC) target_compile_options(popsift PRIVATE $<$:-Xcompiler=/Zc:preprocessor>) endif() diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt index a6b8886f..32ed796d 100755 --- a/src/application/CMakeLists.txt +++ b/src/application/CMakeLists.txt @@ -70,7 +70,24 @@ endif() add_executable(popsift-demo main.cpp pgmread.cpp pgmread.h) -set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11) +if(USE_HIP) + # main.cpp/pgmread.cpp have NO device code; they consume popsift's host API only. + # popsift now links the HIP device-compile interface (roc::rocthrust -> hip::device, + # which injects "-x hip") PRIVATE and exposes only hip::host PUBLIC, so these + # sources compile as plain C++ (no "-x hip") and just link the HIP runtime. They + # still need the hip_compat shim and the cuda_to_hip.h force-include + # because popsift's public headers (device_prop.h) #include ; the + # library keeps those PRIVATE, so the consumer adds them explicitly here. USE_HIP + # selects the HIP branch of cuda_to_hip.h. C++17 for the compat header. + set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 17) + target_compile_definitions(popsift-demo PRIVATE USE_HIP) + target_include_directories(popsift-demo PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat) + target_compile_options(popsift-demo PRIVATE + $<$:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>) +else() + set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11) +endif() target_compile_options(popsift-demo PRIVATE ${PD_COMPILE_OPTIONS} ) target_include_directories(popsift-demo PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS}) @@ -84,7 +101,17 @@ target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS}) add_executable(popsift-match match.cpp pgmread.cpp pgmread.h) -set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11) +if(USE_HIP) + # See popsift-demo above: plain-C++ host consumer, no device code, no "-x hip". + set_property(TARGET popsift-match PROPERTY CXX_STANDARD 17) + target_compile_definitions(popsift-match PRIVATE USE_HIP) + target_include_directories(popsift-match PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat) + target_compile_options(popsift-match PRIVATE + $<$:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>) +else() + set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11) +endif() target_compile_options(popsift-match PRIVATE ${PD_COMPILE_OPTIONS} ) target_include_directories(popsift-match PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS}) diff --git a/src/popsift/common/assist.h b/src/popsift/common/assist.h index 50e6fe6f..1c52c93d 100644 --- a/src/popsift/common/assist.h +++ b/src/popsift/common/assist.h @@ -56,6 +56,34 @@ template __device__ inline T shuffle_down( T variable, int delta, in template __device__ inline T shuffle_xor ( T variable, int delta, int ws ) { return __shfl_xor ( variable, delta, ws ); } #endif +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +/* Wave64 sub-warp helpers. + * + * These kernels were written for a 32-lane NVIDIA warp. On a 64-lane CDNA + * wavefront PopSift packs two logical 32-thread rows into one wavefront, so the + * whole-wavefront builtins (__ballot/__any over all 64 lanes) mix the two rows + * and miscount. The helpers below restrict a collective to the caller's own + * 32-lane group (its lane's half of the wavefront), reproducing 32-lane warp + * semantics. The group index is the thread's logical row (threadIdx.y for the + * extrema kernel); a single-row 32-thread block is group 0. CUDA is unaffected: + * this block compiles only on HIP. + */ +__device__ inline unsigned int ballot_group( unsigned int pred, int group ) +{ + const unsigned long long b = __ballot( pred ); + return (unsigned int)( b >> ( group * 32 ) ); +} +__device__ inline int any_group( unsigned int pred, int group ) +{ + const unsigned long long b = __ballot( pred ); + return ( (unsigned int)( b >> ( group * 32 ) ) ) != 0u; +} +#else +__device__ inline unsigned int ballot_group( unsigned int pred, int ) { return popsift::ballot( pred ); } +__device__ inline int any_group( unsigned int pred, int ) { return popsift::any( pred ); } +#endif + + /* This computation is needed very frequently when a dim3 grid block is * initialized. It ensure that the tail is not forgotten. */ @@ -73,8 +101,125 @@ float readTex( cudaTextureObject_t tex, float x, float y, float z ) * we will get the expected cell (or an interpolation very * close by) iff we add 0.5f to X and Y coordinate. */ +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + /* Observed on gfx90a, ROCm 7.2.1: hipCreateTextureObject rejects a + * hardware-linear-filtered texture over an element-read float array (see + * sift_octave.cu), so the linear + * textures are created with point filtering and we reproduce CUDA's bilinear + * filter in software here. This is an empirical limitation on this device/ROCm; + * it may not hold on other arches (re-verify on RDNA). + * CUDA's unnormalized linear filter on tex2DLayered(c) samples at index c-0.5, + * i.e. i0=floor(c-0.5) with weight frac=(c-0.5)-i0; readTex passes c=x+0.5, + * so i0=floor(x), frac=x-floor(x). Point textures (used with integer x,y) hit + * frac==0 and reduce to the exact texel, so this path is correct for them too. + * SIFT blurs are per-layer; z is always an integer level, so interpolate in x,y + * at the fixed layer z. Point-filtered fetch of texel ix is tex2DLayered(ix+0.5). + */ + const float fx = floorf( x ); + const float fy = floorf( y ); + const float ax = x - fx; + const float ay = y - fy; + const float t00 = tex2DLayered( tex, fx + 0.5f, fy + 0.5f, z ); + const float t10 = tex2DLayered( tex, fx + 1.5f, fy + 0.5f, z ); + const float t01 = tex2DLayered( tex, fx + 0.5f, fy + 1.5f, z ); + const float t11 = tex2DLayered( tex, fx + 1.5f, fy + 1.5f, z ); + const float top = t00 + ax * ( t10 - t00 ); + const float bot = t01 + ax * ( t11 - t01 ); + return top + ay * ( bot - top ); +#else return tex2DLayered( tex, x+0.5f, y+0.5f, z ); +#endif +} + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +/* HIP-only: bundle a pyramid array's surface with its per-level width/height. + * + * Observed on gfx90a/CDNA2 (ROCm 7.2.1): LAYERED images are broken. After a layered + * array is written one layer at a time via surf2DLayeredwrite, a read in a later + * kernel launch (tex2DLayered, surf2DLayeredread, and even host hipMemcpy3D) returns + * one single (last-written) layer's data for EVERY layer index -- the layer + * dimension is collapsed. Filed as ROCm/clr#275 (popsift is the motivating case); + * a standalone reproducer mirrors that issue's array3d_check. AMD's partial + * fix ROCm/rocm-systems#6683 corrects only surf2DLayered; tex2DLayered and + * hipMemcpy3D may still collapse the layer dimension independently. A standalone + * repro confirms this AND confirms a non-layered 3D array + * (surf3Dwrite/surf3Dread/tex3D) is fully coherent across launches. So on + * HIP the pyramid arrays are allocated as non-layered 3D arrays (sift_octave.cu) + * and every layered access maps to the 3D form with the layer index as the z + * coordinate. Reads funnel through readTex below and sample the surface via + * surf3Dread; the surface address is a 1:1 match for the surf2DLayeredwrite the + * producer kernels issue (now mapped to surf3Dwrite in cuda_to_hip.h), so the + * blurred value a producer wrote at (x,y,level) is exactly what the consumer reads. + * tex is carried for parity with the CUDA signature but is unused on HIP. + * On CUDA this struct is unused (callers pass the texture object straight to the + * tex2DLayered readTex above) and the read path is byte-for-byte unchanged. + */ +struct LayeredTex +{ + cudaTextureObject_t tex; + cudaSurfaceObject_t surf; + int width; + int height; +}; + +/* Point-fetch one texel from the 3D pyramid surface, clamping the integer coords + * to [0,width-1]x[0,height-1] to reproduce the cudaAddressModeClamp behaviour of + * the point/linear textures these reads replace (an out-of-range surf3Dread + * returns 0 on HIP, which would corrupt image borders). The surface x is a byte + * offset (sizeof(float)), matching the surf2DLayeredwrite call sites (idx*4); z is + * the (integer) blur level. + */ +__device__ static inline +float surfFetchClamped( const LayeredTex& s, int ix, int iy, int layer ) +{ + ix = ix < 0 ? 0 : ( ix >= s.width ? s.width - 1 : ix ); + iy = iy < 0 ? 0 : ( iy >= s.height ? s.height - 1 : iy ); + float v; + surf3Dread( &v, s.surf, ix * 4, iy, layer ); + return v; +} + +__device__ static inline +float readTex( const LayeredTex& s, float x, float y, float z ) +{ + /* Same -0.5 texel-center bilinear convention as the texture readTex above, + * but sampling the coherent 3D surface instead of the broken layered texture. + * SIFT blurs are per-level, so z is always an integer level: interpolate in + * x,y at the fixed z slice (no z interpolation), matching the CUDA layered + * point/linear texture fetch. + */ + const int layer = (int)z; + const float fx = floorf( x ); + const float fy = floorf( y ); + const float ax = x - fx; + const float ay = y - fy; + const int ix = (int)fx; + const int iy = (int)fy; + const float t00 = surfFetchClamped( s, ix, iy, layer ); + const float t10 = surfFetchClamped( s, ix+1, iy, layer ); + const float t01 = surfFetchClamped( s, ix, iy+1, layer ); + const float t11 = surfFetchClamped( s, ix+1, iy+1, layer ); + const float top = t00 + ax * ( t10 - t00 ); + const float bot = t01 + ax * ( t11 - t01 ); + return top + ay * ( bot - top ); } +#endif + +/* Source handle for a pyramid-array layered read funnelled through readTex. + * On HIP it is a LayeredTex (coherent 3D surface + dims); on CUDA it is the plain + * texture object, so kernel signatures and the read path stay identical to upstream + * on CUDA. Build it at the launch site with POPSIFT_LAYERED_SRC, which on CUDA + * expands to exactly the texture argument (no behavioural change) and on HIP packs + * the array's surface and per-level width/height for surf3Dread. + */ +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +using LayeredReadTex = LayeredTex; +#define POPSIFT_LAYERED_SRC( tex, surf, width, height ) \ + ::popsift::LayeredTex{ (tex), (surf), (width), (height) } +#else +using LayeredReadTex = cudaTextureObject_t; +#define POPSIFT_LAYERED_SRC( tex, surf, width, height ) (tex) +#endif __device__ static inline float readTex( cudaTextureObject_t tex, float x, float y ) diff --git a/src/popsift/common/excl_blk_prefix_sum.h b/src/popsift/common/excl_blk_prefix_sum.h index d77bf7b0..7e37b76d 100644 --- a/src/popsift/common/excl_blk_prefix_sum.h +++ b/src/popsift/common/excl_blk_prefix_sum.h @@ -90,9 +90,17 @@ class Block int ews = 0; // exclusive warp prefix sum int self = (valid) ? _reader.get(cell) : 0; - // This loop is an exclusive prefix sum for one warp + // This loop is an exclusive prefix sum for one warp. The block is + // (32,blockDim.y), one warp per threadIdx.y row. On a 64-lane + // wavefront two rows share a wavefront, so the scan shuffle must be + // confined to a width-32 sub-group (threadIdx.x is the in-row lane id) + // or odd rows pull partial sums from the wrong row. CUDA: unchanged. for( int s=0; s<5; s++ ) { +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + const int add = popsift::shuffle_up( ews+self, 1< other_val ) : ( my_val < other_val ); const bool must_swap = ! ( my_more ^ reverse ^ increasing ); int lane = must_swap ? ( 1 << shift ) : 0; +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + return popsift::shuffle_xor( my_index, lane, 32 ); +#else return popsift::shuffle_xor( my_index, lane ); +#endif } __device__ inline diff --git a/src/popsift/cuda_to_hip.h b/src/popsift/cuda_to_hip.h new file mode 100644 index 00000000..b4a24894 --- /dev/null +++ b/src/popsift/cuda_to_hip.h @@ -0,0 +1,209 @@ +//********************************************************// +// CUDA-to-HIP compatibility shim for PopSift // +// // +// Minimal-footprint port: every other // +// source file keeps its plain CUDA spelling. On AMD this // +// header includes the HIP runtime and #defines the CUDA // +// symbols the project uses to their HIP equivalents. On // +// NVIDIA it is a no-op that pulls in . // +// // +// Symbol names follow PyTorch's authoritative hipify map: // +// torch/utils/hipify/cuda_to_hip_mappings.py // +//********************************************************// + +#ifndef POPSIFT_CUDA_TO_HIP_H +#define POPSIFT_CUDA_TO_HIP_H + +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + +#include +// Backward-compat note (ROCm 7.2.x -> newer ROCm): newer ROCm's +// defines real __shfl_*_sync<...> functions whose names collide +// with the function-like __shfl_*_sync macros defined further below -- if a +// rocThrust/rocprim include pulls that header in AFTER the macros, the macros +// rewrite the header's own definitions and it fails to compile ("use of +// undeclared identifier 'mask'"). Pull the header in FIRST so the real functions +// are defined before the macros. On older ROCm (7.2.x) the header exists and has +// no such functions, so this is a harmless no-op there -- guarded with +// __has_include so it also tolerates any ROCm that ships without the header. +// Only needed where rocThrust/rocprim is pulled in (the device .cu TUs, compiled +// by HIP clang); gate on __clang__ so the plain-C++ host example consumers (which +// may be built by gcc and never include rocThrust) do not try to parse hip_bf16.h, +// whose vector intrinsics require clang builtins. +#if defined(__clang__) && defined(__has_include) +# if __has_include() +# include +# endif +#endif + +// ---- Error handling ---- +#define cudaError_t hipError_t +#define cudaSuccess hipSuccess +#define cudaGetErrorString hipGetErrorString +#define cudaGetLastError hipGetLastError +#define cudaPeekAtLastError hipPeekAtLastError +#define cudaDeviceSynchronize hipDeviceSynchronize +#define cudaDeviceReset hipDeviceReset + +// ---- Device management ---- +#define cudaGetDeviceCount hipGetDeviceCount +#define cudaGetDevice hipGetDevice +#define cudaGetDeviceProperties hipGetDeviceProperties +#define cudaSetDevice hipSetDevice +#define cudaDeviceProp hipDeviceProp_t +#define cudaDeviceSetLimit hipDeviceSetLimit + +// ---- Events / streams ---- +#define cudaEvent_t hipEvent_t +#define cudaEventCreate hipEventCreate +#define cudaEventDestroy hipEventDestroy +#define cudaEventRecord hipEventRecord +#define cudaEventSynchronize hipEventSynchronize +#define cudaEventElapsedTime hipEventElapsedTime +#define cudaStream_t hipStream_t +#define cudaStreamCreate hipStreamCreate +#define cudaStreamDestroy hipStreamDestroy +#define cudaStreamSynchronize hipStreamSynchronize +#define cudaStreamWaitEvent hipStreamWaitEvent + +// ---- Linear / pitched / host memory ---- +#define cudaMalloc hipMalloc +#define cudaMallocManaged hipMallocManaged +#define cudaMallocHost hipHostMalloc +#define cudaFree hipFree +#define cudaFreeHost hipHostFree +#define cudaMallocPitch hipMallocPitch +#define cudaHostRegister hipHostRegister +#define cudaHostUnregister hipHostUnregister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterMapped hipHostRegisterMapped + +#define cudaMemcpy hipMemcpy +#define cudaMemcpyAsync hipMemcpyAsync +#define cudaMemcpy2D hipMemcpy2D +#define cudaMemcpy2DAsync hipMemcpy2DAsync +#define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync + +// ---- 3D / layered memory (Gaussian pyramid arrays) ---- +#define cudaMemcpy3D hipMemcpy3D +#define cudaMemcpy3DParms hipMemcpy3DParms +#define cudaPitchedPtr hipPitchedPtr +#define cudaExtent hipExtent +#define cudaPos hipPos +#define make_cudaPitchedPtr make_hipPitchedPtr +#define make_cudaExtent make_hipExtent +#define make_cudaPos make_hipPos + +// ---- Constant / symbol memory ---- +#define cudaMemcpyToSymbol hipMemcpyToSymbol +#define cudaMemcpyToSymbolAsync hipMemcpyToSymbolAsync +#define cudaMemcpyFromSymbol hipMemcpyFromSymbol +#define cudaMemcpyFromSymbolAsync hipMemcpyFromSymbolAsync +#define cudaGetSymbolAddress hipGetSymbolAddress + +// ---- memcpy kinds ---- +#define cudaMemcpyKind hipMemcpyKind +#define cudaMemcpyHostToDevice hipMemcpyHostToDevice +#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost +#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice +#define cudaMemcpyHostToHost hipMemcpyHostToHost + +// ---- CUDA arrays + channel format (texture/surface backing store) ---- +#define cudaArray hipArray +#define cudaArray_t hipArray_t +#define cudaMallocArray hipMallocArray +#define cudaMalloc3DArray hipMalloc3DArray +#define cudaFreeArray hipFreeArray +#define cudaArrayLayered hipArrayLayered +#define cudaArraySurfaceLoadStore hipArraySurfaceLoadStore +#define cudaArrayDefault hipArrayDefault +#define cudaChannelFormatDesc hipChannelFormatDesc +#define cudaCreateChannelDesc hipCreateChannelDesc +#define cudaChannelFormatKindFloat hipChannelFormatKindFloat +#define cudaChannelFormatKindUnsigned hipChannelFormatKindUnsigned +#define cudaChannelFormatKindSigned hipChannelFormatKindSigned + +// ---- Texture objects ---- +#define cudaTextureObject_t hipTextureObject_t +#define cudaCreateTextureObject hipCreateTextureObject +#define cudaDestroyTextureObject hipDestroyTextureObject +#define cudaResourceDesc hipResourceDesc +#define cudaTextureDesc hipTextureDesc +#define cudaResourceTypePitch2D hipResourceTypePitch2D +#define cudaResourceTypeArray hipResourceTypeArray +#define cudaResourceTypeLinear hipResourceTypeLinear +#define cudaAddressModeClamp hipAddressModeClamp +#define cudaAddressModeWrap hipAddressModeWrap +#define cudaFilterModeLinear hipFilterModeLinear +#define cudaFilterModePoint hipFilterModePoint +#define cudaReadModeElementType hipReadModeElementType +#define cudaReadModeNormalizedFloat hipReadModeNormalizedFloat +#define cudaTextureType2D hipTextureType2D +#define cudaTextureType2DLayered hipTextureType2DLayered + +// ---- Surface objects (pyramid write path) ---- +#define cudaSurfaceObject_t hipSurfaceObject_t +#define cudaCreateSurfaceObject hipCreateSurfaceObject +#define cudaDestroySurfaceObject hipDestroySurfaceObject +#define cudaBoundaryModeZero hipBoundaryModeZero +#define cudaBoundaryModeClamp hipBoundaryModeClamp +#define cudaBoundaryModeTrap hipBoundaryModeTrap + +// HIP layered-image coherency is broken on gfx90a/CDNA2 (observed ROCm 7.2.1), +// filed as ROCm/clr#275 (the partial fix ROCm/rocm-systems#6683 covers only +// surf2DLayered): after a layered array is written layer-by-layer via +// surf2DLayeredwrite, a read in a later kernel launch (tex2DLayered OR +// surf2DLayeredread, host hipMemcpy3D too) returns a single (last-written) layer's +// data for EVERY layer index -- the layer dimension is effectively collapsed. +// A standalone reproducer confirms it and also +// confirms a NON-layered 3D array (surf3Dwrite/surf3Dread/tex3D) is fully +// coherent across launches. So on HIP the pyramid arrays are allocated as +// non-layered 3D arrays (see sift_octave.cu) and the layered builtins map to the +// 3D ones with the layer index used as the z coordinate (a 1:1 mapping: the +// element write/read addresses are identical). CUDA keeps real layered arrays and +// the native builtins, byte-for-byte unchanged. +// +// HIP's surf3Dwrite has no boundary-mode parameter (CUDA's surf2DLayeredwrite has +// a trailing 6th arg); hipBoundaryModeZero matches the AMD image-store default +// (out-of-range writes dropped), so dropping the argument is faithful. +template +__device__ __forceinline__ void popsift_surf2DLayeredwrite( + T data, hipSurfaceObject_t surfObj, int x, int y, int layer, + int /*boundaryMode*/) +{ + surf3Dwrite(data, surfObj, x, y, layer); +} +#define surf2DLayeredwrite(data, surf, x, y, layer, mode) \ + popsift_surf2DLayeredwrite((data), (surf), (x), (y), (layer), (mode)) + +// ---- Directed-rounding FP intrinsics ---- +// HIP/AMD does not provide the round-toward-+infinity intrinsics. PopSift's only +// uses are descriptor weight accumulation where the operands are non-negative and +// the comment notes _ru is merely a fast round; the round-to-nearest form is the +// faithful HIP equivalent. +#define __fmaf_ru(a, b, c) __fmaf_rn((a), (b), (c)) +#define __fmul_ru(a, b) __fmul_rn((a), (b)) + +// ---- Warp intrinsics ---- +// PopSift (assist.h) passes the CUDA 32-bit full mask 0xffffffff to the *_sync +// builtins. On a 64-lane CDNA wavefront a uint32 mask is meaningless; HIP's +// mask-free builtins poll the whole active wavefront, which is the faithful +// equivalent of "operate over the (sub)warp" on the blocks PopSift launches. +// Map the _sync forms (both the 2-arg and the explicit-width 3/4-arg overloads +// resolve to these) to the mask-free HIP builtins. +#define __shfl_sync(mask, ...) __shfl(__VA_ARGS__) +#define __shfl_up_sync(mask, ...) __shfl_up(__VA_ARGS__) +#define __shfl_down_sync(mask, ...) __shfl_down(__VA_ARGS__) +#define __shfl_xor_sync(mask, ...) __shfl_xor(__VA_ARGS__) +#define __ballot_sync(mask, pred) __ballot(pred) +#define __any_sync(mask, pred) __any(pred) +#define __all_sync(mask, pred) __all(pred) + +#else // NVIDIA / CUDA + +#include + +#endif + +#endif // POPSIFT_CUDA_TO_HIP_H diff --git a/src/popsift/features.cu b/src/popsift/features.cu index 5aa706a1..8f6db5e2 100755 --- a/src/popsift/features.cu +++ b/src/popsift/features.cu @@ -175,11 +175,22 @@ l2_in_t0( const float4* lptr, const float4* rptr ) + mval.y * mval.y + mval.z * mval.z + mval.w * mval.w; + // 32-lane reduction over threadIdx.x (compute_distance is a 32-thread block). + // Force a width-32 sub-group on a 64-lane wavefront so the inactive upper + // half is never sampled. CUDA: width 32 is the whole warp, unchanged. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + res += shuffle_down( res, 16, 32 ); + res += shuffle_down( res, 8, 32 ); + res += shuffle_down( res, 4, 32 ); + res += shuffle_down( res, 2, 32 ); + res += shuffle_down( res, 1, 32 ); +#else res += shuffle_down( res, 16 ); res += shuffle_down( res, 8 ); res += shuffle_down( res, 4 ); res += shuffle_down( res, 2 ); res += shuffle_down( res, 1 ); +#endif return res; } diff --git a/src/popsift/hip_compat/cuda_runtime.h b/src/popsift/hip_compat/cuda_runtime.h new file mode 100644 index 00000000..6243bd28 --- /dev/null +++ b/src/popsift/hip_compat/cuda_runtime.h @@ -0,0 +1,5 @@ +// HIP-only shim so source that does `#include ` resolves on +// ROCm (where no CUDA headers exist). This directory is added to the include +// path ONLY for the HIP build; the NVIDIA build uses the real CUDA header. +#pragma once +#include "../cuda_to_hip.h" diff --git a/src/popsift/hip_compat/math_constants.h b/src/popsift/hip_compat/math_constants.h new file mode 100644 index 00000000..ffc01ee2 --- /dev/null +++ b/src/popsift/hip_compat/math_constants.h @@ -0,0 +1,16 @@ +// HIP-only shim for CUDA's . ROCm ships no equivalent header, +// so provide the CUDART_* constants PopSift uses, with CUDA's values. On the +// HIP include path only; the NVIDIA build uses the real CUDA header. +#pragma once + +#include + +#ifndef CUDART_INF_F +#define CUDART_INF_F __int_as_float(0x7f800000) +#endif +#ifndef CUDART_NAN_F +#define CUDART_NAN_F __int_as_float(0x7fffffff) +#endif +#ifndef CUDART_PI_F +#define CUDART_PI_F 3.141592654f +#endif diff --git a/src/popsift/s_desc_grid.cu b/src/popsift/s_desc_grid.cu index 099c4709..1e6f25ef 100644 --- a/src/popsift/s_desc_grid.cu +++ b/src/popsift/s_desc_grid.cu @@ -21,7 +21,7 @@ void ext_desc_grid_sub( const int ix, const float ang, const Extremum* ext, float* __restrict__ features, - cudaTextureObject_t layer_tex ) + LayeredReadTex layer_tex ) { const int tile = ( ( ( iy << 2 ) + ix ) << 3 ); // base of the 8 floats written by this group of 16 threads @@ -121,7 +121,7 @@ void ext_desc_grid_sub( const int ix, } } -__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex) +__global__ void ext_desc_grid(int octave, LayeredReadTex layer_tex) { const int o_offset = dct.ori_ps[octave] + blockIdx.x; const int ix = threadIdx.y; diff --git a/src/popsift/s_desc_grid.h b/src/popsift/s_desc_grid.h index c0919806..63e3a991 100644 --- a/src/popsift/s_desc_grid.h +++ b/src/popsift/s_desc_grid.h @@ -6,6 +6,7 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #pragma once +#include "common/assist.h" #include "common/debug_macros.h" #include "common/plane_2d.h" #include "sift_extremum.h" @@ -17,7 +18,7 @@ * block = 16,4,4 * grid = nunmber of orientations */ -__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex); +__global__ void ext_desc_grid(int octave, popsift::LayeredReadTex layer_tex); namespace popsift { @@ -39,7 +40,10 @@ inline static bool start_ext_desc_grid( const int octave, Octave& oct_obj ) ext_desc_grid <<>> ( octave, - oct_obj.getDataTexPoint( ) ); + POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ), + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ) ); POP_SYNC_CHK; diff --git a/src/popsift/s_desc_igrid.cu b/src/popsift/s_desc_igrid.cu index 9f77f12f..eba3c50c 100644 --- a/src/popsift/s_desc_igrid.cu +++ b/src/popsift/s_desc_igrid.cu @@ -20,7 +20,7 @@ void ext_desc_igrid_sub( const float x, const float y, const int level, const float cos_t, const float sin_t, const float SBP, const Extremum* ext, float* __restrict__ features, - cudaTextureObject_t texLinear ) + LayeredReadTex texLinear ) { const int ix = threadIdx.y & 3; const int iy = threadIdx.y / 4; @@ -74,7 +74,7 @@ void ext_desc_igrid_sub( const float x, const float y, const int level, } } -__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear) +__global__ void ext_desc_igrid(int octave, LayeredReadTex texLinear) { const int num = dct.ori_ct[octave]; diff --git a/src/popsift/s_desc_igrid.h b/src/popsift/s_desc_igrid.h index 8980a4bc..7bed27df 100644 --- a/src/popsift/s_desc_igrid.h +++ b/src/popsift/s_desc_igrid.h @@ -6,6 +6,7 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #pragma once +#include "common/assist.h" #include "common/debug_macros.h" #include "sift_extremum.h" #include "sift_octave.h" @@ -16,7 +17,7 @@ * block = 16,4,4 or with 32,4,4, depending on macros * grid = nunmber of orientations */ -__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear); +__global__ void ext_desc_igrid(int octave, popsift::LayeredReadTex texLinear); namespace popsift { @@ -40,7 +41,10 @@ inline static bool start_ext_desc_igrid( const int octave, Octave& oct_obj ) ext_desc_igrid <<>> ( octave, - oct_obj.getDataTexLinear( ).tex ); + POPSIFT_LAYERED_SRC( oct_obj.getDataTexLinear( ).tex, + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ) ); POP_SYNC_CHK; diff --git a/src/popsift/s_desc_iloop.cu b/src/popsift/s_desc_iloop.cu index 84673a20..0d8ed08c 100644 --- a/src/popsift/s_desc_iloop.cu +++ b/src/popsift/s_desc_iloop.cu @@ -19,7 +19,7 @@ __device__ static inline void ext_desc_iloop_sub( const float ang, const Extremum* ext, float* __restrict__ features, - cudaTextureObject_t layer_tex, + LayeredReadTex layer_tex, const int width, const int height ) { @@ -114,6 +114,18 @@ void ext_desc_iloop_sub( const float ang, dpt[0] += dpt[8]; /* reduction here */ + // 32-lane reduction over threadIdx.x; confine the shuffles to a width-32 + // sub-group on a 64-lane wavefront (see s_desc_loop.cu). CUDA unchanged. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + for (int i = 0; i < 8; i++) { + dpt[i] += popsift::shuffle_down( dpt[i], 16, 32 ); + dpt[i] += popsift::shuffle_down( dpt[i], 8, 32 ); + dpt[i] += popsift::shuffle_down( dpt[i], 4, 32 ); + dpt[i] += popsift::shuffle_down( dpt[i], 2, 32 ); + dpt[i] += popsift::shuffle_down( dpt[i], 1, 32 ); + dpt[i] = popsift::shuffle ( dpt[i], 0, 32 ); + } +#else for (int i = 0; i < 8; i++) { dpt[i] += popsift::shuffle_down( dpt[i], 16 ); dpt[i] += popsift::shuffle_down( dpt[i], 8 ); @@ -122,13 +134,14 @@ void ext_desc_iloop_sub( const float ang, dpt[i] += popsift::shuffle_down( dpt[i], 1 ); dpt[i] = popsift::shuffle ( dpt[i], 0 ); } +#endif if( threadIdx.x < 8 ) { features[tile+threadIdx.x] = dpt[threadIdx.x]; } } -__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int w, int h) +__global__ void ext_desc_iloop(int octave, LayeredReadTex layer_tex, int w, int h) { const int o_offset = dct.ori_ps[octave] + blockIdx.x; Descriptor* desc = &dbuf.desc [o_offset]; diff --git a/src/popsift/s_desc_iloop.h b/src/popsift/s_desc_iloop.h index e69409b6..03a020ab 100644 --- a/src/popsift/s_desc_iloop.h +++ b/src/popsift/s_desc_iloop.h @@ -6,13 +6,14 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #pragma once +#include "common/assist.h" #include "common/debug_macros.h" #include "common/plane_2d.h" #include "sift_extremum.h" #include "sift_octave.h" #include "sift_pyramid.h" -__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int width, int height); +__global__ void ext_desc_iloop(int octave, popsift::LayeredReadTex layer_tex, int width, int height); namespace popsift { @@ -34,7 +35,10 @@ inline static bool start_ext_desc_iloop( const int octave, Octave& oct_obj ) ext_desc_iloop <<>> ( octave, - oct_obj.getDataTexLinear( ).tex, + POPSIFT_LAYERED_SRC( oct_obj.getDataTexLinear( ).tex, + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), oct_obj.getWidth(), oct_obj.getHeight() ); diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu index 4c5f46c2..d58a46f9 100644 --- a/src/popsift/s_desc_loop.cu +++ b/src/popsift/s_desc_loop.cu @@ -19,7 +19,7 @@ __device__ static inline void ext_desc_loop_sub( const float ang, const Extremum* ext, float* __restrict__ features, - cudaTextureObject_t layer_tex, + LayeredReadTex layer_tex, const int width, const int height ) { @@ -124,6 +124,21 @@ void ext_desc_loop_sub( const float ang, dpt[0] += dpt[8]; /* reduction here */ + // 32-lane reduction over threadIdx.x. The block is (32,4,4): each (y,z) pair + // is its own 32-thread group, so on a 64-lane wavefront (two groups per + // wavefront) the shuffles must be confined to a width-32 sub-group, else the + // reduction and the lane-0 broadcast leak across the group boundary and + // corrupt half the descriptors (NaN). On CUDA width 32 is the whole warp. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + for (int i = 0; i < 8; i++) { + dpt[i] += popsift::shuffle_down( dpt[i], 16, 32 ); + dpt[i] += popsift::shuffle_down( dpt[i], 8, 32 ); + dpt[i] += popsift::shuffle_down( dpt[i], 4, 32 ); + dpt[i] += popsift::shuffle_down( dpt[i], 2, 32 ); + dpt[i] += popsift::shuffle_down( dpt[i], 1, 32 ); + dpt[i] = popsift::shuffle ( dpt[i], 0, 32 ); + } +#else for (int i = 0; i < 8; i++) { dpt[i] += popsift::shuffle_down( dpt[i], 16 ); dpt[i] += popsift::shuffle_down( dpt[i], 8 ); @@ -132,13 +147,14 @@ void ext_desc_loop_sub( const float ang, dpt[i] += popsift::shuffle_down( dpt[i], 1 ); dpt[i] = popsift::shuffle ( dpt[i], 0 ); } +#endif if( threadIdx.x < 8 ) { features[tile+threadIdx.x] = dpt[threadIdx.x]; } } -__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int w, int h) +__global__ void ext_desc_loop(int octave, LayeredReadTex layer_tex, int w, int h) { const int o_offset = dct.ori_ps[octave] + blockIdx.x; Descriptor* desc = &dbuf.desc [o_offset]; diff --git a/src/popsift/s_desc_loop.h b/src/popsift/s_desc_loop.h index 600db498..b7e6084b 100644 --- a/src/popsift/s_desc_loop.h +++ b/src/popsift/s_desc_loop.h @@ -6,6 +6,7 @@ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ #pragma once +#include "common/assist.h" #include "common/debug_macros.h" #include "common/plane_2d.h" #include "sift_extremum.h" @@ -14,7 +15,7 @@ #undef BLOCK_3_DIMS -__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int width, int height); +__global__ void ext_desc_loop(int octave, popsift::LayeredReadTex layer_tex, int width, int height); namespace popsift { @@ -42,7 +43,10 @@ inline static bool start_ext_desc_loop( const int octave, Octave& oct_obj ) ext_desc_loop <<>> ( octave, - oct_obj.getDataTexPoint( ), + POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ), + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), oct_obj.getWidth(), oct_obj.getHeight() ); diff --git a/src/popsift/s_desc_norm_l2.h b/src/popsift/s_desc_norm_l2.h index b067d71f..d1f42b9f 100644 --- a/src/popsift/s_desc_norm_l2.h +++ b/src/popsift/s_desc_norm_l2.h @@ -58,19 +58,41 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool + descr.y * descr.y + descr.z * descr.z + descr.w * descr.w; + // 32-lane reduction over threadIdx.x. The normalize block is (32,32): each + // threadIdx.y row is one descriptor. On a 64-lane wavefront two rows share a + // wavefront, so the reduction and the lane-0 broadcast must stay inside a + // width-32 sub-group, else rows cross-contaminate and the descriptor norm is + // wrong (NaN). CUDA: width 32 is the whole warp, unchanged. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + norm += popsift::shuffle_down( norm, 16, 32 ); + norm += popsift::shuffle_down( norm, 8, 32 ); + norm += popsift::shuffle_down( norm, 4, 32 ); + norm += popsift::shuffle_down( norm, 2, 32 ); + norm += popsift::shuffle_down( norm, 1, 32 ); +#else norm += popsift::shuffle_down( norm, 16 ); norm += popsift::shuffle_down( norm, 8 ); norm += popsift::shuffle_down( norm, 4 ); norm += popsift::shuffle_down( norm, 2 ); norm += popsift::shuffle_down( norm, 1 ); +#endif if( threadIdx.x == 0 ) { // compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0 norm = __frsqrt_rn( norm ); +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // A fully flat gradient window yields an all-zero descriptor; __frsqrt_rn(0) + // is +inf, and 0*inf below is NaN. Keep such degenerate descriptors all-zero. + if( ! isfinite( norm ) ) norm = 0.0f; +#endif } // spread the inverted norm from thread 0 to all threads in the warp +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + norm = popsift::shuffle( norm, 0, 32 ); +#else norm = popsift::shuffle( norm, 0 ); +#endif // quasi-normalize all 128 floats descr.x = min( descr.x*norm, 0.2f ); @@ -86,18 +108,33 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool + descr.y * descr.y + descr.z * descr.z + descr.w * descr.w; +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + norm += popsift::shuffle_down( norm, 16, 32 ); + norm += popsift::shuffle_down( norm, 8, 32 ); + norm += popsift::shuffle_down( norm, 4, 32 ); + norm += popsift::shuffle_down( norm, 2, 32 ); + norm += popsift::shuffle_down( norm, 1, 32 ); +#else norm += popsift::shuffle_down( norm, 16 ); norm += popsift::shuffle_down( norm, 8 ); norm += popsift::shuffle_down( norm, 4 ); norm += popsift::shuffle_down( norm, 2 ); norm += popsift::shuffle_down( norm, 1 ); +#endif if( threadIdx.x == 0 ) { norm = __frsqrt_rn( norm ); // inverse square root +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + if( ! isfinite( norm ) ) norm = 0.0f; // see above: avoid 0*inf NaN +#endif norm = scalbnf( norm, d_consts.norm_multi ); } +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + norm = popsift::shuffle( norm, 0, 32 ); +#else norm = popsift::shuffle( norm, 0 ); +#endif descr.x = descr.x * norm; descr.y = descr.y * norm; diff --git a/src/popsift/s_desc_norm_rs.h b/src/popsift/s_desc_norm_rs.h index 3ab5b1fc..fd3f7100 100644 --- a/src/popsift/s_desc_norm_rs.h +++ b/src/popsift/s_desc_norm_rs.h @@ -48,6 +48,18 @@ void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, bool float sum = descr.x + descr.y + descr.z + descr.w; + // 32-lane reduction over threadIdx.x; confine to a width-32 sub-group on a + // 64-lane wavefront (normalize block is (32,32), one descriptor per row). + // CUDA unchanged. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + sum += popsift::shuffle_down( sum, 16, 32 ); + sum += popsift::shuffle_down( sum, 8, 32 ); + sum += popsift::shuffle_down( sum, 4, 32 ); + sum += popsift::shuffle_down( sum, 2, 32 ); + sum += popsift::shuffle_down( sum, 1, 32 ); + + sum = popsift::shuffle( sum, 0, 32 ); +#else sum += popsift::shuffle_down( sum, 16 ); sum += popsift::shuffle_down( sum, 8 ); sum += popsift::shuffle_down( sum, 4 ); @@ -55,7 +67,28 @@ void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, bool sum += popsift::shuffle_down( sum, 1 ); sum = popsift::shuffle( sum, 0 ); +#endif +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // RootSift takes sqrt(bin/sum). The fmaxf(.,0) below clamps a bin that came out + // slightly negative from the descriptor accumulation's round-toward-+inf + // intrinsics (mapped to round-to-nearest in cuda_to_hip.h; a negative bin is + // unphysical). The divisor is gated at a small threshold so a degenerate + // near-zero sum is treated as an all-zero descriptor: an all-flat window + // (sum==0) normalizes to 0, and a tiny subnormal sum can no longer make 1/sum + // overflow to +inf (which would normalize a positive bin to +inf). CUDA, where + // the directed-rounding intrinsics exist and bins stay non-negative, is unchanged. + const float inv = ( sum > 1e-20f ) ? __fdividef( 1.0f, sum ) : 0.0f; + float val; + val = scalbnf( __fsqrt_rn( fmaxf( descr.x * inv, 0.0f ) ), d_consts.norm_multi ); + descr.x = val; + val = scalbnf( __fsqrt_rn( fmaxf( descr.y * inv, 0.0f ) ), d_consts.norm_multi ); + descr.y = val; + val = scalbnf( __fsqrt_rn( fmaxf( descr.z * inv, 0.0f ) ), d_consts.norm_multi ); + descr.z = val; + val = scalbnf( __fsqrt_rn( fmaxf( descr.w * inv, 0.0f ) ), d_consts.norm_multi ); + descr.w = val; +#else float val; val = scalbnf( __fsqrt_rn( __fdividef( descr.x, sum ) ), d_consts.norm_multi ); @@ -69,6 +102,7 @@ void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, bool val = scalbnf( __fsqrt_rn( __fdividef( descr.w, sum ) ), d_consts.norm_multi ); descr.w = val; +#endif if( ! ignoreme ) { float4* out4 = (float4*)dst_desc; diff --git a/src/popsift/s_desc_notile.cu b/src/popsift/s_desc_notile.cu index 9ba8a927..f6fe7789 100644 --- a/src/popsift/s_desc_notile.cu +++ b/src/popsift/s_desc_notile.cu @@ -33,7 +33,7 @@ void ext_desc_notile_sub( const float x, const float y, const int level, const float cos_t, const float sin_t, const float SBP, const Extremum* ext, float* __restrict__ features, - cudaTextureObject_t texLinear ) + LayeredReadTex texLinear ) { float dpt[8] = { 0 }; @@ -97,7 +97,7 @@ __global__ // no -- __launch_bounds__(128) // 63/thread // no -- no launch bound // 64/thread/thread void ext_desc_notile( const int octave, - cudaTextureObject_t texLinear ) + LayeredReadTex texLinear ) { const int num = dct.ori_ct[octave]; @@ -149,7 +149,10 @@ bool start_ext_desc_notile( int octave, Octave& oct_obj ) ext_desc_notile <<>> ( octave, - oct_obj.getDataTexLinear( ).tex ); + POPSIFT_LAYERED_SRC( oct_obj.getDataTexLinear( ).tex, + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ) ); cudaDeviceSynchronize(); cudaError_t err = cudaGetLastError( ); POP_CUDA_FATAL_TEST(err, "cudaGetLastError failed: "); diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu index 5fc4df13..6509d875 100644 --- a/src/popsift/s_desc_vlfeat.cu +++ b/src/popsift/s_desc_vlfeat.cu @@ -24,7 +24,7 @@ __device__ static inline void ext_desc_vlfeat_sub( const float ang, const Extremum* ext, float* __restrict__ features, - cudaTextureObject_t layer_tex, + LayeredReadTex layer_tex, const int width, const int height ) { @@ -160,7 +160,7 @@ void ext_desc_vlfeat_sub( const float ang, } } -__global__ void ext_desc_vlfeat( int octave, cudaTextureObject_t layer_tex, int w, int h) +__global__ void ext_desc_vlfeat( int octave, LayeredReadTex layer_tex, int w, int h) { const int o_offset = dct.ori_ps[octave] + blockIdx.x; Descriptor* desc = &dbuf.desc [o_offset]; @@ -201,7 +201,10 @@ bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj ) ext_desc_vlfeat <<>> ( octave, - oct_obj.getDataTexPoint( ), + POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ), + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), oct_obj.getWidth(), oct_obj.getHeight() ); diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu index e1cf6738..43ef2bbb 100644 --- a/src/popsift/s_extrema.cu +++ b/src/popsift/s_extrema.cu @@ -22,24 +22,45 @@ template __device__ static inline uint32_t extrema_count( unsigned int indicator, int* extrema_counter ) { - uint32_t mask = popsift::ballot( indicator ); // bitfield of warps with results +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // warpSize-generic (wave32 and wave64): one ballot per wavefront, the leader + // (lane 0) does a single atomicAdd of the set-bit count, and each thread's write + // slot is that base plus an exclusive prefix of the set bits below its lane. The + // lane is the block's linear thread index modulo warpSize, so on wave64 two + // blockDim=(32,HEIGHT) rows share a 64-lane wavefront and on wave32 each row is + // its own 32-lane wavefront; either way every row feeds the same octave counter. + // __ballot is 64-bit (unsigned long long); on wave32 the upper 32 bits are 0. + const int lane = ( threadIdx.y * blockDim.x + threadIdx.x ) % warpSize; + const unsigned long long ballot = __ballot( indicator ); + // lane is in [0, warpSize-1] (<= 63), so 1ull< -__device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, +__device__ inline bool find_extrema_in_dog_sub(LayeredReadTex dog, int debug_octave, int width, int height, @@ -504,7 +525,7 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog, template __global__ -void find_extrema_in_dog( cudaTextureObject_t dog, +void find_extrema_in_dog( LayeredReadTex dog, int octave, int width, int height, @@ -584,7 +605,10 @@ void Pyramid::find_extrema( const Config& conf ) case Config::VLFeat : find_extrema_in_dog <<>> - ( oct_obj.getDogTexturePoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getDogTexturePoint( ), + oct_obj.getDogSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), octave, cols, rows, @@ -599,7 +623,10 @@ void Pyramid::find_extrema( const Config& conf ) case Config::OpenCV : find_extrema_in_dog <<>> - ( oct_obj.getDogTexturePoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getDogTexturePoint( ), + oct_obj.getDogSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), octave, cols, rows, @@ -614,7 +641,10 @@ void Pyramid::find_extrema( const Config& conf ) default : find_extrema_in_dog <<>> - ( oct_obj.getDogTexturePoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getDogTexturePoint( ), + oct_obj.getDogSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), octave, cols, rows, diff --git a/src/popsift/s_filtergrid.cu b/src/popsift/s_filtergrid.cu index 9e35d8be..00d1b9eb 100644 --- a/src/popsift/s_filtergrid.cu +++ b/src/popsift/s_filtergrid.cu @@ -24,6 +24,15 @@ #include #include +// The stream-bound parallel execution policy lives in thrust::cuda on NVIDIA +// and thrust::hip on rocThrust. Fully qualified from the global namespace to +// avoid colliding with popsift::cuda (debug_macros.h) inside namespace popsift. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) +#define POPSIFT_THRUST_PAR ::thrust::hip::par +#else +#define POPSIFT_THRUST_PAR ::thrust::cuda::par +#endif + namespace popsift { @@ -129,12 +138,12 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total ) cudaStream_t oct_str = _octaves[o].getStream(); // fill a continuous device array with octave of all initial extrema - thrust::fill( thrust::cuda::par.on(oct_str), + thrust::fill( POPSIFT_THRUST_PAR.on(oct_str), octave_index.begin() + sum, octave_index.begin() + sum + ocount, o ); // fill a continuous device array with index within octave of all initial extrema - thrust::sequence( thrust::cuda::par.on(oct_str), + thrust::sequence( POPSIFT_THRUST_PAR.on(oct_str), iext_index.begin() + sum, iext_index.begin() + sum + ocount ); sum += ocount; diff --git a/src/popsift/s_gradiant.h b/src/popsift/s_gradiant.h index bfeadbd3..1a3b102f 100644 --- a/src/popsift/s_gradiant.h +++ b/src/popsift/s_gradiant.h @@ -57,7 +57,7 @@ void get_gradiant( float& grad, float& theta, const int x, const int y, - cudaTextureObject_t layer, + LayeredReadTex layer, const int level ) { float dx = readTex( layer, x+1.0f, y, level ) @@ -77,7 +77,7 @@ void get_gradiant32( float& grad, float& theta, const int x, const int y, - cudaTextureObject_t layer, + LayeredReadTex layer, const int level ) { const int idx = threadIdx.x; @@ -106,7 +106,7 @@ void get_gradiant( float& grad, float y, float cos_t, float sin_t, - cudaTextureObject_t texLinear, + LayeredReadTex texLinear, int level ) { float dx = readTex( texLinear, x+cos_t, y+sin_t, level ) diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu index 1f75229f..30b9b53f 100644 --- a/src/popsift/s_orientation.cu +++ b/src/popsift/s_orientation.cu @@ -68,7 +68,7 @@ inline static float smoothe( const float* const src, const int bin ) __global__ void ori_par( const int octave, const int ext_ct_prefix_sum, - cudaTextureObject_t layer, + LayeredReadTex layer, const int w, const int h ) { @@ -114,7 +114,11 @@ void ori_par( const int octave, int loops = wx * hy; __syncthreads(); - for( int i = threadIdx.x; popsift::any(i < loops); i += blockDim.x ) + // ori_par runs as a 32-thread block (one logical warp); restrict the loop + // guard to this thread's own 32-lane group so a 64-lane wavefront does not + // poll a second, unrelated row. Group 0 == the only row here; on CUDA this + // is the plain whole-warp any(). + for( int i = threadIdx.x; popsift::any_group(i < loops, 0); i += blockDim.x ) { if( i < loops ) { int yy = i / wx + ymin; @@ -189,7 +193,7 @@ void ori_par( const int octave, // sub-cell refinement of the histogram cell index, yielding the angle // not necessary to initialize, every cell is computed - for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) { + for( int bin = threadIdx.x; popsift::any_group( bin < ORI_NBINS, 0 ); bin += blockDim.x ) { const int prev = bin == 0 ? ORI_NBINS-1 : bin-1; const int next = bin == ORI_NBINS-1 ? 0 : bin+1; @@ -240,7 +244,7 @@ void ori_par( const int octave, } } - int angles = __popc( popsift::ballot( written ) ); + int angles = __popc( popsift::ballot_group( written, 0 ) ); if( threadIdx.x == 0 ) { ext->xpos = iext->xpos; ext->ypos = iext->ypos; @@ -407,7 +411,10 @@ void Pyramid::orientation( const Config& conf ) <<>> ( octave, hct.ext_ps[octave], - oct_obj.getDataTexPoint( ), + POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ), + oct_obj.getDataSurface( ), + oct_obj.getWidth( ), + oct_obj.getHeight( ) ), oct_obj.getWidth( ), oct_obj.getHeight( ) ); POP_SYNC_CHK; diff --git a/src/popsift/s_pyramid_build.cu b/src/popsift/s_pyramid_build.cu index 7ecf6ab5..629f633d 100755 --- a/src/popsift/s_pyramid_build.cu +++ b/src/popsift/s_pyramid_build.cu @@ -30,7 +30,7 @@ namespace popsift { namespace gauss { __global__ -void get_by_2_pick_every_second( cudaTextureObject_t src_data, +void get_by_2_pick_every_second( LayeredReadTex src_data, const int src_w, const int src_h, const int src_level, @@ -54,7 +54,7 @@ void get_by_2_pick_every_second( cudaTextureObject_t src_data, __global__ -void make_dog( cudaTextureObject_t src_data, +void make_dog( LayeredReadTex src_data, cudaSurfaceObject_t dog_data, const int w, const int h, @@ -198,7 +198,10 @@ inline void Pyramid::downscale_from_prev_octave( int octave, cudaStream_t stream gauss::get_by_2_pick_every_second <<>> - ( prev_oct_obj.getDataTexPoint( ), + ( POPSIFT_LAYERED_SRC( prev_oct_obj.getDataTexPoint( ), + prev_oct_obj.getDataSurface( ), + prev_oct_obj.getWidth(), + prev_oct_obj.getHeight() ), prev_oct_obj.getWidth(), prev_oct_obj.getHeight(), _levels-PREV_LEVEL, @@ -228,7 +231,10 @@ inline void Pyramid::horiz_from_prev_level( int octave, int level, cudaStream_t gauss::absoluteSourceInterpolated::horiz <<>> - ( oct_obj.getDataTexLinear( ).tex, + ( POPSIFT_LAYERED_SRC( oct_obj.getDataTexLinear( ).tex, + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), oct_obj.getIntermediateSurface( ), level ); } @@ -244,7 +250,10 @@ inline void Pyramid::horiz_from_prev_level( int octave, int level, cudaStream_t gauss::absoluteSource::horiz <<>> - ( oct_obj.getDataTexPoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ), + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), oct_obj.getIntermediateSurface( ), level ); } @@ -282,7 +291,9 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea gauss::absoluteSourceInterpolated::vert <<>> - ( oct_obj.getIntermDataTexLinear( ).tex, + ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexLinear( ).tex, + oct_obj.getIntermediateSurface( ), + width, height ), oct_obj.getDataSurface( ), level ); } @@ -296,7 +307,9 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea gauss::absoluteSourceInterpolated::vert_abs0 <<>> - ( oct_obj.getIntermDataTexLinear( ).tex, + ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexLinear( ).tex, + oct_obj.getIntermediateSurface( ), + width, height ), oct_obj.getDataSurface( ), level ); } @@ -310,7 +323,9 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea gauss::absoluteSource::vert <<>> - ( oct_obj.getIntermDataTexPoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexPoint( ), + oct_obj.getIntermediateSurface( ), + width, height ), oct_obj.getDataSurface( ), level ); } @@ -324,7 +339,9 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea gauss::absoluteSource::vert_abs0 <<>> - ( oct_obj.getIntermDataTexPoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexPoint( ), + oct_obj.getIntermediateSurface( ), + width, height ), oct_obj.getDataSurface( ), level ); } @@ -360,7 +377,9 @@ inline void Pyramid::vert_all_from_interm( int octave, int start_level, int max_ gauss::absoluteSourceInterpolated::vert_all_abs0 <<>> - ( oct_obj.getIntermDataTexLinear( ).tex, + ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexLinear( ).tex, + oct_obj.getIntermediateSurface( ), + width, height ), oct_obj.getDataSurface( ), start_level, max_level ); @@ -375,7 +394,9 @@ inline void Pyramid::vert_all_from_interm( int octave, int start_level, int max_ gauss::absoluteSource::vert_all_abs0 <<>> - ( oct_obj.getIntermDataTexPoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexPoint( ), + oct_obj.getIntermediateSurface( ), + width, height ), oct_obj.getDataSurface( ), start_level, max_level ); @@ -408,7 +429,10 @@ inline void Pyramid::dogs_from_blurred( int octave, int max_level, cudaStream_t gauss::make_dog <<>> - ( oct_obj.getDataTexPoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ), + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), oct_obj.getDogSurface( ), oct_obj.getWidth(), oct_obj.getHeight(), diff --git a/src/popsift/s_pyramid_build_aa.cu b/src/popsift/s_pyramid_build_aa.cu index c026a8b7..1a6edff3 100755 --- a/src/popsift/s_pyramid_build_aa.cu +++ b/src/popsift/s_pyramid_build_aa.cu @@ -14,7 +14,7 @@ namespace popsift { namespace gauss { namespace absoluteSource { -__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void horiz(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { const int src_level = dst_level - 1; const int span = d_gauss.inc.span[dst_level]; @@ -33,6 +33,26 @@ __global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t g = filter[span]; out += ( A + B ) * g; + // Horizontal Gauss tap exchange via warp shuffles. The block is (32,blockDim.y): + // each threadIdx.y row is an independent 32-lane group convolving one image + // row. On a 64-lane wavefront two rows share a wavefront, so the shuffles must + // be confined to a width-32 sub-group (threadIdx.x is already the in-row lane + // id since blockDim.x==32) or a lane would pull a neighbour from the wrong + // image row and corrupt the pyramid. CUDA: width 32 is the whole warp. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + int shiftval = 0; + for( int offset=span-1; offset>0; offset-- ) { + shiftval += 1; + const float D1 = popsift::shuffle_down( A, shiftval, 32 ); + const float D2 = popsift::shuffle_up ( C, span - shiftval, 32 ); + const float D = threadIdx.x < (32 - shiftval) ? D1 : D2; + const float E1 = popsift::shuffle_up ( B, shiftval, 32 ); + const float E2 = popsift::shuffle_down( C, span - shiftval, 32 ); + const float E = threadIdx.x > shiftval ? E1 : E2; + g = filter[offset]; + out += ( D + E ) * g; + } +#else int shiftval = 0; for( int offset=span-1; offset>0; offset-- ) { shiftval += 1; @@ -45,11 +65,12 @@ __global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t g = filter[offset]; out += ( D + E ) * g; } +#endif surf2DLayeredwrite( out, dst_data, off_x*4, off_y, dst_level, cudaBoundaryModeZero ); } -__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void vert(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { const int span = d_gauss.inc.span[dst_level]; const float* filter = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN]; @@ -85,7 +106,7 @@ __global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero ); } -__global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void vert_abs0(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level) { const int span = d_gauss.abs_o0.span[dst_level]; const float* filter = &d_gauss.abs_o0.filter[dst_level*GAUSS_ALIGN]; @@ -121,7 +142,7 @@ __global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObje surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero ); } -__global__ void vert_all_abs0(cudaTextureObject_t src_point_texture, +__global__ void vert_all_abs0(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int start_level, int max_level) diff --git a/src/popsift/s_pyramid_build_aa.h b/src/popsift/s_pyramid_build_aa.h index 4d3423cf..85664f20 100755 --- a/src/popsift/s_pyramid_build_aa.h +++ b/src/popsift/s_pyramid_build_aa.h @@ -5,19 +5,20 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#include "common/assist.h" #include "common/plane_2d.h" namespace popsift { namespace gauss { namespace absoluteSource { -__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); +__global__ void horiz(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); -__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); +__global__ void vert(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); -__global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); +__global__ void vert_abs0(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level); -__global__ void vert_all_abs0(cudaTextureObject_t src_point_texture, +__global__ void vert_all_abs0(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int start_level, int max_level); diff --git a/src/popsift/s_pyramid_build_ai.cu b/src/popsift/s_pyramid_build_ai.cu index c16d636e..b35ff8d2 100755 --- a/src/popsift/s_pyramid_build_ai.cu +++ b/src/popsift/s_pyramid_build_ai.cu @@ -14,7 +14,7 @@ namespace popsift { namespace gauss { namespace absoluteSourceInterpolated { -__global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void horiz(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level) { const int src_level = dst_level - 1; const int span = d_gauss.inc.i_span[dst_level]; @@ -40,7 +40,7 @@ __global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t ds surf2DLayeredwrite( out, dst_data, off_x*4, blockIdx.y, dst_level, cudaBoundaryModeZero ); } -__global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void vert(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level) { const int span = d_gauss.inc.i_span[dst_level]; const float* filter = &d_gauss.inc.i_filter[dst_level*GAUSS_ALIGN]; @@ -68,7 +68,7 @@ __global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero ); } -__global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level) +__global__ void vert_abs0(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level) { const int span = d_gauss.abs_o0.i_span[dst_level]; const float* filter = &d_gauss.abs_o0.i_filter[dst_level*GAUSS_ALIGN]; @@ -96,7 +96,7 @@ __global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_ surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero ); } -__global__ void vert_all_abs0(cudaTextureObject_t src_linear_tex, +__global__ void vert_all_abs0(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int start_level, int max_level) diff --git a/src/popsift/s_pyramid_build_ai.h b/src/popsift/s_pyramid_build_ai.h index d3431fe7..7985a787 100755 --- a/src/popsift/s_pyramid_build_ai.h +++ b/src/popsift/s_pyramid_build_ai.h @@ -5,19 +5,20 @@ * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#include "common/assist.h" #include "common/plane_2d.h" namespace popsift { namespace gauss { namespace absoluteSourceInterpolated { -__global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level); +__global__ void horiz(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level); -__global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level); +__global__ void vert(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level); -__global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level); +__global__ void vert_abs0(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level); -__global__ void vert_all_abs0(cudaTextureObject_t src_linear_tex, +__global__ void vert_all_abs0(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int start_level, int max_level); diff --git a/src/popsift/s_pyramid_fixed.cu b/src/popsift/s_pyramid_fixed.cu index 9e3d52aa..c9af8be2 100755 --- a/src/popsift/s_pyramid_fixed.cu +++ b/src/popsift/s_pyramid_fixed.cu @@ -30,7 +30,20 @@ inline float octave_fixed_horiz( float fval, const float* filter ) * input fval of thread N is extracted from image index N-4 * output fval of thread N should be filtered sum from N-4 to N+4 */ + // Horizontal fixed-span Gauss via warp shuffles. block.x==32 and the block + // packs multiple rows (threadIdx.y/z); on a 64-lane wavefront confine the + // shuffles to a width-32 sub-group so a lane does not pull a neighbour from + // another row. CUDA: width 32 is the whole warp, unchanged. float out = fval * filter[0]; +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + #pragma unroll + for( int i=1; i<=SHIFT; i++ ) { + float val = popsift::shuffle_up( fval, i, 32 ) + popsift::shuffle_down( fval, i, 32 ); + out += val * filter[i]; + } + + fval = popsift::shuffle_down( out, SHIFT, 32 ); +#else #pragma unroll for( int i=1; i<=SHIFT; i++ ) { float val = popsift::shuffle_up( fval, i ) + popsift::shuffle_down( fval, i ); @@ -38,6 +51,7 @@ inline float octave_fixed_horiz( float fval, const float* filter ) } fval = popsift::shuffle_down( out, SHIFT ); +#endif return fval; } @@ -47,7 +61,7 @@ namespace absoluteTexAddress { template __device__ -inline float octave_fixed_vert( cudaTextureObject_t src_data, int idx, int idy, int level, const float* filter ) +inline float octave_fixed_vert( LayeredReadTex src_data, int idx, int idy, int level, const float* filter ) { /* Input thread N takes as input the (idx,idy) position of the pixel that it * will eventually write (The 2*SHIFT rightmost threads will not write anything). @@ -68,7 +82,7 @@ inline float octave_fixed_vert( cudaTextureObject_t src_data, int idx, int idy, template __global__ -void octave_fixed( cudaTextureObject_t src_data, +void octave_fixed( LayeredReadTex src_data, cudaSurfaceObject_t dst_data, const int w, const int h, @@ -258,7 +272,10 @@ inline void make_octave_sub( const Config& conf, ImageBase* base, Octave& oct_ob gauss::fixedSpan::absoluteTexAddress::octave_fixed <<>> - ( oct_obj.getDataTexPoint( ), + ( POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ), + oct_obj.getDataSurface( ), + oct_obj.getWidth(), + oct_obj.getHeight() ), oct_obj.getDataSurface( ), oct_obj.getWidth(), oct_obj.getHeight(), diff --git a/src/popsift/sift_octave.cu b/src/popsift/sift_octave.cu index 4e05780d..4ec5c908 100755 --- a/src/popsift/sift_octave.cu +++ b/src/popsift/sift_octave.cu @@ -221,10 +221,21 @@ void Octave::alloc_data_planes() _data_ext.height = _h; _data_ext.depth = _levels; + // Observed on gfx90a (ROCm 7.2.1): layered images are incoherent across kernel + // launches (the layer dimension collapses to a single layer on read); filed as + // ROCm/clr#275 (the partial fix ROCm/rocm-systems#6683 covers only surf2DLayered). + // A non-layered 3D array with surf3D/tex3D access is coherent, so drop + // cudaArrayLayered on HIP. The blur levels are addressed by the z coordinate + // instead of the layer index. See cuda_to_hip.h and common/assist.h. CUDA keeps + // a real layered array. +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + err = cudaMalloc3DArray( &_data, &_data_desc, _data_ext, cudaArraySurfaceLoadStore ); +#else err = cudaMalloc3DArray( &_data, &_data_desc, _data_ext, cudaArrayLayered | cudaArraySurfaceLoadStore); +#endif POP_CUDA_FATAL_TEST(err, "Could not allocate Blur level array: "); } @@ -266,7 +277,14 @@ void Octave::alloc_data_tex() tex_desc.addressMode[1] = cudaAddressModeClamp; tex_desc.addressMode[2] = cudaAddressModeClamp; tex_desc.readMode = cudaReadModeElementType; // read as float - tex_desc.filterMode = cudaFilterModeLinear; // no interpolation + tex_desc.filterMode = cudaFilterModeLinear; // hardware bilinear (CUDA) +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // Observed on gfx90a (ROCm 7.2.1): hipCreateTextureObject rejects hardware + // linear filtering on element-read float arrays ("operation not supported"). + // Create the texture with point filtering and do bilinear interpolation in + // software in readTex(). Empirical on this device/ROCm; re-verify on RDNA. + tex_desc.filterMode = cudaFilterModePoint; +#endif err = cudaCreateTextureObject( &_data_tex_linear.tex, &res_desc, &tex_desc, nullptr ); POP_CUDA_FATAL_TEST(err, "Could not create Blur data point texture: "); @@ -300,10 +318,15 @@ void Octave::alloc_interm_array() _intm_ext.height = _h; _intm_ext.depth = _levels; + // Non-layered 3D array on HIP (see alloc_data_planes()). +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + err = cudaMalloc3DArray( &_intm, &_intm_desc, _intm_ext, cudaArraySurfaceLoadStore ); +#else err = cudaMalloc3DArray( &_intm, &_intm_desc, _intm_ext, cudaArrayLayered | cudaArraySurfaceLoadStore); +#endif POP_CUDA_FATAL_TEST(err, "Could not allocate Intermediate layered array: "); } @@ -339,7 +362,12 @@ void Octave::alloc_interm_tex() err = cudaCreateTextureObject( &_intm_tex_point, &res_desc, &tex_desc, nullptr ); POP_CUDA_FATAL_TEST(err, "Could not create Blur intermediate point texture: "); - tex_desc.filterMode = cudaFilterModeLinear; // no interpolation + tex_desc.filterMode = cudaFilterModeLinear; // hardware bilinear (CUDA) +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + // See alloc_data_tex(): HIP rejects linear filtering on element-read float + // arrays, so use point filtering here and interpolate in software in readTex(). + tex_desc.filterMode = cudaFilterModePoint; +#endif err = cudaCreateTextureObject( &_intm_tex_linear.tex, &res_desc, &tex_desc, nullptr ); POP_CUDA_FATAL_TEST(err, "Could not create Blur intermediate point texture: "); @@ -373,10 +401,15 @@ void Octave::alloc_dog_array() _dog_3d_ext.height = _h; _dog_3d_ext.depth = _levels - 1; + // Non-layered 3D array on HIP (see alloc_data_planes()). +#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__) + err = cudaMalloc3DArray(&_dog_3d, &_dog_3d_desc, _dog_3d_ext, cudaArraySurfaceLoadStore); +#else err = cudaMalloc3DArray(&_dog_3d, &_dog_3d_desc, _dog_3d_ext, cudaArrayLayered | cudaArraySurfaceLoadStore); +#endif POP_CUDA_FATAL_TEST(err, "Could not allocate 3D DoG array: "); } diff --git a/src/popsift/sift_octave.h b/src/popsift/sift_octave.h index 96ea7635..ead94c02 100755 --- a/src/popsift/sift_octave.h +++ b/src/popsift/sift_octave.h @@ -19,7 +19,11 @@ namespace popsift { struct LinearTexture { - cudaSurfaceObject_t tex; + // Holds a texture object (assigned from cudaCreateTextureObject and consumed + // by tex2DLayered). On CUDA texture and surface handles are both unsigned + // long long so the original cudaSurfaceObject_t typing compiled; HIP uses + // distinct pointer types, so the correct texture type must be used here. + cudaTextureObject_t tex; }; class Octave