alicevision · jeffdaily · Jun 3, 2026 · Jun 3, 2026 · Jun 8, 2026 · Jun 19, 2026
diff --git a/.gitignore b/.gitignore
@@ -47,3 +47,5 @@ oxford
 
 # Downloaded archives for tests.
 *.tgz
+
+build-hip/
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,26 +1,38 @@
 # First-order language CUDA requires at least CMake 3.18
 cmake_minimum_required(VERSION 3.24)
 
-# Default seletion of CUDA Compute Capabilities.
-# This must be called before project() or cmake sets it to the oldest non-deprecated CC
-# "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
-if(EXISTS "/etc/nv_tegra_release")
-  # The CC list for Tegras and Jetson will require manual updates
-  set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
-      CACHE
-      STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
-else()
-  # The CC list for discrete GPUs will require CMake updates
-  set(CMAKE_CUDA_ARCHITECTURES "all-major"
-      CACHE
-      STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+# USE_HIP routes the .cu translation units through the HIP toolchain for AMD
+# GPUs. Everything else keeps its plain
+# CUDA spelling; cuda_to_hip.h (force-included for HIP in src/CMakeLists.txt)
+# does the symbol translation. The NVIDIA path (USE_HIP=OFF) is unchanged.
+option(USE_HIP "Build with HIP for AMD GPUs instead of CUDA" OFF)
+
+if(NOT USE_HIP)
+  # Default seletion of CUDA Compute Capabilities.
+  # This must be called before project() or cmake sets it to the oldest non-deprecated CC
+  # "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
+  if(EXISTS "/etc/nv_tegra_release")
+    # The CC list for Tegras and Jetson will require manual updates
+    set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
+        CACHE
+        STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+  else()
+    # The CC list for discrete GPUs will require CMake updates
+    set(CMAKE_CUDA_ARCHITECTURES "all-major"
+        CACHE
+        STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+  endif()
 endif()
 
-project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA)
+if(USE_HIP)
+  project(PopSift VERSION 0.10.1 LANGUAGES CXX HIP)
+else()
+  project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA)
 
-# Policy to support CUDA as a first-order language for CMake.
-# Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
-cmake_policy(SET CMP0104 NEW)
+  # Policy to support CUDA as a first-order language for CMake.
+  # Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
+  cmake_policy(SET CMP0104 NEW)
+endif()
 
 # Set build path as a folder named as the platform (linux, windows, darwin...) plus the processor type
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
@@ -75,70 +87,94 @@ if(BUILD_SHARED_LIBS)
   # Auto-build dll exports on Windows
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
-  set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
+  if(NOT USE_HIP)
+    set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
+  endif()
 else()
   message(STATUS "BUILD_SHARED_LIBS OFF")
 
-  set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
+  if(NOT USE_HIP)
+    set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
+  endif()
 endif()
 
 # Require threads because of std::thread.
 find_package(Threads REQUIRED)
 
-###################
-#  CUDA
-###################
-include(CheckLanguage)
-check_language(CUDA)
-
-# Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
-# target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
-find_package(CUDAToolkit)
-
-message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
-set(CUDA_VERSION ${CUDAToolkit_VERSION})
-
-if(PopSift_ERRCHK_AFTER_KERNEL)
-  message(STATUS "Synchronizing and checking errors after every kernel call")
-  list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
-endif()
-
-# This may not be required any more.
-set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-
-# default stream per-thread implies that each host thread has one non-synchronizing 0-stream
-# currently, the code requires legacy mode
-list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
-# set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--default-stream;per-thread")
+if(USE_HIP)
+  ###################
+  #  HIP (AMD)
+  ###################
+  # rocThrust supplies the thrust::cuda::par compatibility namespace, so the
+  # Thrust code in s_filtergrid.cu compiles unchanged on ROCm.
+  find_package(rocthrust REQUIRED CONFIG)
+
+  set(PopSift_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+  set(CMAKE_HIP_STANDARD ${PopSift_CXX_STANDARD})
+  set(CMAKE_HIP_STANDARD_REQUIRED ON)
+  set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
+
+  # HIP has __shfl_down etc. unconditionally; assist.h selects the _sync
+  # spelling on this flag and cuda_to_hip.h maps _sync to the mask-free builtin.
+  set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
+else()
+  ###################
+  #  CUDA
+  ###################
+  include(CheckLanguage)
+  check_language(CUDA)
+
+  # Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
+  # target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
+  find_package(CUDAToolkit)
+
+  message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
+  set(CUDA_VERSION ${CUDAToolkit_VERSION})
+
+  if(PopSift_ERRCHK_AFTER_KERNEL)
+    message(STATUS "Synchronizing and checking errors after every kernel call")
+    list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
+  endif()
 
-if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
-  if(PopSift_NVCC_WARNINGS)
-    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
-    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
-    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
-    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
+  # This may not be required any more.
+  set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+  # default stream per-thread implies that each host thread has one non-synchronizing 0-stream
+  # currently, the code requires legacy mode
+  list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
+  # set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--default-stream;per-thread")
+
+  if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
+    if(PopSift_NVCC_WARNINGS)
+      list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
+      list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
+      list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
+      list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
+    endif()
   endif()
-endif()
 
-set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
-if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
-  set(PopSift_CXX_STANDARD 11)
-endif()
+  set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
+  if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
+    set(PopSift_CXX_STANDARD 11)
+  endif()
 
-if(NOT MSVC)
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
-      list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
-endif()
-set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
-set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+  if(NOT MSVC)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
+        list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
+  endif()
+  set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+  set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
+  set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
 
-if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
-  set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
-else()
-  set(PopSift_HAVE_SHFL_DOWN_SYNC   0)
+  if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
+    set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
+  else()
+    set(PopSift_HAVE_SHFL_DOWN_SYNC   0)
+  endif()
 endif()
 
 if(NOT PopSift_USE_GRID_FILTER)

diff --git a/README.md b/README.md
@@ -15,6 +15,8 @@ PopSift compiles and works with NVidia cards of compute capability >= 3.0 (inclu
 
 CUDA SDK 11 does no longer support compute capability 3.0. 3.5 is still supported with deprecation warnings.
 
+PopSift also runs on AMD GPUs through ROCm/HIP (validated on gfx90a and gfx1100, and on Windows gfx1151); see the `USE_HIP` build option below.
+
 ## Dependencies
 
 PopSift depends on:
@@ -46,6 +48,42 @@ Some build options are available:
 
 * `BUILD_SHARED_LIBS` (default: `ON`) controls the type of library to build (`ON` for shared libraries, `OFF` for static)
 
+* `USE_HIP` (default: `OFF`) builds for AMD GPUs through ROCm/HIP instead of CUDA (requires a ROCm installation); see "Building for AMD GPUs (ROCm/HIP)" below.
+
+### Building for AMD GPUs (ROCm/HIP)
+
+With a ROCm installation, configure with `-DUSE_HIP=ON` to build the GPU code through HIP instead of CUDA. The CUDA build path is unchanged when `USE_HIP=OFF` (the default).
+
+```shell
+cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a \
+  -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
+  -DCMAKE_PREFIX_PATH=/opt/rocm
+make
+```
+
+Point `-DCMAKE_PREFIX_PATH` at the ROCm install prefix (`/opt/rocm` for a default install) so CMake can locate the `hip` and `rocThrust` packages when ROCm is not already on the search path.
+
+Set the target architecture with `-DCMAKE_HIP_ARCHITECTURES` (e.g. `gfx90a`, `gfx1100`); pass a semicolon-separated list such as `"gfx90a;gfx1100"` to emit a fat binary.
+
+#### Windows (gfx1151 / TheRock ROCm)
+
+Tested with the ROCm wheels from [TheRock](https://github.com/ROCm/TheRock) (see its README for installing the Windows wheels). The steps below assume `<rocm>` is that install prefix and that `clang` from it is on `PATH`:
+
+```shell
+cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1151 ^
+  -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang -DCMAKE_HIP_COMPILER=clang ^
+  -DCMAKE_PREFIX_PATH=<rocm> -DPopSift_BUILD_EXAMPLES=OFF ^
+  -DCMAKE_LINKER_TYPE=LLDFIX ^
+  -DCMAKE_C_USING_LINKER_LLDFIX=-fuse-ld=lld ^
+  -DCMAKE_CXX_USING_LINKER_LLDFIX=-fuse-ld=lld ^
+  -DCMAKE_HIP_USING_LINKER_LLDFIX=-fuse-ld=lld
+cmake --build . --config Release
+```
+
+The `CMAKE_LINKER_TYPE=LLDFIX` block works around the `-fgpu-rdc` device link: CMake otherwise emits `-fuse-ld=lld-link`, which AMD clang rejects under `--hip-link`. At runtime, copy a matching `amdhip64` and `amd_comgr` from `<rocm>` next to the executable (a System32 Adrenalin-driver runtime can be device-library mismatched).
+
+On the gfx1151 APU, `maxTexture2DLayered` is smaller than on gfx90a, so the default upscale can overflow for large images; use native-resolution downsampling (`Config::setDownsampling(0)`) or cap the input size.
+
 ## Usage
 
 The main artifact created is `libpopsift`.

diff --git a/doc/sphinx/source/install/install.rst b/doc/sphinx/source/install/install.rst
@@ -33,6 +33,8 @@ Software
 
 The core library depends only on Cuda >= 7.0
 
+PopSift also builds for AMD GPUs through ROCm/HIP instead of CUDA, requiring ROCm 7.2 or newer; see the ``USE_HIP`` CMake option below.
+
 The library includes a few sample applications that show how to use the library.
 They require
 
@@ -173,6 +175,8 @@ CMake configuration can be controlled by changing the values of the following va
 
 * :code:`PopSift_BUILD_DOC:BOOL=OFF` to enable/disable building this documentation and the Doxygen one.
 
+* :code:`USE_HIP:BOOL=OFF` to build for AMD GPUs with ROCm/HIP instead of CUDA. Set the target GPU architecture with :code:`CMAKE_HIP_ARCHITECTURES` (for example :code:`gfx90a` for CDNA2 / MI200, or :code:`gfx1100` for RDNA3). Requires a ROCm installation (7.2 or newer) providing HIP.
+
 For example, if you do not want to build the applications, you have to pass :code:`-DPopSift_BUILD_EXAMPLES:BOOL=OFF` and so on.
 
 

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -41,14 +41,71 @@ add_library(popsift
         popsift/common/vec_macros.h
         popsift/common/clamp.h)
 
+if(USE_HIP)
+  # Collect the .cu translation units so the HIP build can mark them LANGUAGE HIP.
+  # Under USE_HIP, .cu would otherwise default to CUDA (a language not enabled in
+  # the HIP build), so the language must be set explicitly. This is only needed on
+  # the HIP path, so the collection lives inside the guard.
+  get_target_property(_popsift_sources popsift SOURCES)
+  set(_popsift_cu_sources "")
+  foreach(_src ${_popsift_sources})
+    if(_src MATCHES "\\.cu$")
+      list(APPEND _popsift_cu_sources "${_src}")
+    endif()
+  endforeach()
+  set_source_files_properties(${_popsift_cu_sources} PROPERTIES LANGUAGE HIP)
+endif()
+
 target_link_libraries(popsift
                       PUBLIC
-		      CUDA::cudart
 		      Threads::Threads)
+if(USE_HIP)
+  # rocThrust transitively links hip::device, whose INTERFACE_COMPILE_OPTIONS inject
+  # "-x hip" for COMPILE_LANGUAGE:CXX. Keep that device-compile interface PRIVATE so
+  # it does not leak "-x hip" onto plain-C++ consumers of popsift (the example apps).
+  # Consumers still need the HIP runtime to link and its headers to satisfy the
+  # public <cuda_runtime.h> include, so expose hip::host PUBLIC (link + headers, no
+  # "-x hip"). The library .cu sources are already LANGUAGE HIP, so they get the HIP
+  # compiler regardless of the link visibility.
+  target_link_libraries(popsift PRIVATE roc::rocthrust)
+  target_link_libraries(popsift PUBLIC hip::host)
+else()
+  target_link_libraries(popsift PUBLIC CUDA::cudart)
+endif()
 
 set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION})
 set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d")
-set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+if(USE_HIP)
+  # CMAKE_HIP_ARCHITECTURES is already resolved by project(... LANGUAGES CXX HIP)
+  # in the top-level CMakeLists: it auto-detects the host GPU arch, honors an
+  # explicit -DCMAKE_HIP_ARCHITECTURES, and errors on a host with no GPU. Pass
+  # -DCMAKE_HIP_ARCHITECTURES to override.
+  set_target_properties(popsift PROPERTIES
+    HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
+  # USE_HIP selects the HIP branch of cuda_to_hip.h. __HIP_PLATFORM_AMD__ is
+  # only defined once the HIP runtime header is included, so it cannot gate the
+  # include of that very header; the explicit define breaks the cycle.
+  target_compile_definitions(popsift PRIVATE USE_HIP)
+  # Force-include the compat header into every HIP translation unit so no
+  # per-file #include edits are needed (minimal footprint).
+  # -fgpu-rdc keeps device code relocatable so the cross-TU __device__/__constant__
+  # globals (d_consts, d_gauss, dct/dbuf/dobuf -- declared extern in headers,
+  # defined in one .cu, used in many) are resolved by the final --hip-link step.
+  # Without it each .o is device-linked standalone and those references are
+  # undefined. The matching -fgpu-rdc on the link rule (below) performs the
+  # device link.
+  target_compile_options(popsift PRIVATE
+    $<$<COMPILE_LANGUAGE:HIP>:-include${CMAKE_CURRENT_SOURCE_DIR}/popsift/cuda_to_hip.h>
+    $<$<COMPILE_LANGUAGE:HIP>:-fgpu-rdc>)
+  target_link_options(popsift PRIVATE -fgpu-rdc --hip-link)
+  # hip_compat/ supplies a <cuda_runtime.h> shim for the sources that include it
+  # directly; HIP build only. PRIVATE so the public popsift.h <cuda_runtime.h>
+  # include still resolves to the real CUDA header for downstream NVIDIA users.
+  target_include_directories(popsift PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/popsift/hip_compat)
+else()
+  set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+endif()
 if(MSVC)
         target_compile_options(popsift PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/Zc:preprocessor>)
 endif()

diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt
@@ -70,7 +70,24 @@ endif()
 
 add_executable(popsift-demo  main.cpp pgmread.cpp pgmread.h)
 
-set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
+if(USE_HIP)
+  # main.cpp/pgmread.cpp have NO device code; they consume popsift's host API only.
+  # popsift now links the HIP device-compile interface (roc::rocthrust -> hip::device,
+  # which injects "-x hip") PRIVATE and exposes only hip::host PUBLIC, so these
+  # sources compile as plain C++ (no "-x hip") and just link the HIP runtime. They
+  # still need the hip_compat <cuda_runtime.h> shim and the cuda_to_hip.h force-include
+  # because popsift's public headers (device_prop.h) #include <cuda_runtime.h>; the
+  # library keeps those PRIVATE, so the consumer adds them explicitly here. USE_HIP
+  # selects the HIP branch of cuda_to_hip.h. C++17 for the compat header.
+  set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 17)
+  target_compile_definitions(popsift-demo PRIVATE USE_HIP)
+  target_include_directories(popsift-demo PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat)
+  target_compile_options(popsift-demo PRIVATE
+    $<$<COMPILE_LANGUAGE:CXX>:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>)
+else()
+  set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
+endif()
 
 target_compile_options(popsift-demo PRIVATE ${PD_COMPILE_OPTIONS} )
 target_include_directories(popsift-demo PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS})
@@ -84,7 +101,17 @@ target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS})
 
 add_executable(popsift-match match.cpp pgmread.cpp pgmread.h)
 
-set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
+if(USE_HIP)
+  # See popsift-demo above: plain-C++ host consumer, no device code, no "-x hip".
+  set_property(TARGET popsift-match PROPERTY CXX_STANDARD 17)
+  target_compile_definitions(popsift-match PRIVATE USE_HIP)
+  target_include_directories(popsift-match PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat)
+  target_compile_options(popsift-match PRIVATE
+    $<$<COMPILE_LANGUAGE:CXX>:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>)
+else()
+  set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
+endif()
 
 target_compile_options(popsift-match PRIVATE ${PD_COMPILE_OPTIONS} )
 target_include_directories(popsift-match PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS})
Original file line number	Diff line number	Diff line change
Expand Up		@@ -47,3 +47,5 @@ oxford

		# Downloaded archives for tests.
		*.tgz

		build-hip/