diff --git a/.gitignore b/.gitignore
index c44a8393..77f2eee9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,3 +47,5 @@ oxford
 
 # Downloaded archives for tests.
 *.tgz
+
+build-hip/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index a6a7ad64..07bda8ac 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,26 +1,38 @@
 # First-order language CUDA requires at least CMake 3.18
 cmake_minimum_required(VERSION 3.24)
 
-# Default seletion of CUDA Compute Capabilities.
-# This must be called before project() or cmake sets it to the oldest non-deprecated CC
-# "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
-if(EXISTS "/etc/nv_tegra_release")
-  # The CC list for Tegras and Jetson will require manual updates
-  set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
-      CACHE
-      STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
-else()
-  # The CC list for discrete GPUs will require CMake updates
-  set(CMAKE_CUDA_ARCHITECTURES "all-major"
-      CACHE
-      STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+# USE_HIP routes the .cu translation units through the HIP toolchain for AMD
+# GPUs. Everything else keeps its plain
+# CUDA spelling; cuda_to_hip.h (force-included for HIP in src/CMakeLists.txt)
+# does the symbol translation. The NVIDIA path (USE_HIP=OFF) is unchanged.
+option(USE_HIP "Build with HIP for AMD GPUs instead of CUDA" OFF)
+
+if(NOT USE_HIP)
+  # Default seletion of CUDA Compute Capabilities.
+  # This must be called before project() or cmake sets it to the oldest non-deprecated CC
+  # "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
+  if(EXISTS "/etc/nv_tegra_release")
+    # The CC list for Tegras and Jetson will require manual updates
+    set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
+        CACHE
+        STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+  else()
+    # The CC list for discrete GPUs will require CMake updates
+    set(CMAKE_CUDA_ARCHITECTURES "all-major"
+        CACHE
+        STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
+  endif()
 endif()
 
-project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA)
+if(USE_HIP)
+  project(PopSift VERSION 0.10.1 LANGUAGES CXX HIP)
+else()
+  project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA)
 
-# Policy to support CUDA as a first-order language for CMake.
-# Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
-cmake_policy(SET CMP0104 NEW)
+  # Policy to support CUDA as a first-order language for CMake.
+  # Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
+  cmake_policy(SET CMP0104 NEW)
+endif()
 
 # Set build path as a folder named as the platform (linux, windows, darwin...) plus the processor type
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
@@ -75,70 +87,94 @@ if(BUILD_SHARED_LIBS)
   # Auto-build dll exports on Windows
   set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 
-  set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
+  if(NOT USE_HIP)
+    set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
+  endif()
 else()
   message(STATUS "BUILD_SHARED_LIBS OFF")
 
-  set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
+  if(NOT USE_HIP)
+    set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
+  endif()
 endif()
 
 # Require threads because of std::thread.
 find_package(Threads REQUIRED)
 
-###################
-#  CUDA
-###################
-include(CheckLanguage)
-check_language(CUDA)
-
-# Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
-# target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
-find_package(CUDAToolkit)
-
-message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
-set(CUDA_VERSION ${CUDAToolkit_VERSION})
-
-if(PopSift_ERRCHK_AFTER_KERNEL)
-  message(STATUS "Synchronizing and checking errors after every kernel call")
-  list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
-endif()
-
-# This may not be required any more.
-set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
-
-# default stream per-thread implies that each host thread has one non-synchronizing 0-stream
-# currently, the code requires legacy mode
-list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
-# set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--default-stream;per-thread")
+if(USE_HIP)
+  ###################
+  #  HIP (AMD)
+  ###################
+  # rocThrust supplies the thrust::cuda::par compatibility namespace, so the
+  # Thrust code in s_filtergrid.cu compiles unchanged on ROCm.
+  find_package(rocthrust REQUIRED CONFIG)
+
+  set(PopSift_CXX_STANDARD 17)
+  set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+  set(CMAKE_HIP_STANDARD ${PopSift_CXX_STANDARD})
+  set(CMAKE_HIP_STANDARD_REQUIRED ON)
+  set(CMAKE_HIP_SEPARABLE_COMPILATION ON)
+
+  # HIP has __shfl_down etc. unconditionally; assist.h selects the _sync
+  # spelling on this flag and cuda_to_hip.h maps _sync to the mask-free builtin.
+  set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
+else()
+  ###################
+  #  CUDA
+  ###################
+  include(CheckLanguage)
+  check_language(CUDA)
+
+  # Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
+  # target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
+  find_package(CUDAToolkit)
+
+  message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
+  set(CUDA_VERSION ${CUDAToolkit_VERSION})
+
+  if(PopSift_ERRCHK_AFTER_KERNEL)
+    message(STATUS "Synchronizing and checking errors after every kernel call")
+    list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
+  endif()
 
-if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
-  if(PopSift_NVCC_WARNINGS)
-    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
-    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
-    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
-    list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
+  # This may not be required any more.
+  set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
+
+  # default stream per-thread implies that each host thread has one non-synchronizing 0-stream
+  # currently, the code requires legacy mode
+  list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
+  # set(CUDA_NVCC_FLAGS         "${CUDA_NVCC_FLAGS};--default-stream;per-thread")
+
+  if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
+    if(PopSift_NVCC_WARNINGS)
+      list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
+      list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
+      list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
+      list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
+    endif()
   endif()
-endif()
 
-set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
-if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
-  set(PopSift_CXX_STANDARD 11)
-endif()
+  set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
+  if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
+    set(PopSift_CXX_STANDARD 11)
+  endif()
 
-if(NOT MSVC)
-      set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
-      list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
-endif()
-set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
-set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+  if(NOT MSVC)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
+        list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
+  endif()
+  set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
+  set(CMAKE_CXX_STANDARD_REQUIRED ON)
+  set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
+  set(CMAKE_CUDA_STANDARD_REQUIRED ON)
 
 
-if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
-  set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
-else()
-  set(PopSift_HAVE_SHFL_DOWN_SYNC   0)
+  if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
+    set(PopSift_HAVE_SHFL_DOWN_SYNC   1)
+  else()
+    set(PopSift_HAVE_SHFL_DOWN_SYNC   0)
+  endif()
 endif()
 
 if(NOT PopSift_USE_GRID_FILTER)
diff --git a/README.md b/README.md
index bcab5120..c368710e 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ PopSift compiles and works with NVidia cards of compute capability >= 3.0 (inclu
 
 CUDA SDK 11 does no longer support compute capability 3.0. 3.5 is still supported with deprecation warnings.
 
+PopSift also runs on AMD GPUs through ROCm/HIP (validated on gfx90a and gfx1100, and on Windows gfx1151); see the `USE_HIP` build option below.
+
 ## Dependencies
 
 PopSift depends on:
@@ -46,6 +48,42 @@ Some build options are available:
 
 * `BUILD_SHARED_LIBS` (default: `ON`) controls the type of library to build (`ON` for shared libraries, `OFF` for static)
 
+* `USE_HIP` (default: `OFF`) builds for AMD GPUs through ROCm/HIP instead of CUDA (requires a ROCm installation); see "Building for AMD GPUs (ROCm/HIP)" below.
+
+### Building for AMD GPUs (ROCm/HIP)
+
+With a ROCm installation, configure with `-DUSE_HIP=ON` to build the GPU code through HIP instead of CUDA. The CUDA build path is unchanged when `USE_HIP=OFF` (the default).
+
+```shell
+cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a \
+  -DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
+  -DCMAKE_PREFIX_PATH=/opt/rocm
+make
+```
+
+Point `-DCMAKE_PREFIX_PATH` at the ROCm install prefix (`/opt/rocm` for a default install) so CMake can locate the `hip` and `rocThrust` packages when ROCm is not already on the search path.
+
+Set the target architecture with `-DCMAKE_HIP_ARCHITECTURES` (e.g. `gfx90a`, `gfx1100`); pass a semicolon-separated list such as `"gfx90a;gfx1100"` to emit a fat binary.
+
+#### Windows (gfx1151 / TheRock ROCm)
+
+Tested with the ROCm wheels from [TheRock](https://github.com/ROCm/TheRock) (see its README for installing the Windows wheels). The steps below assume `<rocm>` is that install prefix and that `clang` from it is on `PATH`:
+
+```shell
+cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1151 ^
+  -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang -DCMAKE_HIP_COMPILER=clang ^
+  -DCMAKE_PREFIX_PATH=<rocm> -DPopSift_BUILD_EXAMPLES=OFF ^
+  -DCMAKE_LINKER_TYPE=LLDFIX ^
+  -DCMAKE_C_USING_LINKER_LLDFIX=-fuse-ld=lld ^
+  -DCMAKE_CXX_USING_LINKER_LLDFIX=-fuse-ld=lld ^
+  -DCMAKE_HIP_USING_LINKER_LLDFIX=-fuse-ld=lld
+cmake --build . --config Release
+```
+
+The `CMAKE_LINKER_TYPE=LLDFIX` block works around the `-fgpu-rdc` device link: CMake otherwise emits `-fuse-ld=lld-link`, which AMD clang rejects under `--hip-link`. At runtime, copy a matching `amdhip64` and `amd_comgr` from `<rocm>` next to the executable (a System32 Adrenalin-driver runtime can be device-library mismatched).
+
+On the gfx1151 APU, `maxTexture2DLayered` is smaller than on gfx90a, so the default upscale can overflow for large images; use native-resolution downsampling (`Config::setDownsampling(0)`) or cap the input size.
+
 ## Usage
 
 The main artifact created is `libpopsift`.
diff --git a/doc/sphinx/source/install/install.rst b/doc/sphinx/source/install/install.rst
index da6d1dde..0f2d2dc7 100644
--- a/doc/sphinx/source/install/install.rst
+++ b/doc/sphinx/source/install/install.rst
@@ -33,6 +33,8 @@ Software
 
 The core library depends only on Cuda >= 7.0
 
+PopSift also builds for AMD GPUs through ROCm/HIP instead of CUDA, requiring ROCm 7.2 or newer; see the ``USE_HIP`` CMake option below.
+
 The library includes a few sample applications that show how to use the library.
 They require
 
@@ -173,6 +175,8 @@ CMake configuration can be controlled by changing the values of the following va
 
 * :code:`PopSift_BUILD_DOC:BOOL=OFF` to enable/disable building this documentation and the Doxygen one.
 
+* :code:`USE_HIP:BOOL=OFF` to build for AMD GPUs with ROCm/HIP instead of CUDA. Set the target GPU architecture with :code:`CMAKE_HIP_ARCHITECTURES` (for example :code:`gfx90a` for CDNA2 / MI200, or :code:`gfx1100` for RDNA3). Requires a ROCm installation (7.2 or newer) providing HIP.
+
 For example, if you do not want to build the applications, you have to pass :code:`-DPopSift_BUILD_EXAMPLES:BOOL=OFF` and so on.
 
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 11cb5a6a..5bf4620f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -41,14 +41,71 @@ add_library(popsift
         popsift/common/vec_macros.h
         popsift/common/clamp.h)
 
+if(USE_HIP)
+  # Collect the .cu translation units so the HIP build can mark them LANGUAGE HIP.
+  # Under USE_HIP, .cu would otherwise default to CUDA (a language not enabled in
+  # the HIP build), so the language must be set explicitly. This is only needed on
+  # the HIP path, so the collection lives inside the guard.
+  get_target_property(_popsift_sources popsift SOURCES)
+  set(_popsift_cu_sources "")
+  foreach(_src ${_popsift_sources})
+    if(_src MATCHES "\\.cu$")
+      list(APPEND _popsift_cu_sources "${_src}")
+    endif()
+  endforeach()
+  set_source_files_properties(${_popsift_cu_sources} PROPERTIES LANGUAGE HIP)
+endif()
+
 target_link_libraries(popsift
                       PUBLIC
-		      CUDA::cudart
 		      Threads::Threads)
+if(USE_HIP)
+  # rocThrust transitively links hip::device, whose INTERFACE_COMPILE_OPTIONS inject
+  # "-x hip" for COMPILE_LANGUAGE:CXX. Keep that device-compile interface PRIVATE so
+  # it does not leak "-x hip" onto plain-C++ consumers of popsift (the example apps).
+  # Consumers still need the HIP runtime to link and its headers to satisfy the
+  # public <cuda_runtime.h> include, so expose hip::host PUBLIC (link + headers, no
+  # "-x hip"). The library .cu sources are already LANGUAGE HIP, so they get the HIP
+  # compiler regardless of the link visibility.
+  target_link_libraries(popsift PRIVATE roc::rocthrust)
+  target_link_libraries(popsift PUBLIC hip::host)
+else()
+  target_link_libraries(popsift PUBLIC CUDA::cudart)
+endif()
 
 set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION})
 set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d")
-set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+if(USE_HIP)
+  # CMAKE_HIP_ARCHITECTURES is already resolved by project(... LANGUAGES CXX HIP)
+  # in the top-level CMakeLists: it auto-detects the host GPU arch, honors an
+  # explicit -DCMAKE_HIP_ARCHITECTURES, and errors on a host with no GPU. Pass
+  # -DCMAKE_HIP_ARCHITECTURES to override.
+  set_target_properties(popsift PROPERTIES
+    HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
+  # USE_HIP selects the HIP branch of cuda_to_hip.h. __HIP_PLATFORM_AMD__ is
+  # only defined once the HIP runtime header is included, so it cannot gate the
+  # include of that very header; the explicit define breaks the cycle.
+  target_compile_definitions(popsift PRIVATE USE_HIP)
+  # Force-include the compat header into every HIP translation unit so no
+  # per-file #include edits are needed (minimal footprint).
+  # -fgpu-rdc keeps device code relocatable so the cross-TU __device__/__constant__
+  # globals (d_consts, d_gauss, dct/dbuf/dobuf -- declared extern in headers,
+  # defined in one .cu, used in many) are resolved by the final --hip-link step.
+  # Without it each .o is device-linked standalone and those references are
+  # undefined. The matching -fgpu-rdc on the link rule (below) performs the
+  # device link.
+  target_compile_options(popsift PRIVATE
+    $<$<COMPILE_LANGUAGE:HIP>:-include${CMAKE_CURRENT_SOURCE_DIR}/popsift/cuda_to_hip.h>
+    $<$<COMPILE_LANGUAGE:HIP>:-fgpu-rdc>)
+  target_link_options(popsift PRIVATE -fgpu-rdc --hip-link)
+  # hip_compat/ supplies a <cuda_runtime.h> shim for the sources that include it
+  # directly; HIP build only. PRIVATE so the public popsift.h <cuda_runtime.h>
+  # include still resolves to the real CUDA header for downstream NVIDIA users.
+  target_include_directories(popsift PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/popsift/hip_compat)
+else()
+  set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+endif()
 if(MSVC)
         target_compile_options(popsift PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/Zc:preprocessor>)
 endif()
diff --git a/src/application/CMakeLists.txt b/src/application/CMakeLists.txt
index a6b8886f..32ed796d 100755
--- a/src/application/CMakeLists.txt
+++ b/src/application/CMakeLists.txt
@@ -70,7 +70,24 @@ endif()
 
 add_executable(popsift-demo  main.cpp pgmread.cpp pgmread.h)
 
-set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
+if(USE_HIP)
+  # main.cpp/pgmread.cpp have NO device code; they consume popsift's host API only.
+  # popsift now links the HIP device-compile interface (roc::rocthrust -> hip::device,
+  # which injects "-x hip") PRIVATE and exposes only hip::host PUBLIC, so these
+  # sources compile as plain C++ (no "-x hip") and just link the HIP runtime. They
+  # still need the hip_compat <cuda_runtime.h> shim and the cuda_to_hip.h force-include
+  # because popsift's public headers (device_prop.h) #include <cuda_runtime.h>; the
+  # library keeps those PRIVATE, so the consumer adds them explicitly here. USE_HIP
+  # selects the HIP branch of cuda_to_hip.h. C++17 for the compat header.
+  set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 17)
+  target_compile_definitions(popsift-demo PRIVATE USE_HIP)
+  target_include_directories(popsift-demo PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat)
+  target_compile_options(popsift-demo PRIVATE
+    $<$<COMPILE_LANGUAGE:CXX>:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>)
+else()
+  set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
+endif()
 
 target_compile_options(popsift-demo PRIVATE ${PD_COMPILE_OPTIONS} )
 target_include_directories(popsift-demo PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS})
@@ -84,7 +101,17 @@ target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS})
 
 add_executable(popsift-match match.cpp pgmread.cpp pgmread.h)
 
-set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
+if(USE_HIP)
+  # See popsift-demo above: plain-C++ host consumer, no device code, no "-x hip".
+  set_property(TARGET popsift-match PROPERTY CXX_STANDARD 17)
+  target_compile_definitions(popsift-match PRIVATE USE_HIP)
+  target_include_directories(popsift-match PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat)
+  target_compile_options(popsift-match PRIVATE
+    $<$<COMPILE_LANGUAGE:CXX>:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>)
+else()
+  set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
+endif()
 
 target_compile_options(popsift-match PRIVATE ${PD_COMPILE_OPTIONS} )
 target_include_directories(popsift-match PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS})
diff --git a/src/popsift/common/assist.h b/src/popsift/common/assist.h
index 50e6fe6f..1c52c93d 100644
--- a/src/popsift/common/assist.h
+++ b/src/popsift/common/assist.h
@@ -56,6 +56,34 @@ template<typename T> __device__ inline T shuffle_down( T variable, int delta, in
 template<typename T> __device__ inline T shuffle_xor ( T variable, int delta, int ws ) { return __shfl_xor ( variable, delta, ws ); }
 #endif
 
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+/* Wave64 sub-warp helpers.
+ *
+ * These kernels were written for a 32-lane NVIDIA warp. On a 64-lane CDNA
+ * wavefront PopSift packs two logical 32-thread rows into one wavefront, so the
+ * whole-wavefront builtins (__ballot/__any over all 64 lanes) mix the two rows
+ * and miscount. The helpers below restrict a collective to the caller's own
+ * 32-lane group (its lane's half of the wavefront), reproducing 32-lane warp
+ * semantics. The group index is the thread's logical row (threadIdx.y for the
+ * extrema kernel); a single-row 32-thread block is group 0. CUDA is unaffected:
+ * this block compiles only on HIP.
+ */
+__device__ inline unsigned int ballot_group( unsigned int pred, int group )
+{
+    const unsigned long long b = __ballot( pred );
+    return (unsigned int)( b >> ( group * 32 ) );
+}
+__device__ inline int any_group( unsigned int pred, int group )
+{
+    const unsigned long long b = __ballot( pred );
+    return ( (unsigned int)( b >> ( group * 32 ) ) ) != 0u;
+}
+#else
+__device__ inline unsigned int ballot_group( unsigned int pred, int ) { return popsift::ballot( pred ); }
+__device__ inline int any_group( unsigned int pred, int ) { return popsift::any( pred ); }
+#endif
+
+
 /* This computation is needed very frequently when a dim3 grid block is
  * initialized. It ensure that the tail is not forgotten.
  */
@@ -73,8 +101,125 @@ float readTex( cudaTextureObject_t tex, float x, float y, float z )
      * we will get the expected cell (or an interpolation very 
      * close by) iff we add 0.5f to X and Y coordinate.
      */
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    /* Observed on gfx90a, ROCm 7.2.1: hipCreateTextureObject rejects a
+     * hardware-linear-filtered texture over an element-read float array (see
+     * sift_octave.cu), so the linear
+     * textures are created with point filtering and we reproduce CUDA's bilinear
+     * filter in software here. This is an empirical limitation on this device/ROCm;
+     * it may not hold on other arches (re-verify on RDNA).
+     * CUDA's unnormalized linear filter on tex2DLayered(c) samples at index c-0.5,
+     * i.e. i0=floor(c-0.5) with weight frac=(c-0.5)-i0; readTex passes c=x+0.5,
+     * so i0=floor(x), frac=x-floor(x). Point textures (used with integer x,y) hit
+     * frac==0 and reduce to the exact texel, so this path is correct for them too.
+     * SIFT blurs are per-layer; z is always an integer level, so interpolate in x,y
+     * at the fixed layer z. Point-filtered fetch of texel ix is tex2DLayered(ix+0.5).
+     */
+    const float fx = floorf( x );
+    const float fy = floorf( y );
+    const float ax = x - fx;
+    const float ay = y - fy;
+    const float t00 = tex2DLayered<float>( tex, fx + 0.5f, fy + 0.5f, z );
+    const float t10 = tex2DLayered<float>( tex, fx + 1.5f, fy + 0.5f, z );
+    const float t01 = tex2DLayered<float>( tex, fx + 0.5f, fy + 1.5f, z );
+    const float t11 = tex2DLayered<float>( tex, fx + 1.5f, fy + 1.5f, z );
+    const float top = t00 + ax * ( t10 - t00 );
+    const float bot = t01 + ax * ( t11 - t01 );
+    return top + ay * ( bot - top );
+#else
     return tex2DLayered<float>( tex, x+0.5f, y+0.5f, z );
+#endif
+}
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+/* HIP-only: bundle a pyramid array's surface with its per-level width/height.
+ *
+ * Observed on gfx90a/CDNA2 (ROCm 7.2.1): LAYERED images are broken. After a layered
+ * array is written one layer at a time via surf2DLayeredwrite, a read in a later
+ * kernel launch (tex2DLayered, surf2DLayeredread, and even host hipMemcpy3D) returns
+ * one single (last-written) layer's data for EVERY layer index -- the layer
+ * dimension is collapsed. Filed as ROCm/clr#275 (popsift is the motivating case);
+ * a standalone reproducer mirrors that issue's array3d_check. AMD's partial
+ * fix ROCm/rocm-systems#6683 corrects only surf2DLayered; tex2DLayered and
+ * hipMemcpy3D may still collapse the layer dimension independently. A standalone
+ * repro confirms this AND confirms a non-layered 3D array
+ * (surf3Dwrite/surf3Dread/tex3D) is fully coherent across launches. So on
+ * HIP the pyramid arrays are allocated as non-layered 3D arrays (sift_octave.cu)
+ * and every layered access maps to the 3D form with the layer index as the z
+ * coordinate. Reads funnel through readTex below and sample the surface via
+ * surf3Dread; the surface address is a 1:1 match for the surf2DLayeredwrite the
+ * producer kernels issue (now mapped to surf3Dwrite in cuda_to_hip.h), so the
+ * blurred value a producer wrote at (x,y,level) is exactly what the consumer reads.
+ * tex is carried for parity with the CUDA signature but is unused on HIP.
+ * On CUDA this struct is unused (callers pass the texture object straight to the
+ * tex2DLayered readTex above) and the read path is byte-for-byte unchanged.
+ */
+struct LayeredTex
+{
+    cudaTextureObject_t tex;
+    cudaSurfaceObject_t surf;
+    int                 width;
+    int                 height;
+};
+
+/* Point-fetch one texel from the 3D pyramid surface, clamping the integer coords
+ * to [0,width-1]x[0,height-1] to reproduce the cudaAddressModeClamp behaviour of
+ * the point/linear textures these reads replace (an out-of-range surf3Dread
+ * returns 0 on HIP, which would corrupt image borders). The surface x is a byte
+ * offset (sizeof(float)), matching the surf2DLayeredwrite call sites (idx*4); z is
+ * the (integer) blur level.
+ */
+__device__ static inline
+float surfFetchClamped( const LayeredTex& s, int ix, int iy, int layer )
+{
+    ix = ix < 0 ? 0 : ( ix >= s.width  ? s.width  - 1 : ix );
+    iy = iy < 0 ? 0 : ( iy >= s.height ? s.height - 1 : iy );
+    float v;
+    surf3Dread( &v, s.surf, ix * 4, iy, layer );
+    return v;
+}
+
+__device__ static inline
+float readTex( const LayeredTex& s, float x, float y, float z )
+{
+    /* Same -0.5 texel-center bilinear convention as the texture readTex above,
+     * but sampling the coherent 3D surface instead of the broken layered texture.
+     * SIFT blurs are per-level, so z is always an integer level: interpolate in
+     * x,y at the fixed z slice (no z interpolation), matching the CUDA layered
+     * point/linear texture fetch.
+     */
+    const int   layer = (int)z;
+    const float fx = floorf( x );
+    const float fy = floorf( y );
+    const float ax = x - fx;
+    const float ay = y - fy;
+    const int   ix = (int)fx;
+    const int   iy = (int)fy;
+    const float t00 = surfFetchClamped( s, ix,   iy,   layer );
+    const float t10 = surfFetchClamped( s, ix+1, iy,   layer );
+    const float t01 = surfFetchClamped( s, ix,   iy+1, layer );
+    const float t11 = surfFetchClamped( s, ix+1, iy+1, layer );
+    const float top = t00 + ax * ( t10 - t00 );
+    const float bot = t01 + ax * ( t11 - t01 );
+    return top + ay * ( bot - top );
 }
+#endif
+
+/* Source handle for a pyramid-array layered read funnelled through readTex.
+ * On HIP it is a LayeredTex (coherent 3D surface + dims); on CUDA it is the plain
+ * texture object, so kernel signatures and the read path stay identical to upstream
+ * on CUDA. Build it at the launch site with POPSIFT_LAYERED_SRC, which on CUDA
+ * expands to exactly the texture argument (no behavioural change) and on HIP packs
+ * the array's surface and per-level width/height for surf3Dread.
+ */
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+using LayeredReadTex = LayeredTex;
+#define POPSIFT_LAYERED_SRC( tex, surf, width, height ) \
+    ::popsift::LayeredTex{ (tex), (surf), (width), (height) }
+#else
+using LayeredReadTex = cudaTextureObject_t;
+#define POPSIFT_LAYERED_SRC( tex, surf, width, height ) (tex)
+#endif
 
 __device__ static inline
 float readTex( cudaTextureObject_t tex, float x, float y )
diff --git a/src/popsift/common/excl_blk_prefix_sum.h b/src/popsift/common/excl_blk_prefix_sum.h
index d77bf7b0..7e37b76d 100644
--- a/src/popsift/common/excl_blk_prefix_sum.h
+++ b/src/popsift/common/excl_blk_prefix_sum.h
@@ -90,9 +90,17 @@ class Block
             int ews = 0; // exclusive warp prefix sum
             int self = (valid) ? _reader.get(cell) : 0;
 
-            // This loop is an exclusive prefix sum for one warp
+            // This loop is an exclusive prefix sum for one warp. The block is
+            // (32,blockDim.y), one warp per threadIdx.y row. On a 64-lane
+            // wavefront two rows share a wavefront, so the scan shuffle must be
+            // confined to a width-32 sub-group (threadIdx.x is the in-row lane id)
+            // or odd rows pull partial sums from the wrong row. CUDA: unchanged.
             for( int s=0; s<5; s++ ) {
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+                const int add = popsift::shuffle_up( ews+self, 1<<s, 32 );
+#else
                 const int add = popsift::shuffle_up( ews+self, 1<<s );
+#endif
                 ews += threadIdx.x < (1<<s) ? 0 : add;
             }
 
@@ -109,7 +117,11 @@ class Block
                 int self = sum[threadIdx.x];
 
                 for( int s=0; s<5; s++ ) {
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+                    const int add = popsift::shuffle_up( ebs+self, 1<<s, 32 );
+#else
                     const int add = popsift::shuffle_up( ebs+self, 1<<s );
+#endif
                     ebs += threadIdx.x < (1<<s) ? 0 : add;
                 }
 
diff --git a/src/popsift/common/warp_bitonic_sort.h b/src/popsift/common/warp_bitonic_sort.h
index 26ea5232..23157a6b 100644
--- a/src/popsift/common/warp_bitonic_sort.h
+++ b/src/popsift/common/warp_bitonic_sort.h
@@ -57,16 +57,32 @@ class Warp32
     __device__ inline
     int shiftit( const int my_index, const int shift, const int direction, const bool increasing )
     {
+        // This is a 32-lane bitonic step. On a 64-lane wavefront ori_par runs as
+        // a single 32-thread block (lanes 0-31), and sort64 holds 2 elements per
+        // lane; the shuffles must stay inside a 32-lane group, so force width 32
+        // on HIP (a no-op on CUDA's 32-lane warp) and key the swap direction off
+        // the in-group lane id.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+        const int  lane_id     = threadIdx.x & 31;
+        const T    my_val      = _array[my_index];
+        const T    other_val   = popsift::shuffle_xor( my_val, 1 << shift, 32 );
+#else
+        const int  lane_id     = threadIdx.x;
         const T    my_val      = _array[my_index];
         const T    other_val   = popsift::shuffle_xor( my_val, 1 << shift );
-        const bool reverse     = ( threadIdx.x & ( 1 << direction ) );
-        const bool id_less     = ( ( threadIdx.x & ( 1 << shift ) ) == 0 );
+#endif
+        const bool reverse     = ( lane_id & ( 1 << direction ) );
+        const bool id_less     = ( ( lane_id & ( 1 << shift ) ) == 0 );
         const bool my_more     = id_less ? ( my_val > other_val )
                                          : ( my_val < other_val );
         const bool must_swap   = ! ( my_more ^ reverse ^ increasing );
 
         int lane = must_swap ? ( 1 << shift ) : 0;
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+        return popsift::shuffle_xor( my_index, lane, 32 );
+#else
         return popsift::shuffle_xor( my_index, lane );
+#endif
     }
 
     __device__ inline
diff --git a/src/popsift/cuda_to_hip.h b/src/popsift/cuda_to_hip.h
new file mode 100644
index 00000000..b4a24894
--- /dev/null
+++ b/src/popsift/cuda_to_hip.h
@@ -0,0 +1,209 @@
+//********************************************************//
+// CUDA-to-HIP compatibility shim for PopSift             //
+//                                                        //
+// Minimal-footprint port: every other                    //
+// source file keeps its plain CUDA spelling. On AMD this  //
+// header includes the HIP runtime and #defines the CUDA   //
+// symbols the project uses to their HIP equivalents. On   //
+// NVIDIA it is a no-op that pulls in <cuda_runtime.h>.    //
+//                                                        //
+// Symbol names follow PyTorch's authoritative hipify map: //
+// torch/utils/hipify/cuda_to_hip_mappings.py             //
+//********************************************************//
+
+#ifndef POPSIFT_CUDA_TO_HIP_H
+#define POPSIFT_CUDA_TO_HIP_H
+
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+
+#include <hip/hip_runtime.h>
+// Backward-compat note (ROCm 7.2.x -> newer ROCm): newer ROCm's
+// <hip/hip_bf16.h> defines real __shfl_*_sync<...> functions whose names collide
+// with the function-like __shfl_*_sync macros defined further below -- if a
+// rocThrust/rocprim include pulls that header in AFTER the macros, the macros
+// rewrite the header's own definitions and it fails to compile ("use of
+// undeclared identifier 'mask'"). Pull the header in FIRST so the real functions
+// are defined before the macros. On older ROCm (7.2.x) the header exists and has
+// no such functions, so this is a harmless no-op there -- guarded with
+// __has_include so it also tolerates any ROCm that ships without the header.
+// Only needed where rocThrust/rocprim is pulled in (the device .cu TUs, compiled
+// by HIP clang); gate on __clang__ so the plain-C++ host example consumers (which
+// may be built by gcc and never include rocThrust) do not try to parse hip_bf16.h,
+// whose vector intrinsics require clang builtins.
+#if defined(__clang__) && defined(__has_include)
+#  if __has_include(<hip/hip_bf16.h>)
+#    include <hip/hip_bf16.h>
+#  endif
+#endif
+
+// ---- Error handling ----
+#define cudaError_t            hipError_t
+#define cudaSuccess            hipSuccess
+#define cudaGetErrorString     hipGetErrorString
+#define cudaGetLastError       hipGetLastError
+#define cudaPeekAtLastError    hipPeekAtLastError
+#define cudaDeviceSynchronize  hipDeviceSynchronize
+#define cudaDeviceReset        hipDeviceReset
+
+// ---- Device management ----
+#define cudaGetDeviceCount       hipGetDeviceCount
+#define cudaGetDevice            hipGetDevice
+#define cudaGetDeviceProperties  hipGetDeviceProperties
+#define cudaSetDevice            hipSetDevice
+#define cudaDeviceProp           hipDeviceProp_t
+#define cudaDeviceSetLimit       hipDeviceSetLimit
+
+// ---- Events / streams ----
+#define cudaEvent_t            hipEvent_t
+#define cudaEventCreate        hipEventCreate
+#define cudaEventDestroy       hipEventDestroy
+#define cudaEventRecord        hipEventRecord
+#define cudaEventSynchronize   hipEventSynchronize
+#define cudaEventElapsedTime   hipEventElapsedTime
+#define cudaStream_t           hipStream_t
+#define cudaStreamCreate       hipStreamCreate
+#define cudaStreamDestroy      hipStreamDestroy
+#define cudaStreamSynchronize  hipStreamSynchronize
+#define cudaStreamWaitEvent    hipStreamWaitEvent
+
+// ---- Linear / pitched / host memory ----
+#define cudaMalloc         hipMalloc
+#define cudaMallocManaged  hipMallocManaged
+#define cudaMallocHost     hipHostMalloc
+#define cudaFree           hipFree
+#define cudaFreeHost       hipHostFree
+#define cudaMallocPitch    hipMallocPitch
+#define cudaHostRegister   hipHostRegister
+#define cudaHostUnregister hipHostUnregister
+#define cudaHostRegisterPortable hipHostRegisterPortable
+#define cudaHostRegisterMapped   hipHostRegisterMapped
+
+#define cudaMemcpy         hipMemcpy
+#define cudaMemcpyAsync    hipMemcpyAsync
+#define cudaMemcpy2D       hipMemcpy2D
+#define cudaMemcpy2DAsync  hipMemcpy2DAsync
+#define cudaMemset         hipMemset
+#define cudaMemsetAsync    hipMemsetAsync
+
+// ---- 3D / layered memory (Gaussian pyramid arrays) ----
+#define cudaMemcpy3D           hipMemcpy3D
+#define cudaMemcpy3DParms      hipMemcpy3DParms
+#define cudaPitchedPtr         hipPitchedPtr
+#define cudaExtent             hipExtent
+#define cudaPos                hipPos
+#define make_cudaPitchedPtr    make_hipPitchedPtr
+#define make_cudaExtent        make_hipExtent
+#define make_cudaPos           make_hipPos
+
+// ---- Constant / symbol memory ----
+#define cudaMemcpyToSymbol         hipMemcpyToSymbol
+#define cudaMemcpyToSymbolAsync    hipMemcpyToSymbolAsync
+#define cudaMemcpyFromSymbol       hipMemcpyFromSymbol
+#define cudaMemcpyFromSymbolAsync  hipMemcpyFromSymbolAsync
+#define cudaGetSymbolAddress       hipGetSymbolAddress
+
+// ---- memcpy kinds ----
+#define cudaMemcpyKind            hipMemcpyKind
+#define cudaMemcpyHostToDevice    hipMemcpyHostToDevice
+#define cudaMemcpyDeviceToHost    hipMemcpyDeviceToHost
+#define cudaMemcpyDeviceToDevice  hipMemcpyDeviceToDevice
+#define cudaMemcpyHostToHost      hipMemcpyHostToHost
+
+// ---- CUDA arrays + channel format (texture/surface backing store) ----
+#define cudaArray                    hipArray
+#define cudaArray_t                  hipArray_t
+#define cudaMallocArray              hipMallocArray
+#define cudaMalloc3DArray            hipMalloc3DArray
+#define cudaFreeArray                hipFreeArray
+#define cudaArrayLayered             hipArrayLayered
+#define cudaArraySurfaceLoadStore    hipArraySurfaceLoadStore
+#define cudaArrayDefault             hipArrayDefault
+#define cudaChannelFormatDesc        hipChannelFormatDesc
+#define cudaCreateChannelDesc        hipCreateChannelDesc
+#define cudaChannelFormatKindFloat    hipChannelFormatKindFloat
+#define cudaChannelFormatKindUnsigned hipChannelFormatKindUnsigned
+#define cudaChannelFormatKindSigned   hipChannelFormatKindSigned
+
+// ---- Texture objects ----
+#define cudaTextureObject_t       hipTextureObject_t
+#define cudaCreateTextureObject   hipCreateTextureObject
+#define cudaDestroyTextureObject  hipDestroyTextureObject
+#define cudaResourceDesc          hipResourceDesc
+#define cudaTextureDesc           hipTextureDesc
+#define cudaResourceTypePitch2D   hipResourceTypePitch2D
+#define cudaResourceTypeArray     hipResourceTypeArray
+#define cudaResourceTypeLinear    hipResourceTypeLinear
+#define cudaAddressModeClamp      hipAddressModeClamp
+#define cudaAddressModeWrap       hipAddressModeWrap
+#define cudaFilterModeLinear      hipFilterModeLinear
+#define cudaFilterModePoint       hipFilterModePoint
+#define cudaReadModeElementType        hipReadModeElementType
+#define cudaReadModeNormalizedFloat    hipReadModeNormalizedFloat
+#define cudaTextureType2D         hipTextureType2D
+#define cudaTextureType2DLayered  hipTextureType2DLayered
+
+// ---- Surface objects (pyramid write path) ----
+#define cudaSurfaceObject_t       hipSurfaceObject_t
+#define cudaCreateSurfaceObject   hipCreateSurfaceObject
+#define cudaDestroySurfaceObject  hipDestroySurfaceObject
+#define cudaBoundaryModeZero      hipBoundaryModeZero
+#define cudaBoundaryModeClamp     hipBoundaryModeClamp
+#define cudaBoundaryModeTrap      hipBoundaryModeTrap
+
+// HIP layered-image coherency is broken on gfx90a/CDNA2 (observed ROCm 7.2.1),
+// filed as ROCm/clr#275 (the partial fix ROCm/rocm-systems#6683 covers only
+// surf2DLayered): after a layered array is written layer-by-layer via
+// surf2DLayeredwrite, a read in a later kernel launch (tex2DLayered OR
+// surf2DLayeredread, host hipMemcpy3D too) returns a single (last-written) layer's
+// data for EVERY layer index -- the layer dimension is effectively collapsed.
+// A standalone reproducer confirms it and also
+// confirms a NON-layered 3D array (surf3Dwrite/surf3Dread/tex3D) is fully
+// coherent across launches. So on HIP the pyramid arrays are allocated as
+// non-layered 3D arrays (see sift_octave.cu) and the layered builtins map to the
+// 3D ones with the layer index used as the z coordinate (a 1:1 mapping: the
+// element write/read addresses are identical). CUDA keeps real layered arrays and
+// the native builtins, byte-for-byte unchanged.
+//
+// HIP's surf3Dwrite has no boundary-mode parameter (CUDA's surf2DLayeredwrite has
+// a trailing 6th arg); hipBoundaryModeZero matches the AMD image-store default
+// (out-of-range writes dropped), so dropping the argument is faithful.
+template <typename T>
+__device__ __forceinline__ void popsift_surf2DLayeredwrite(
+    T data, hipSurfaceObject_t surfObj, int x, int y, int layer,
+    int /*boundaryMode*/)
+{
+    surf3Dwrite(data, surfObj, x, y, layer);
+}
+#define surf2DLayeredwrite(data, surf, x, y, layer, mode) \
+    popsift_surf2DLayeredwrite((data), (surf), (x), (y), (layer), (mode))
+
+// ---- Directed-rounding FP intrinsics ----
+// HIP/AMD does not provide the round-toward-+infinity intrinsics. PopSift's only
+// uses are descriptor weight accumulation where the operands are non-negative and
+// the comment notes _ru is merely a fast round; the round-to-nearest form is the
+// faithful HIP equivalent.
+#define __fmaf_ru(a, b, c)  __fmaf_rn((a), (b), (c))
+#define __fmul_ru(a, b)     __fmul_rn((a), (b))
+
+// ---- Warp intrinsics ----
+// PopSift (assist.h) passes the CUDA 32-bit full mask 0xffffffff to the *_sync
+// builtins. On a 64-lane CDNA wavefront a uint32 mask is meaningless; HIP's
+// mask-free builtins poll the whole active wavefront, which is the faithful
+// equivalent of "operate over the (sub)warp" on the blocks PopSift launches.
+// Map the _sync forms (both the 2-arg and the explicit-width 3/4-arg overloads
+// resolve to these) to the mask-free HIP builtins.
+#define __shfl_sync(mask, ...)        __shfl(__VA_ARGS__)
+#define __shfl_up_sync(mask, ...)     __shfl_up(__VA_ARGS__)
+#define __shfl_down_sync(mask, ...)   __shfl_down(__VA_ARGS__)
+#define __shfl_xor_sync(mask, ...)    __shfl_xor(__VA_ARGS__)
+#define __ballot_sync(mask, pred)     __ballot(pred)
+#define __any_sync(mask, pred)        __any(pred)
+#define __all_sync(mask, pred)        __all(pred)
+
+#else // NVIDIA / CUDA
+
+#include <cuda_runtime.h>
+
+#endif
+
+#endif // POPSIFT_CUDA_TO_HIP_H
diff --git a/src/popsift/features.cu b/src/popsift/features.cu
index 5aa706a1..8f6db5e2 100755
--- a/src/popsift/features.cu
+++ b/src/popsift/features.cu
@@ -175,11 +175,22 @@ l2_in_t0( const float4* lptr, const float4* rptr )
 	        + mval.y * mval.y
 	        + mval.z * mval.z
 	        + mval.w * mval.w;
+    // 32-lane reduction over threadIdx.x (compute_distance is a 32-thread block).
+    // Force a width-32 sub-group on a 64-lane wavefront so the inactive upper
+    // half is never sampled. CUDA: width 32 is the whole warp, unchanged.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    res += shuffle_down( res, 16, 32 );
+    res += shuffle_down( res,  8, 32 );
+    res += shuffle_down( res,  4, 32 );
+    res += shuffle_down( res,  2, 32 );
+    res += shuffle_down( res,  1, 32 );
+#else
     res += shuffle_down( res, 16 );
     res += shuffle_down( res,  8 );
     res += shuffle_down( res,  4 );
     res += shuffle_down( res,  2 );
     res += shuffle_down( res,  1 );
+#endif
     return res;
 }
 
diff --git a/src/popsift/hip_compat/cuda_runtime.h b/src/popsift/hip_compat/cuda_runtime.h
new file mode 100644
index 00000000..6243bd28
--- /dev/null
+++ b/src/popsift/hip_compat/cuda_runtime.h
@@ -0,0 +1,5 @@
+// HIP-only shim so source that does `#include <cuda_runtime.h>` resolves on
+// ROCm (where no CUDA headers exist). This directory is added to the include
+// path ONLY for the HIP build; the NVIDIA build uses the real CUDA header.
+#pragma once
+#include "../cuda_to_hip.h"
diff --git a/src/popsift/hip_compat/math_constants.h b/src/popsift/hip_compat/math_constants.h
new file mode 100644
index 00000000..ffc01ee2
--- /dev/null
+++ b/src/popsift/hip_compat/math_constants.h
@@ -0,0 +1,16 @@
+// HIP-only shim for CUDA's <math_constants.h>. ROCm ships no equivalent header,
+// so provide the CUDART_* constants PopSift uses, with CUDA's values. On the
+// HIP include path only; the NVIDIA build uses the real CUDA header.
+#pragma once
+
+#include <math.h>
+
+#ifndef CUDART_INF_F
+#define CUDART_INF_F  __int_as_float(0x7f800000)
+#endif
+#ifndef CUDART_NAN_F
+#define CUDART_NAN_F  __int_as_float(0x7fffffff)
+#endif
+#ifndef CUDART_PI_F
+#define CUDART_PI_F   3.141592654f
+#endif
diff --git a/src/popsift/s_desc_grid.cu b/src/popsift/s_desc_grid.cu
index 099c4709..1e6f25ef 100644
--- a/src/popsift/s_desc_grid.cu
+++ b/src/popsift/s_desc_grid.cu
@@ -21,7 +21,7 @@ void ext_desc_grid_sub( const int           ix,
                         const float         ang,
                         const Extremum*     ext,
                         float* __restrict__ features,
-                        cudaTextureObject_t layer_tex )
+                        LayeredReadTex      layer_tex )
 {
     const int tile = ( ( ( iy << 2 ) + ix ) << 3 ); // base of the 8 floats written by this group of 16 threads
 
@@ -121,7 +121,7 @@ void ext_desc_grid_sub( const int           ix,
     }
 }
 
-__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex)
+__global__ void ext_desc_grid(int octave, LayeredReadTex layer_tex)
 {
     const int   o_offset =  dct.ori_ps[octave] + blockIdx.x;
     const int   ix       = threadIdx.y;
diff --git a/src/popsift/s_desc_grid.h b/src/popsift/s_desc_grid.h
index c0919806..63e3a991 100644
--- a/src/popsift/s_desc_grid.h
+++ b/src/popsift/s_desc_grid.h
@@ -6,6 +6,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
+#include "common/assist.h"
 #include "common/debug_macros.h"
 #include "common/plane_2d.h"
 #include "sift_extremum.h"
@@ -17,7 +18,7 @@
  * block = 16,4,4
  * grid  = nunmber of orientations
  */
-__global__ void ext_desc_grid(int octave, cudaTextureObject_t layer_tex);
+__global__ void ext_desc_grid(int octave, popsift::LayeredReadTex layer_tex);
 
 namespace popsift
 {
@@ -39,7 +40,10 @@ inline static bool start_ext_desc_grid( const int octave, Octave& oct_obj )
     ext_desc_grid
         <<<grid,block,0,oct_obj.getStream()>>>
         ( octave,
-          oct_obj.getDataTexPoint( ) );
+          POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ),
+                               oct_obj.getDataSurface( ),
+                               oct_obj.getWidth(),
+                               oct_obj.getHeight() ) );
 
     POP_SYNC_CHK;
 
diff --git a/src/popsift/s_desc_igrid.cu b/src/popsift/s_desc_igrid.cu
index 9f77f12f..eba3c50c 100644
--- a/src/popsift/s_desc_igrid.cu
+++ b/src/popsift/s_desc_igrid.cu
@@ -20,7 +20,7 @@ void ext_desc_igrid_sub( const float x, const float y, const int level,
                          const float cos_t, const float sin_t, const float SBP,
                          const Extremum*     ext,
                          float* __restrict__ features,
-                         cudaTextureObject_t texLinear )
+                         LayeredReadTex      texLinear )
 {
     const int ix   = threadIdx.y & 3;
     const int iy   = threadIdx.y / 4;
@@ -74,7 +74,7 @@ void ext_desc_igrid_sub( const float x, const float y, const int level,
     }
 }
 
-__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear)
+__global__ void ext_desc_igrid(int octave, LayeredReadTex texLinear)
 {
     const int   num      = dct.ori_ct[octave];
 
diff --git a/src/popsift/s_desc_igrid.h b/src/popsift/s_desc_igrid.h
index 8980a4bc..7bed27df 100644
--- a/src/popsift/s_desc_igrid.h
+++ b/src/popsift/s_desc_igrid.h
@@ -6,6 +6,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
+#include "common/assist.h"
 #include "common/debug_macros.h"
 #include "sift_extremum.h"
 #include "sift_octave.h"
@@ -16,7 +17,7 @@
  * block = 16,4,4 or with 32,4,4, depending on macros
  * grid  = nunmber of orientations
  */
-__global__ void ext_desc_igrid(int octave, cudaTextureObject_t texLinear);
+__global__ void ext_desc_igrid(int octave, popsift::LayeredReadTex texLinear);
 
 namespace popsift
 {
@@ -40,7 +41,10 @@ inline static bool start_ext_desc_igrid( const int octave, Octave& oct_obj )
     ext_desc_igrid
         <<<grid,block,0,oct_obj.getStream()>>>
         ( octave,
-          oct_obj.getDataTexLinear( ).tex );
+          POPSIFT_LAYERED_SRC( oct_obj.getDataTexLinear( ).tex,
+                               oct_obj.getDataSurface( ),
+                               oct_obj.getWidth(),
+                               oct_obj.getHeight() ) );
 
     POP_SYNC_CHK;
 
diff --git a/src/popsift/s_desc_iloop.cu b/src/popsift/s_desc_iloop.cu
index 84673a20..0d8ed08c 100644
--- a/src/popsift/s_desc_iloop.cu
+++ b/src/popsift/s_desc_iloop.cu
@@ -19,7 +19,7 @@ __device__ static inline
 void ext_desc_iloop_sub( const float         ang,
                         const Extremum*     ext,
                         float* __restrict__ features,
-                        cudaTextureObject_t layer_tex,
+                        LayeredReadTex      layer_tex,
                         const int           width,
                         const int           height )
 {
@@ -114,6 +114,18 @@ void ext_desc_iloop_sub( const float         ang,
     dpt[0] += dpt[8];
 
     /* reduction here */
+    // 32-lane reduction over threadIdx.x; confine the shuffles to a width-32
+    // sub-group on a 64-lane wavefront (see s_desc_loop.cu). CUDA unchanged.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    for (int i = 0; i < 8; i++) {
+        dpt[i] += popsift::shuffle_down( dpt[i], 16, 32 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 8, 32 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 4, 32 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 2, 32 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 1, 32 );
+        dpt[i]  = popsift::shuffle     ( dpt[i], 0, 32 );
+    }
+#else
     for (int i = 0; i < 8; i++) {
         dpt[i] += popsift::shuffle_down( dpt[i], 16 );
         dpt[i] += popsift::shuffle_down( dpt[i], 8 );
@@ -122,13 +134,14 @@ void ext_desc_iloop_sub( const float         ang,
         dpt[i] += popsift::shuffle_down( dpt[i], 1 );
         dpt[i]  = popsift::shuffle     ( dpt[i], 0 );
     }
+#endif
 
     if( threadIdx.x < 8 ) {
         features[tile+threadIdx.x] = dpt[threadIdx.x];
     }
 }
 
-__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int w, int h)
+__global__ void ext_desc_iloop(int octave, LayeredReadTex layer_tex, int w, int h)
 {
     const int   o_offset =  dct.ori_ps[octave] + blockIdx.x;
     Descriptor* desc     = &dbuf.desc           [o_offset];
diff --git a/src/popsift/s_desc_iloop.h b/src/popsift/s_desc_iloop.h
index e69409b6..03a020ab 100644
--- a/src/popsift/s_desc_iloop.h
+++ b/src/popsift/s_desc_iloop.h
@@ -6,13 +6,14 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
+#include "common/assist.h"
 #include "common/debug_macros.h"
 #include "common/plane_2d.h"
 #include "sift_extremum.h"
 #include "sift_octave.h"
 #include "sift_pyramid.h"
 
-__global__ void ext_desc_iloop(int octave, cudaTextureObject_t layer_tex, int width, int height);
+__global__ void ext_desc_iloop(int octave, popsift::LayeredReadTex layer_tex, int width, int height);
 
 namespace popsift
 {
@@ -34,7 +35,10 @@ inline static bool start_ext_desc_iloop( const int octave, Octave& oct_obj )
     ext_desc_iloop
         <<<grid,block,0,oct_obj.getStream()>>>
         ( octave,
-          oct_obj.getDataTexLinear( ).tex,
+          POPSIFT_LAYERED_SRC( oct_obj.getDataTexLinear( ).tex,
+                               oct_obj.getDataSurface( ),
+                               oct_obj.getWidth(),
+                               oct_obj.getHeight() ),
           oct_obj.getWidth(),
           oct_obj.getHeight() );
 
diff --git a/src/popsift/s_desc_loop.cu b/src/popsift/s_desc_loop.cu
index 4c5f46c2..d58a46f9 100644
--- a/src/popsift/s_desc_loop.cu
+++ b/src/popsift/s_desc_loop.cu
@@ -19,7 +19,7 @@ __device__ static inline
 void ext_desc_loop_sub( const float         ang,
                         const Extremum*     ext,
                         float* __restrict__ features,
-                        cudaTextureObject_t layer_tex,
+                        LayeredReadTex      layer_tex,
                         const int           width,
                         const int           height )
 {
@@ -124,6 +124,21 @@ void ext_desc_loop_sub( const float         ang,
     dpt[0] += dpt[8];
 
     /* reduction here */
+    // 32-lane reduction over threadIdx.x. The block is (32,4,4): each (y,z) pair
+    // is its own 32-thread group, so on a 64-lane wavefront (two groups per
+    // wavefront) the shuffles must be confined to a width-32 sub-group, else the
+    // reduction and the lane-0 broadcast leak across the group boundary and
+    // corrupt half the descriptors (NaN). On CUDA width 32 is the whole warp.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    for (int i = 0; i < 8; i++) {
+        dpt[i] += popsift::shuffle_down( dpt[i], 16, 32 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 8, 32 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 4, 32 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 2, 32 );
+        dpt[i] += popsift::shuffle_down( dpt[i], 1, 32 );
+        dpt[i]  = popsift::shuffle     ( dpt[i], 0, 32 );
+    }
+#else
     for (int i = 0; i < 8; i++) {
         dpt[i] += popsift::shuffle_down( dpt[i], 16 );
         dpt[i] += popsift::shuffle_down( dpt[i], 8 );
@@ -132,13 +147,14 @@ void ext_desc_loop_sub( const float         ang,
         dpt[i] += popsift::shuffle_down( dpt[i], 1 );
         dpt[i]  = popsift::shuffle     ( dpt[i], 0 );
     }
+#endif
 
     if( threadIdx.x < 8 ) {
         features[tile+threadIdx.x] = dpt[threadIdx.x];
     }
 }
 
-__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int w, int h)
+__global__ void ext_desc_loop(int octave, LayeredReadTex layer_tex, int w, int h)
 {
     const int   o_offset =  dct.ori_ps[octave] + blockIdx.x;
     Descriptor* desc     = &dbuf.desc           [o_offset];
diff --git a/src/popsift/s_desc_loop.h b/src/popsift/s_desc_loop.h
index 600db498..b7e6084b 100644
--- a/src/popsift/s_desc_loop.h
+++ b/src/popsift/s_desc_loop.h
@@ -6,6 +6,7 @@
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
 #pragma once
+#include "common/assist.h"
 #include "common/debug_macros.h"
 #include "common/plane_2d.h"
 #include "sift_extremum.h"
@@ -14,7 +15,7 @@
 
 #undef BLOCK_3_DIMS
 
-__global__ void ext_desc_loop(int octave, cudaTextureObject_t layer_tex, int width, int height);
+__global__ void ext_desc_loop(int octave, popsift::LayeredReadTex layer_tex, int width, int height);
 
 namespace popsift
 {
@@ -42,7 +43,10 @@ inline static bool start_ext_desc_loop( const int octave, Octave& oct_obj )
     ext_desc_loop
         <<<grid,block,0,oct_obj.getStream()>>>
         ( octave,
-          oct_obj.getDataTexPoint( ),
+          POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ),
+                               oct_obj.getDataSurface( ),
+                               oct_obj.getWidth(),
+                               oct_obj.getHeight() ),
           oct_obj.getWidth(),
           oct_obj.getHeight() );
 
diff --git a/src/popsift/s_desc_norm_l2.h b/src/popsift/s_desc_norm_l2.h
index b067d71f..d1f42b9f 100644
--- a/src/popsift/s_desc_norm_l2.h
+++ b/src/popsift/s_desc_norm_l2.h
@@ -58,19 +58,41 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
          + descr.y * descr.y
          + descr.z * descr.z
          + descr.w * descr.w;
+    // 32-lane reduction over threadIdx.x. The normalize block is (32,32): each
+    // threadIdx.y row is one descriptor. On a 64-lane wavefront two rows share a
+    // wavefront, so the reduction and the lane-0 broadcast must stay inside a
+    // width-32 sub-group, else rows cross-contaminate and the descriptor norm is
+    // wrong (NaN). CUDA: width 32 is the whole warp, unchanged.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    norm += popsift::shuffle_down( norm, 16, 32 );
+    norm += popsift::shuffle_down( norm,  8, 32 );
+    norm += popsift::shuffle_down( norm,  4, 32 );
+    norm += popsift::shuffle_down( norm,  2, 32 );
+    norm += popsift::shuffle_down( norm,  1, 32 );
+#else
     norm += popsift::shuffle_down( norm, 16 );
     norm += popsift::shuffle_down( norm,  8 );
     norm += popsift::shuffle_down( norm,  4 );
     norm += popsift::shuffle_down( norm,  2 );
     norm += popsift::shuffle_down( norm,  1 );
+#endif
 
     if( threadIdx.x == 0 ) {
         // compute 1 / sqrt(sum) in round-to-nearest even mode in thread 0
         norm = __frsqrt_rn( norm );
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+        // A fully flat gradient window yields an all-zero descriptor; __frsqrt_rn(0)
+        // is +inf, and 0*inf below is NaN. Keep such degenerate descriptors all-zero.
+        if( ! isfinite( norm ) ) norm = 0.0f;
+#endif
     }
 
     // spread the inverted norm from thread 0 to all threads in the warp
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    norm = popsift::shuffle( norm,  0, 32 );
+#else
     norm = popsift::shuffle( norm,  0 );
+#endif
 
     // quasi-normalize all 128 floats
     descr.x = min( descr.x*norm, 0.2f );
@@ -86,18 +108,33 @@ void NormalizeL2::normalize( const float* src_desc, float* dst_desc, const bool
          + descr.y * descr.y
          + descr.z * descr.z
          + descr.w * descr.w;
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    norm += popsift::shuffle_down( norm, 16, 32 );
+    norm += popsift::shuffle_down( norm,  8, 32 );
+    norm += popsift::shuffle_down( norm,  4, 32 );
+    norm += popsift::shuffle_down( norm,  2, 32 );
+    norm += popsift::shuffle_down( norm,  1, 32 );
+#else
     norm += popsift::shuffle_down( norm, 16 );
     norm += popsift::shuffle_down( norm,  8 );
     norm += popsift::shuffle_down( norm,  4 );
     norm += popsift::shuffle_down( norm,  2 );
     norm += popsift::shuffle_down( norm,  1 );
+#endif
 
     if( threadIdx.x == 0 ) {
         norm = __frsqrt_rn( norm ); // inverse square root
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+        if( ! isfinite( norm ) ) norm = 0.0f; // see above: avoid 0*inf NaN
+#endif
         norm = scalbnf( norm, d_consts.norm_multi );
     }
 
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    norm = popsift::shuffle( norm,  0, 32 );
+#else
     norm = popsift::shuffle( norm,  0 );
+#endif
 
     descr.x = descr.x * norm;
     descr.y = descr.y * norm;
diff --git a/src/popsift/s_desc_norm_rs.h b/src/popsift/s_desc_norm_rs.h
index 3ab5b1fc..fd3f7100 100644
--- a/src/popsift/s_desc_norm_rs.h
+++ b/src/popsift/s_desc_norm_rs.h
@@ -48,6 +48,18 @@ void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, bool
 
     float sum = descr.x + descr.y + descr.z + descr.w;
 
+    // 32-lane reduction over threadIdx.x; confine to a width-32 sub-group on a
+    // 64-lane wavefront (normalize block is (32,32), one descriptor per row).
+    // CUDA unchanged.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    sum += popsift::shuffle_down( sum, 16, 32 );
+    sum += popsift::shuffle_down( sum,  8, 32 );
+    sum += popsift::shuffle_down( sum,  4, 32 );
+    sum += popsift::shuffle_down( sum,  2, 32 );
+    sum += popsift::shuffle_down( sum,  1, 32 );
+
+    sum = popsift::shuffle( sum,  0, 32 );
+#else
     sum += popsift::shuffle_down( sum, 16 );
     sum += popsift::shuffle_down( sum,  8 );
     sum += popsift::shuffle_down( sum,  4 );
@@ -55,7 +67,28 @@ void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, bool
     sum += popsift::shuffle_down( sum,  1 );
 
     sum = popsift::shuffle( sum,  0 );
+#endif
 
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    // RootSift takes sqrt(bin/sum). The fmaxf(.,0) below clamps a bin that came out
+    // slightly negative from the descriptor accumulation's round-toward-+inf
+    // intrinsics (mapped to round-to-nearest in cuda_to_hip.h; a negative bin is
+    // unphysical). The divisor is gated at a small threshold so a degenerate
+    // near-zero sum is treated as an all-zero descriptor: an all-flat window
+    // (sum==0) normalizes to 0, and a tiny subnormal sum can no longer make 1/sum
+    // overflow to +inf (which would normalize a positive bin to +inf). CUDA, where
+    // the directed-rounding intrinsics exist and bins stay non-negative, is unchanged.
+    const float inv = ( sum > 1e-20f ) ? __fdividef( 1.0f, sum ) : 0.0f;
+    float val;
+    val = scalbnf( __fsqrt_rn( fmaxf( descr.x * inv, 0.0f ) ), d_consts.norm_multi );
+    descr.x = val;
+    val = scalbnf( __fsqrt_rn( fmaxf( descr.y * inv, 0.0f ) ), d_consts.norm_multi );
+    descr.y = val;
+    val = scalbnf( __fsqrt_rn( fmaxf( descr.z * inv, 0.0f ) ), d_consts.norm_multi );
+    descr.z = val;
+    val = scalbnf( __fsqrt_rn( fmaxf( descr.w * inv, 0.0f ) ), d_consts.norm_multi );
+    descr.w = val;
+#else
     float val;
     val = scalbnf( __fsqrt_rn( __fdividef( descr.x, sum ) ),
                    d_consts.norm_multi );
@@ -69,6 +102,7 @@ void NormalizeRootSift::normalize( const float* src_desc, float* dst_desc, bool
     val = scalbnf( __fsqrt_rn( __fdividef( descr.w, sum ) ),
                    d_consts.norm_multi );
     descr.w = val;
+#endif
 
     if( ! ignoreme ) {
         float4* out4 = (float4*)dst_desc;
diff --git a/src/popsift/s_desc_notile.cu b/src/popsift/s_desc_notile.cu
index 9ba8a927..f6fe7789 100644
--- a/src/popsift/s_desc_notile.cu
+++ b/src/popsift/s_desc_notile.cu
@@ -33,7 +33,7 @@ void ext_desc_notile_sub( const float x, const float y, const int level,
                          const float cos_t, const float sin_t, const float SBP,
                          const Extremum*     ext,
                          float* __restrict__ features,
-                         cudaTextureObject_t texLinear )
+                         LayeredReadTex      texLinear )
 {
     float dpt[8] = { 0 };
 
@@ -97,7 +97,7 @@ __global__
 // no -- __launch_bounds__(128) // 63/thread
 // no -- no launch bound // 64/thread/thread
 void ext_desc_notile( const int           octave,
-                      cudaTextureObject_t texLinear )
+                      LayeredReadTex      texLinear )
 {
     const int   num      = dct.ori_ct[octave];
 
@@ -149,7 +149,10 @@ bool start_ext_desc_notile( int octave, Octave& oct_obj )
     ext_desc_notile
         <<<grid,block,0,oct_obj.getStream()>>>
         ( octave,
-          oct_obj.getDataTexLinear( ).tex );
+          POPSIFT_LAYERED_SRC( oct_obj.getDataTexLinear( ).tex,
+                               oct_obj.getDataSurface( ),
+                               oct_obj.getWidth(),
+                               oct_obj.getHeight() ) );
     cudaDeviceSynchronize();
     cudaError_t err = cudaGetLastError( );
     POP_CUDA_FATAL_TEST(err, "cudaGetLastError failed: ");
diff --git a/src/popsift/s_desc_vlfeat.cu b/src/popsift/s_desc_vlfeat.cu
index 5fc4df13..6509d875 100644
--- a/src/popsift/s_desc_vlfeat.cu
+++ b/src/popsift/s_desc_vlfeat.cu
@@ -24,7 +24,7 @@ __device__ static inline
 void ext_desc_vlfeat_sub( const float         ang,
                           const Extremum*     ext,
                           float* __restrict__ features,
-                          cudaTextureObject_t layer_tex,
+                          LayeredReadTex      layer_tex,
                           const int           width,
                           const int           height )
 {
@@ -160,7 +160,7 @@ void ext_desc_vlfeat_sub( const float         ang,
     }
 }
 
-__global__ void ext_desc_vlfeat( int octave, cudaTextureObject_t layer_tex, int w, int h)
+__global__ void ext_desc_vlfeat( int octave, LayeredReadTex layer_tex, int w, int h)
 {
     const int   o_offset =  dct.ori_ps[octave] + blockIdx.x;
     Descriptor* desc     = &dbuf.desc           [o_offset];
@@ -201,7 +201,10 @@ bool start_ext_desc_vlfeat( const int octave, Octave& oct_obj )
     ext_desc_vlfeat
         <<<grid,block,shared_size,oct_obj.getStream()>>>
         ( octave,
-          oct_obj.getDataTexPoint( ),
+          POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ),
+                               oct_obj.getDataSurface( ),
+                               oct_obj.getWidth(),
+                               oct_obj.getHeight() ),
           oct_obj.getWidth(),
           oct_obj.getHeight() );
 
diff --git a/src/popsift/s_extrema.cu b/src/popsift/s_extrema.cu
index e1cf6738..43ef2bbb 100644
--- a/src/popsift/s_extrema.cu
+++ b/src/popsift/s_extrema.cu
@@ -22,24 +22,45 @@ template<int HEIGHT>
 __device__ static inline
 uint32_t extrema_count( unsigned int indicator, int* extrema_counter )
 {
-    uint32_t mask = popsift::ballot( indicator ); // bitfield of warps with results
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    // warpSize-generic (wave32 and wave64): one ballot per wavefront, the leader
+    // (lane 0) does a single atomicAdd of the set-bit count, and each thread's write
+    // slot is that base plus an exclusive prefix of the set bits below its lane. The
+    // lane is the block's linear thread index modulo warpSize, so on wave64 two
+    // blockDim=(32,HEIGHT) rows share a 64-lane wavefront and on wave32 each row is
+    // its own 32-lane wavefront; either way every row feeds the same octave counter.
+    // __ballot is 64-bit (unsigned long long); on wave32 the upper 32 bits are 0.
+    const int lane = ( threadIdx.y * blockDim.x + threadIdx.x ) % warpSize;
+    const unsigned long long ballot = __ballot( indicator );
+    // lane is in [0, warpSize-1] (<= 63), so 1ull<<lane is always in range.
+    const unsigned long long lane_mask = ( 1ull << lane ) - 1ull;
+
+    int write_index = 0;
+    if( lane == 0 ) {
+        write_index = atomicAdd( extrema_counter, __popcll( ballot ) );
+    }
+    // broadcast the base from the wavefront leader (lane 0) over the real wavefront
+    write_index = __shfl( write_index, 0, warpSize );
+    // exclusive prefix: count set bits strictly below this lane in the wavefront
+    write_index += __popcll( ballot & lane_mask );
+
+    return write_index;
+#else
+    // CUDA: a warp is 32 lanes; threadIdx.y selects the (independent) warp.
+    const int      lane = threadIdx.x;
+    const uint32_t mask = popsift::ballot( indicator );
 
-    int ct = __popc( mask );          // horizontal reduce
+    int ct = __popc( mask );
 
     int write_index;
-    if( threadIdx.x == 0 ) {
-        // atomicAdd returns the old value, we consider this the based
-        // index for this thread's write operation
+    if( lane == 0 ) {
         write_index = atomicAdd( extrema_counter, ct );
     }
-    // broadcast from thread 0 to all threads in warp
     write_index = popsift::shuffle( write_index, 0 );
-
-    // this thread's offset: count only bits below the bit of the own
-    // thread index; this provides the 0 result and every result up to ct
-    write_index += __popc( mask & ((1 << threadIdx.x) - 1) );
+    write_index += __popc( mask & ((1u << lane) - 1) );
 
     return write_index;
+#endif
 }
 
 __device__
@@ -54,7 +75,7 @@ inline void extremum_cmp( float val, float f, uint32_t& gt, uint32_t& lt, uint32
 
 __device__
 static
-inline bool is_extremum( cudaTextureObject_t obj,
+inline bool is_extremum( LayeredReadTex obj,
                          int x, int y, int z )
 {
     uint32_t gt = 0;
@@ -297,7 +318,7 @@ public:
 };
 
 template<int sift_mode>
-__device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog,
+__device__ inline bool find_extrema_in_dog_sub(LayeredReadTex dog,
                                                int debug_octave,
                                                int width,
                                                int height,
@@ -504,7 +525,7 @@ __device__ inline bool find_extrema_in_dog_sub(cudaTextureObject_t dog,
 
 template<int HEIGHT, int sift_mode>
 __global__
-void find_extrema_in_dog( cudaTextureObject_t dog,
+void find_extrema_in_dog( LayeredReadTex      dog,
                           int                 octave,
                           int                 width,
                           int                 height,
@@ -584,7 +605,10 @@ void Pyramid::find_extrema( const Config& conf )
         case Config::VLFeat :
                 find_extrema_in_dog<HEIGHT,Config::VLFeat>
                     <<<grid,block,0,oct_str>>>
-                    ( oct_obj.getDogTexturePoint( ),
+                    ( POPSIFT_LAYERED_SRC( oct_obj.getDogTexturePoint( ),
+                                           oct_obj.getDogSurface( ),
+                                           oct_obj.getWidth(),
+                                           oct_obj.getHeight() ),
                       octave,
                       cols,
                       rows,
@@ -599,7 +623,10 @@ void Pyramid::find_extrema( const Config& conf )
         case Config::OpenCV :
                 find_extrema_in_dog<HEIGHT,Config::OpenCV>
                     <<<grid,block,0,oct_str>>>
-                    ( oct_obj.getDogTexturePoint( ),
+                    ( POPSIFT_LAYERED_SRC( oct_obj.getDogTexturePoint( ),
+                                           oct_obj.getDogSurface( ),
+                                           oct_obj.getWidth(),
+                                           oct_obj.getHeight() ),
                       octave,
                       cols,
                       rows,
@@ -614,7 +641,10 @@ void Pyramid::find_extrema( const Config& conf )
         default :
                 find_extrema_in_dog<HEIGHT,Config::PopSift>
                     <<<grid,block,0,oct_str>>>
-                    ( oct_obj.getDogTexturePoint( ),
+                    ( POPSIFT_LAYERED_SRC( oct_obj.getDogTexturePoint( ),
+                                           oct_obj.getDogSurface( ),
+                                           oct_obj.getWidth(),
+                                           oct_obj.getHeight() ),
                       octave,
                       cols,
                       rows,
diff --git a/src/popsift/s_filtergrid.cu b/src/popsift/s_filtergrid.cu
index 9e35d8be..00d1b9eb 100644
--- a/src/popsift/s_filtergrid.cu
+++ b/src/popsift/s_filtergrid.cu
@@ -24,6 +24,15 @@
 #include <thrust/transform_scan.h>
 #include <thrust/version.h>
 
+// The stream-bound parallel execution policy lives in thrust::cuda on NVIDIA
+// and thrust::hip on rocThrust. Fully qualified from the global namespace to
+// avoid colliding with popsift::cuda (debug_macros.h) inside namespace popsift.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+#define POPSIFT_THRUST_PAR ::thrust::hip::par
+#else
+#define POPSIFT_THRUST_PAR ::thrust::cuda::par
+#endif
+
 namespace popsift
 {
 
@@ -129,12 +138,12 @@ int Pyramid::extrema_filter_grid( const Config& conf, int ext_total )
             cudaStream_t oct_str = _octaves[o].getStream();
 
             // fill a continuous device array with octave of all initial extrema
-            thrust::fill(     thrust::cuda::par.on(oct_str),
+            thrust::fill(     POPSIFT_THRUST_PAR.on(oct_str),
                               octave_index.begin() + sum,
                               octave_index.begin() + sum + ocount,
                               o );
             // fill a continuous device array with index within octave of all initial extrema
-            thrust::sequence( thrust::cuda::par.on(oct_str),
+            thrust::sequence( POPSIFT_THRUST_PAR.on(oct_str),
                               iext_index.begin() + sum,
                               iext_index.begin() + sum + ocount );
             sum += ocount;
diff --git a/src/popsift/s_gradiant.h b/src/popsift/s_gradiant.h
index bfeadbd3..1a3b102f 100644
--- a/src/popsift/s_gradiant.h
+++ b/src/popsift/s_gradiant.h
@@ -57,7 +57,7 @@ void get_gradiant( float&              grad,
                    float&              theta,
                    const int           x,
                    const int           y,
-                   cudaTextureObject_t layer,
+                   LayeredReadTex      layer,
                    const int           level )
 {
     float dx = readTex( layer, x+1.0f, y, level )
@@ -77,7 +77,7 @@ void get_gradiant32( float&              grad,
                      float&              theta,
                      const int           x,
                      const int           y,
-                     cudaTextureObject_t layer,
+                     LayeredReadTex      layer,
                      const int           level )
 {
     const int idx = threadIdx.x;
@@ -106,7 +106,7 @@ void get_gradiant( float&              grad,
                    float               y,
                    float               cos_t,
                    float               sin_t,
-                   cudaTextureObject_t texLinear,
+                   LayeredReadTex      texLinear,
                    int                 level )
 {
     float dx = readTex( texLinear, x+cos_t, y+sin_t, level )
diff --git a/src/popsift/s_orientation.cu b/src/popsift/s_orientation.cu
index 1f75229f..30b9b53f 100644
--- a/src/popsift/s_orientation.cu
+++ b/src/popsift/s_orientation.cu
@@ -68,7 +68,7 @@ inline static float smoothe( const float* const src, const int bin )
 __global__
 void ori_par( const int           octave,
               const int           ext_ct_prefix_sum,
-              cudaTextureObject_t layer,
+              LayeredReadTex      layer,
               const int           w,
               const int           h )
 {
@@ -114,7 +114,11 @@ void ori_par( const int           octave,
     int loops = wx * hy;
 
     __syncthreads();
-    for( int i = threadIdx.x; popsift::any(i < loops); i += blockDim.x )
+    // ori_par runs as a 32-thread block (one logical warp); restrict the loop
+    // guard to this thread's own 32-lane group so a 64-lane wavefront does not
+    // poll a second, unrelated row. Group 0 == the only row here; on CUDA this
+    // is the plain whole-warp any().
+    for( int i = threadIdx.x; popsift::any_group(i < loops, 0); i += blockDim.x )
     {
         if( i < loops ) {
             int yy = i / wx + ymin;
@@ -189,7 +193,7 @@ void ori_par( const int           octave,
     // sub-cell refinement of the histogram cell index, yielding the angle
     // not necessary to initialize, every cell is computed
 
-    for( int bin = threadIdx.x; popsift::any( bin < ORI_NBINS ); bin += blockDim.x ) {
+    for( int bin = threadIdx.x; popsift::any_group( bin < ORI_NBINS, 0 ); bin += blockDim.x ) {
         const int prev = bin == 0 ? ORI_NBINS-1 : bin-1;
         const int next = bin == ORI_NBINS-1 ? 0 : bin+1;
 
@@ -240,7 +244,7 @@ void ori_par( const int           octave,
         }
     }
 
-    int angles = __popc( popsift::ballot( written ) );
+    int angles = __popc( popsift::ballot_group( written, 0 ) );
     if( threadIdx.x == 0 ) {
         ext->xpos    = iext->xpos;
         ext->ypos    = iext->ypos;
@@ -407,7 +411,10 @@ void Pyramid::orientation( const Config& conf )
                 <<<grid,block,4*64*sizeof(float),oct_str>>>
                 ( octave,
                   hct.ext_ps[octave],
-                  oct_obj.getDataTexPoint( ),
+                  POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ),
+                                       oct_obj.getDataSurface( ),
+                                       oct_obj.getWidth( ),
+                                       oct_obj.getHeight( ) ),
                   oct_obj.getWidth( ),
                   oct_obj.getHeight( ) );
             POP_SYNC_CHK;
diff --git a/src/popsift/s_pyramid_build.cu b/src/popsift/s_pyramid_build.cu
index 7ecf6ab5..629f633d 100755
--- a/src/popsift/s_pyramid_build.cu
+++ b/src/popsift/s_pyramid_build.cu
@@ -30,7 +30,7 @@ namespace popsift {
 namespace gauss {
 
 __global__
-void get_by_2_pick_every_second( cudaTextureObject_t src_data,
+void get_by_2_pick_every_second( LayeredReadTex      src_data,
                                  const int           src_w,
                                  const int           src_h,
                                  const int           src_level,
@@ -54,7 +54,7 @@ void get_by_2_pick_every_second( cudaTextureObject_t src_data,
 
 
 __global__
-void make_dog( cudaTextureObject_t src_data,
+void make_dog( LayeredReadTex      src_data,
                cudaSurfaceObject_t dog_data,
                const int           w,
                const int           h,
@@ -198,7 +198,10 @@ inline void Pyramid::downscale_from_prev_octave( int octave, cudaStream_t stream
 
     gauss::get_by_2_pick_every_second
         <<<h_grid,h_block,0,stream>>>
-        ( prev_oct_obj.getDataTexPoint( ),
+        ( POPSIFT_LAYERED_SRC( prev_oct_obj.getDataTexPoint( ),
+                               prev_oct_obj.getDataSurface( ),
+                               prev_oct_obj.getWidth(),
+                               prev_oct_obj.getHeight() ),
           prev_oct_obj.getWidth(),
           prev_oct_obj.getHeight(),
           _levels-PREV_LEVEL,
@@ -228,7 +231,10 @@ inline void Pyramid::horiz_from_prev_level( int octave, int level, cudaStream_t
 
             gauss::absoluteSourceInterpolated::horiz
                 <<<grid,block,0,stream>>>
-                ( oct_obj.getDataTexLinear( ).tex,
+                ( POPSIFT_LAYERED_SRC( oct_obj.getDataTexLinear( ).tex,
+                                       oct_obj.getDataSurface( ),
+                                       oct_obj.getWidth(),
+                                       oct_obj.getHeight() ),
                   oct_obj.getIntermediateSurface( ),
                   level );
         }
@@ -244,7 +250,10 @@ inline void Pyramid::horiz_from_prev_level( int octave, int level, cudaStream_t
 
             gauss::absoluteSource::horiz
                 <<<grid,block,0,stream>>>
-                ( oct_obj.getDataTexPoint( ),
+                ( POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ),
+                                       oct_obj.getDataSurface( ),
+                                       oct_obj.getWidth(),
+                                       oct_obj.getHeight() ),
                   oct_obj.getIntermediateSurface( ),
                   level );
         }
@@ -282,7 +291,9 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea
 
             gauss::absoluteSourceInterpolated::vert
                 <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexLinear( ).tex,
+                ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexLinear( ).tex,
+                                       oct_obj.getIntermediateSurface( ),
+                                       width, height ),
                   oct_obj.getDataSurface( ),
                   level );
         }
@@ -296,7 +307,9 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea
 
             gauss::absoluteSourceInterpolated::vert_abs0
                 <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexLinear( ).tex,
+                ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexLinear( ).tex,
+                                       oct_obj.getIntermediateSurface( ),
+                                       width, height ),
                   oct_obj.getDataSurface( ),
                   level );
         }
@@ -310,7 +323,9 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea
 
             gauss::absoluteSource::vert
                 <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexPoint( ),
+                ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexPoint( ),
+                                       oct_obj.getIntermediateSurface( ),
+                                       width, height ),
                   oct_obj.getDataSurface( ),
                   level );
         }
@@ -324,7 +339,9 @@ inline void Pyramid::vert_from_interm( int octave, int level, cudaStream_t strea
 
             gauss::absoluteSource::vert_abs0
                 <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexPoint( ),
+                ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexPoint( ),
+                                       oct_obj.getIntermediateSurface( ),
+                                       width, height ),
                   oct_obj.getDataSurface( ),
                   level );
         }
@@ -360,7 +377,9 @@ inline void Pyramid::vert_all_from_interm( int octave, int start_level, int max_
 
             gauss::absoluteSourceInterpolated::vert_all_abs0
                 <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexLinear( ).tex,
+                ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexLinear( ).tex,
+                                       oct_obj.getIntermediateSurface( ),
+                                       width, height ),
                   oct_obj.getDataSurface( ),
                   start_level,
                   max_level );
@@ -375,7 +394,9 @@ inline void Pyramid::vert_all_from_interm( int octave, int start_level, int max_
 
             gauss::absoluteSource::vert_all_abs0
                 <<<grid,block,0,stream>>>
-                ( oct_obj.getIntermDataTexPoint( ),
+                ( POPSIFT_LAYERED_SRC( oct_obj.getIntermDataTexPoint( ),
+                                       oct_obj.getIntermediateSurface( ),
+                                       width, height ),
                   oct_obj.getDataSurface( ),
                   start_level,
                   max_level );
@@ -408,7 +429,10 @@ inline void Pyramid::dogs_from_blurred( int octave, int max_level, cudaStream_t
 
     gauss::make_dog
         <<<grid,block,0,stream>>>
-        ( oct_obj.getDataTexPoint( ),
+        ( POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ),
+                               oct_obj.getDataSurface( ),
+                               oct_obj.getWidth(),
+                               oct_obj.getHeight() ),
           oct_obj.getDogSurface( ),
           oct_obj.getWidth(),
           oct_obj.getHeight(),
diff --git a/src/popsift/s_pyramid_build_aa.cu b/src/popsift/s_pyramid_build_aa.cu
index c026a8b7..1a6edff3 100755
--- a/src/popsift/s_pyramid_build_aa.cu
+++ b/src/popsift/s_pyramid_build_aa.cu
@@ -14,7 +14,7 @@ namespace popsift {
 namespace gauss {
 namespace absoluteSource {
 
-__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ void horiz(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    src_level = dst_level - 1;
     const int    span      =  d_gauss.inc.span[dst_level];
@@ -33,6 +33,26 @@ __global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t
     g    = filter[span];
     out += ( A + B ) * g;
 
+    // Horizontal Gauss tap exchange via warp shuffles. The block is (32,blockDim.y):
+    // each threadIdx.y row is an independent 32-lane group convolving one image
+    // row. On a 64-lane wavefront two rows share a wavefront, so the shuffles must
+    // be confined to a width-32 sub-group (threadIdx.x is already the in-row lane
+    // id since blockDim.x==32) or a lane would pull a neighbour from the wrong
+    // image row and corrupt the pyramid. CUDA: width 32 is the whole warp.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    int shiftval = 0;
+    for( int offset=span-1; offset>0; offset-- ) {
+        shiftval += 1;
+        const float D1 = popsift::shuffle_down( A, shiftval, 32 );
+        const float D2 = popsift::shuffle_up  ( C, span - shiftval, 32 );
+        const float D  = threadIdx.x < (32 - shiftval) ? D1 : D2;
+        const float E1 = popsift::shuffle_up  ( B, shiftval, 32 );
+        const float E2 = popsift::shuffle_down( C, span - shiftval, 32 );
+        const float E  = threadIdx.x > shiftval        ? E1 : E2;
+        g = filter[offset];
+        out += ( D + E ) * g;
+    }
+#else
     int shiftval = 0;
     for( int offset=span-1; offset>0; offset-- ) {
         shiftval += 1;
@@ -45,11 +65,12 @@ __global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t
         g = filter[offset];
         out += ( D + E ) * g;
     }
+#endif
 
     surf2DLayeredwrite( out, dst_data, off_x*4, off_y, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ void vert(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.inc.span[dst_level];
     const float* filter = &d_gauss.inc.filter[dst_level*GAUSS_ALIGN];
@@ -85,7 +106,7 @@ __global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t
     surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ void vert_abs0(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.abs_o0.span[dst_level];
     const float* filter = &d_gauss.abs_o0.filter[dst_level*GAUSS_ALIGN];
@@ -121,7 +142,7 @@ __global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObje
     surf2DLayeredwrite( out, dst_data, idx*4, idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert_all_abs0(cudaTextureObject_t src_point_texture,
+__global__ void vert_all_abs0(LayeredReadTex src_point_texture,
                               cudaSurfaceObject_t dst_data,
                               int start_level,
                               int max_level)
diff --git a/src/popsift/s_pyramid_build_aa.h b/src/popsift/s_pyramid_build_aa.h
index 4d3423cf..85664f20 100755
--- a/src/popsift/s_pyramid_build_aa.h
+++ b/src/popsift/s_pyramid_build_aa.h
@@ -5,19 +5,20 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "common/assist.h"
 #include "common/plane_2d.h"
 
 namespace popsift {
 namespace gauss {
 namespace absoluteSource {
 
-__global__ void horiz(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
+__global__ void horiz(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__ void vert(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
+__global__ void vert(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__ void vert_abs0(cudaTextureObject_t src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
+__global__ void vert_abs0(LayeredReadTex src_point_texture, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__ void vert_all_abs0(cudaTextureObject_t src_point_texture,
+__global__ void vert_all_abs0(LayeredReadTex src_point_texture,
                               cudaSurfaceObject_t dst_data,
                               int start_level,
                               int max_level);
diff --git a/src/popsift/s_pyramid_build_ai.cu b/src/popsift/s_pyramid_build_ai.cu
index c16d636e..b35ff8d2 100755
--- a/src/popsift/s_pyramid_build_ai.cu
+++ b/src/popsift/s_pyramid_build_ai.cu
@@ -14,7 +14,7 @@ namespace popsift {
 namespace gauss {
 namespace absoluteSourceInterpolated {
 
-__global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ void horiz(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    src_level = dst_level - 1;
     const int    span      =  d_gauss.inc.i_span[dst_level];
@@ -40,7 +40,7 @@ __global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t ds
     surf2DLayeredwrite( out, dst_data, off_x*4, blockIdx.y, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ void vert(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.inc.i_span[dst_level];
     const float* filter = &d_gauss.inc.i_filter[dst_level*GAUSS_ALIGN];
@@ -68,7 +68,7 @@ __global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst
     surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
+__global__ void vert_abs0(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level)
 {
     const int    span   =  d_gauss.abs_o0.i_span[dst_level];
     const float* filter = &d_gauss.abs_o0.i_filter[dst_level*GAUSS_ALIGN];
@@ -96,7 +96,7 @@ __global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_
     surf2DLayeredwrite( out, dst_data, (block_x+idx)*4, block_y+idy, dst_level, cudaBoundaryModeZero );
 }
 
-__global__ void vert_all_abs0(cudaTextureObject_t src_linear_tex,
+__global__ void vert_all_abs0(LayeredReadTex src_linear_tex,
                               cudaSurfaceObject_t dst_data,
                               int start_level,
                               int max_level)
diff --git a/src/popsift/s_pyramid_build_ai.h b/src/popsift/s_pyramid_build_ai.h
index d3431fe7..7985a787 100755
--- a/src/popsift/s_pyramid_build_ai.h
+++ b/src/popsift/s_pyramid_build_ai.h
@@ -5,19 +5,20 @@
  * License, v. 2.0. If a copy of the MPL was not distributed with this
  * file, You can obtain one at http://mozilla.org/MPL/2.0/.
  */
+#include "common/assist.h"
 #include "common/plane_2d.h"
 
 namespace popsift {
 namespace gauss {
 namespace absoluteSourceInterpolated {
 
-__global__ void horiz(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
+__global__ void horiz(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__ void vert(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
+__global__ void vert(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__ void vert_abs0(cudaTextureObject_t src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
+__global__ void vert_abs0(LayeredReadTex src_linear_tex, cudaSurfaceObject_t dst_data, int dst_level);
 
-__global__ void vert_all_abs0(cudaTextureObject_t src_linear_tex,
+__global__ void vert_all_abs0(LayeredReadTex src_linear_tex,
                               cudaSurfaceObject_t dst_data,
                               int start_level,
                               int max_level);
diff --git a/src/popsift/s_pyramid_fixed.cu b/src/popsift/s_pyramid_fixed.cu
index 9e3d52aa..c9af8be2 100755
--- a/src/popsift/s_pyramid_fixed.cu
+++ b/src/popsift/s_pyramid_fixed.cu
@@ -30,7 +30,20 @@ inline float octave_fixed_horiz( float fval, const float* filter )
      * input  fval of thread N is extracted from image index N-4
      * output fval of thread N should be filtered sum from N-4 to N+4
      */
+    // Horizontal fixed-span Gauss via warp shuffles. block.x==32 and the block
+    // packs multiple rows (threadIdx.y/z); on a 64-lane wavefront confine the
+    // shuffles to a width-32 sub-group so a lane does not pull a neighbour from
+    // another row. CUDA: width 32 is the whole warp, unchanged.
     float out = fval * filter[0];
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    #pragma unroll
+    for( int i=1; i<=SHIFT; i++ ) {
+        float val  = popsift::shuffle_up( fval, i, 32 ) + popsift::shuffle_down( fval, i, 32 );
+        out += val * filter[i];
+    }
+
+    fval = popsift::shuffle_down( out, SHIFT, 32 );
+#else
     #pragma unroll
     for( int i=1; i<=SHIFT; i++ ) {
         float val  = popsift::shuffle_up( fval, i ) + popsift::shuffle_down( fval, i );
@@ -38,6 +51,7 @@ inline float octave_fixed_horiz( float fval, const float* filter )
     }
 
     fval = popsift::shuffle_down( out, SHIFT );
+#endif
 
     return fval;
 }
@@ -47,7 +61,7 @@ namespace absoluteTexAddress {
 
 template<int SHIFT>
 __device__
-inline float octave_fixed_vert( cudaTextureObject_t src_data, int idx, int idy, int level, const float* filter )
+inline float octave_fixed_vert( LayeredReadTex src_data, int idx, int idy, int level, const float* filter )
 {
     /* Input thread N takes as input the (idx,idy) position of the pixel that it
      * will eventually write (The 2*SHIFT rightmost threads will not write anything).
@@ -68,7 +82,7 @@ inline float octave_fixed_vert( cudaTextureObject_t src_data, int idx, int idy,
 
 template<int SHIFT, int WIDTH, int HEIGHT, int LEVELS>
 __global__
-void octave_fixed( cudaTextureObject_t src_data,
+void octave_fixed( LayeredReadTex      src_data,
                    cudaSurfaceObject_t dst_data,
                    const int           w,
                    const int           h,
@@ -258,7 +272,10 @@ inline void make_octave_sub( const Config& conf, ImageBase* base, Octave& oct_ob
         gauss::fixedSpan::absoluteTexAddress::octave_fixed
             <SHIFT,w_conf,h_conf,l_conf>
             <<<grid,block,0,stream>>>
-            ( oct_obj.getDataTexPoint( ),
+            ( POPSIFT_LAYERED_SRC( oct_obj.getDataTexPoint( ),
+                                   oct_obj.getDataSurface( ),
+                                   oct_obj.getWidth(),
+                                   oct_obj.getHeight() ),
               oct_obj.getDataSurface( ),
               oct_obj.getWidth(),
               oct_obj.getHeight(),
diff --git a/src/popsift/sift_octave.cu b/src/popsift/sift_octave.cu
index 4e05780d..4ec5c908 100755
--- a/src/popsift/sift_octave.cu
+++ b/src/popsift/sift_octave.cu
@@ -221,10 +221,21 @@ void Octave::alloc_data_planes()
     _data_ext.height = _h;
     _data_ext.depth  = _levels;
 
+    // Observed on gfx90a (ROCm 7.2.1): layered images are incoherent across kernel
+    // launches (the layer dimension collapses to a single layer on read); filed as
+    // ROCm/clr#275 (the partial fix ROCm/rocm-systems#6683 covers only surf2DLayered).
+    // A non-layered 3D array with surf3D/tex3D access is coherent, so drop
+    // cudaArrayLayered on HIP. The blur levels are addressed by the z coordinate
+    // instead of the layer index. See cuda_to_hip.h and common/assist.h. CUDA keeps
+    // a real layered array.
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    err = cudaMalloc3DArray( &_data, &_data_desc, _data_ext, cudaArraySurfaceLoadStore );
+#else
     err = cudaMalloc3DArray( &_data,
                              &_data_desc,
                              _data_ext,
                              cudaArrayLayered | cudaArraySurfaceLoadStore);
+#endif
     POP_CUDA_FATAL_TEST(err, "Could not allocate Blur level array: ");
 }
 
@@ -266,7 +277,14 @@ void Octave::alloc_data_tex()
     tex_desc.addressMode[1]   = cudaAddressModeClamp;
     tex_desc.addressMode[2]   = cudaAddressModeClamp;
     tex_desc.readMode         = cudaReadModeElementType; // read as float
-    tex_desc.filterMode       = cudaFilterModeLinear; // no interpolation
+    tex_desc.filterMode       = cudaFilterModeLinear; // hardware bilinear (CUDA)
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    // Observed on gfx90a (ROCm 7.2.1): hipCreateTextureObject rejects hardware
+    // linear filtering on element-read float arrays ("operation not supported").
+    // Create the texture with point filtering and do bilinear interpolation in
+    // software in readTex(). Empirical on this device/ROCm; re-verify on RDNA.
+    tex_desc.filterMode       = cudaFilterModePoint;
+#endif
 
     err = cudaCreateTextureObject( &_data_tex_linear.tex, &res_desc, &tex_desc, nullptr );
     POP_CUDA_FATAL_TEST(err, "Could not create Blur data point texture: ");
@@ -300,10 +318,15 @@ void Octave::alloc_interm_array()
     _intm_ext.height = _h;
     _intm_ext.depth  = _levels;
 
+    // Non-layered 3D array on HIP (see alloc_data_planes()).
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    err = cudaMalloc3DArray( &_intm, &_intm_desc, _intm_ext, cudaArraySurfaceLoadStore );
+#else
     err = cudaMalloc3DArray( &_intm,
                              &_intm_desc,
                              _intm_ext,
                              cudaArrayLayered | cudaArraySurfaceLoadStore);
+#endif
     POP_CUDA_FATAL_TEST(err, "Could not allocate Intermediate layered array: ");
 }
 
@@ -339,7 +362,12 @@ void Octave::alloc_interm_tex()
     err = cudaCreateTextureObject( &_intm_tex_point, &res_desc, &tex_desc, nullptr );
     POP_CUDA_FATAL_TEST(err, "Could not create Blur intermediate point texture: ");
 
-    tex_desc.filterMode       = cudaFilterModeLinear; // no interpolation
+    tex_desc.filterMode       = cudaFilterModeLinear; // hardware bilinear (CUDA)
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+    // See alloc_data_tex(): HIP rejects linear filtering on element-read float
+    // arrays, so use point filtering here and interpolate in software in readTex().
+    tex_desc.filterMode       = cudaFilterModePoint;
+#endif
 
     err = cudaCreateTextureObject( &_intm_tex_linear.tex, &res_desc, &tex_desc, nullptr );
     POP_CUDA_FATAL_TEST(err, "Could not create Blur intermediate point texture: ");
@@ -373,10 +401,15 @@ void Octave::alloc_dog_array()
         _dog_3d_ext.height = _h;
         _dog_3d_ext.depth = _levels - 1;
 
+        // Non-layered 3D array on HIP (see alloc_data_planes()).
+#if defined(USE_HIP) || defined(__HIP_PLATFORM_AMD__)
+        err = cudaMalloc3DArray(&_dog_3d, &_dog_3d_desc, _dog_3d_ext, cudaArraySurfaceLoadStore);
+#else
         err = cudaMalloc3DArray(&_dog_3d,
             &_dog_3d_desc,
             _dog_3d_ext,
             cudaArrayLayered | cudaArraySurfaceLoadStore);
+#endif
         POP_CUDA_FATAL_TEST(err, "Could not allocate 3D DoG array: ");
 }
 
diff --git a/src/popsift/sift_octave.h b/src/popsift/sift_octave.h
index 96ea7635..ead94c02 100755
--- a/src/popsift/sift_octave.h
+++ b/src/popsift/sift_octave.h
@@ -19,7 +19,11 @@ namespace popsift {
 
 struct LinearTexture
 {
-    cudaSurfaceObject_t tex;
+    // Holds a texture object (assigned from cudaCreateTextureObject and consumed
+    // by tex2DLayered). On CUDA texture and surface handles are both unsigned
+    // long long so the original cudaSurfaceObject_t typing compiled; HIP uses
+    // distinct pointer types, so the correct texture type must be used here.
+    cudaTextureObject_t tex;
 };
 
 class Octave