Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ oxford

# Downloaded archives for tests.
*.tgz

build-hip/
170 changes: 103 additions & 67 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,26 +1,38 @@
# First-order language CUDA requires at least CMake 3.18
cmake_minimum_required(VERSION 3.24)

# Default seletion of CUDA Compute Capabilities.
# This must be called before project() or cmake sets it to the oldest non-deprecated CC
# "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
if(EXISTS "/etc/nv_tegra_release")
# The CC list for Tegras and Jetson will require manual updates
set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
CACHE
STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
else()
# The CC list for discrete GPUs will require CMake updates
set(CMAKE_CUDA_ARCHITECTURES "all-major"
CACHE
STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
# USE_HIP routes the .cu translation units through the HIP toolchain for AMD
# GPUs. Everything else keeps its plain
# CUDA spelling; cuda_to_hip.h (force-included for HIP in src/CMakeLists.txt)
# does the symbol translation. The NVIDIA path (USE_HIP=OFF) is unchanged.
option(USE_HIP "Build with HIP for AMD GPUs instead of CUDA" OFF)

if(NOT USE_HIP)
# Default seletion of CUDA Compute Capabilities.
# This must be called before project() or cmake sets it to the oldest non-deprecated CC
# "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
if(EXISTS "/etc/nv_tegra_release")
# The CC list for Tegras and Jetson will require manual updates
set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
CACHE
STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
else()
# The CC list for discrete GPUs will require CMake updates
set(CMAKE_CUDA_ARCHITECTURES "all-major"
CACHE
STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
endif()
endif()

project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA)
if(USE_HIP)
project(PopSift VERSION 0.10.1 LANGUAGES CXX HIP)
else()
project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA)

# Policy to support CUDA as a first-order language for CMake.
# Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
cmake_policy(SET CMP0104 NEW)
# Policy to support CUDA as a first-order language for CMake.
# Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
cmake_policy(SET CMP0104 NEW)
endif()

# Set build path as a folder named as the platform (linux, windows, darwin...) plus the processor type
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
Expand Down Expand Up @@ -75,70 +87,94 @@ if(BUILD_SHARED_LIBS)
# Auto-build dll exports on Windows
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
if(NOT USE_HIP)
set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
endif()
else()
message(STATUS "BUILD_SHARED_LIBS OFF")

set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
if(NOT USE_HIP)
set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
endif()
endif()

# Require threads because of std::thread.
find_package(Threads REQUIRED)

###################
# CUDA
###################
include(CheckLanguage)
check_language(CUDA)

# Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
# target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
find_package(CUDAToolkit)

message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
set(CUDA_VERSION ${CUDAToolkit_VERSION})

if(PopSift_ERRCHK_AFTER_KERNEL)
message(STATUS "Synchronizing and checking errors after every kernel call")
list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
endif()

# This may not be required any more.
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

# default stream per-thread implies that each host thread has one non-synchronizing 0-stream
# currently, the code requires legacy mode
list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--default-stream;per-thread")
if(USE_HIP)
###################
# HIP (AMD)
###################
# rocThrust supplies the thrust::cuda::par compatibility namespace, so the
# Thrust code in s_filtergrid.cu compiles unchanged on ROCm.
find_package(rocthrust REQUIRED CONFIG)

set(PopSift_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_HIP_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_HIP_STANDARD_REQUIRED ON)
set(CMAKE_HIP_SEPARABLE_COMPILATION ON)

# HIP has __shfl_down etc. unconditionally; assist.h selects the _sync
# spelling on this flag and cuda_to_hip.h maps _sync to the mask-free builtin.
set(PopSift_HAVE_SHFL_DOWN_SYNC 1)
else()
###################
# CUDA
###################
include(CheckLanguage)
check_language(CUDA)

# Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
# target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
find_package(CUDAToolkit)

message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
set(CUDA_VERSION ${CUDAToolkit_VERSION})

if(PopSift_ERRCHK_AFTER_KERNEL)
message(STATUS "Synchronizing and checking errors after every kernel call")
list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
endif()

if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
if(PopSift_NVCC_WARNINGS)
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
# This may not be required any more.
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

# default stream per-thread implies that each host thread has one non-synchronizing 0-stream
# currently, the code requires legacy mode
list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--default-stream;per-thread")

if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
if(PopSift_NVCC_WARNINGS)
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
endif()
endif()
endif()

set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
set(PopSift_CXX_STANDARD 11)
endif()
set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
set(PopSift_CXX_STANDARD 11)
endif()

if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
endif()
set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
endif()
set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CUDA_STANDARD_REQUIRED ON)


if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
set(PopSift_HAVE_SHFL_DOWN_SYNC 1)
else()
set(PopSift_HAVE_SHFL_DOWN_SYNC 0)
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
set(PopSift_HAVE_SHFL_DOWN_SYNC 1)
else()
set(PopSift_HAVE_SHFL_DOWN_SYNC 0)
endif()
endif()

if(NOT PopSift_USE_GRID_FILTER)
Expand Down
38 changes: 38 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ PopSift compiles and works with NVidia cards of compute capability >= 3.0 (inclu

CUDA SDK 11 does no longer support compute capability 3.0. 3.5 is still supported with deprecation warnings.

PopSift also runs on AMD GPUs through ROCm/HIP (validated on gfx90a and gfx1100, and on Windows gfx1151); see the `USE_HIP` build option below.

## Dependencies

PopSift depends on:
Expand Down Expand Up @@ -46,6 +48,42 @@ Some build options are available:

* `BUILD_SHARED_LIBS` (default: `ON`) controls the type of library to build (`ON` for shared libraries, `OFF` for static)

* `USE_HIP` (default: `OFF`) builds for AMD GPUs through ROCm/HIP instead of CUDA (requires a ROCm installation); see "Building for AMD GPUs (ROCm/HIP)" below.

### Building for AMD GPUs (ROCm/HIP)

With a ROCm installation, configure with `-DUSE_HIP=ON` to build the GPU code through HIP instead of CUDA. The CUDA build path is unchanged when `USE_HIP=OFF` (the default).

```shell
cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a \
-DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++ \
-DCMAKE_PREFIX_PATH=/opt/rocm
make
```

Point `-DCMAKE_PREFIX_PATH` at the ROCm install prefix (`/opt/rocm` for a default install) so CMake can locate the `hip` and `rocThrust` packages when ROCm is not already on the search path.

Set the target architecture with `-DCMAKE_HIP_ARCHITECTURES` (e.g. `gfx90a`, `gfx1100`); pass a semicolon-separated list such as `"gfx90a;gfx1100"` to emit a fat binary.

#### Windows (gfx1151 / TheRock ROCm)

Tested with the ROCm wheels from [TheRock](https://github.com/ROCm/TheRock) (see its README for installing the Windows wheels). The steps below assume `<rocm>` is that install prefix and that `clang` from it is on `PATH`:

```shell
cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1151 ^
-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang -DCMAKE_HIP_COMPILER=clang ^
-DCMAKE_PREFIX_PATH=<rocm> -DPopSift_BUILD_EXAMPLES=OFF ^
-DCMAKE_LINKER_TYPE=LLDFIX ^
-DCMAKE_C_USING_LINKER_LLDFIX=-fuse-ld=lld ^
-DCMAKE_CXX_USING_LINKER_LLDFIX=-fuse-ld=lld ^
-DCMAKE_HIP_USING_LINKER_LLDFIX=-fuse-ld=lld
cmake --build . --config Release
```

The `CMAKE_LINKER_TYPE=LLDFIX` block works around the `-fgpu-rdc` device link: CMake otherwise emits `-fuse-ld=lld-link`, which AMD clang rejects under `--hip-link`. At runtime, copy a matching `amdhip64` and `amd_comgr` from `<rocm>` next to the executable (a System32 Adrenalin-driver runtime can be device-library mismatched).

On the gfx1151 APU, `maxTexture2DLayered` is smaller than on gfx90a, so the default upscale can overflow for large images; use native-resolution downsampling (`Config::setDownsampling(0)`) or cap the input size.

## Usage

The main artifact created is `libpopsift`.
Expand Down
4 changes: 4 additions & 0 deletions doc/sphinx/source/install/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ Software

The core library depends only on Cuda >= 7.0

PopSift also builds for AMD GPUs through ROCm/HIP instead of CUDA, requiring ROCm 7.2 or newer; see the ``USE_HIP`` CMake option below.

The library includes a few sample applications that show how to use the library.
They require

Expand Down Expand Up @@ -173,6 +175,8 @@ CMake configuration can be controlled by changing the values of the following va

* :code:`PopSift_BUILD_DOC:BOOL=OFF` to enable/disable building this documentation and the Doxygen one.

* :code:`USE_HIP:BOOL=OFF` to build for AMD GPUs with ROCm/HIP instead of CUDA. Set the target GPU architecture with :code:`CMAKE_HIP_ARCHITECTURES` (for example :code:`gfx90a` for CDNA2 / MI200, or :code:`gfx1100` for RDNA3). Requires a ROCm installation (7.2 or newer) providing HIP.

For example, if you do not want to build the applications, you have to pass :code:`-DPopSift_BUILD_EXAMPLES:BOOL=OFF` and so on.


Expand Down
61 changes: 59 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,71 @@ add_library(popsift
popsift/common/vec_macros.h
popsift/common/clamp.h)

if(USE_HIP)
# Collect the .cu translation units so the HIP build can mark them LANGUAGE HIP.
# Under USE_HIP, .cu would otherwise default to CUDA (a language not enabled in
# the HIP build), so the language must be set explicitly. This is only needed on
# the HIP path, so the collection lives inside the guard.
get_target_property(_popsift_sources popsift SOURCES)
set(_popsift_cu_sources "")
foreach(_src ${_popsift_sources})
if(_src MATCHES "\\.cu$")
list(APPEND _popsift_cu_sources "${_src}")
endif()
endforeach()
set_source_files_properties(${_popsift_cu_sources} PROPERTIES LANGUAGE HIP)
endif()

target_link_libraries(popsift
PUBLIC
CUDA::cudart
Threads::Threads)
if(USE_HIP)
# rocThrust transitively links hip::device, whose INTERFACE_COMPILE_OPTIONS inject
# "-x hip" for COMPILE_LANGUAGE:CXX. Keep that device-compile interface PRIVATE so
# it does not leak "-x hip" onto plain-C++ consumers of popsift (the example apps).
# Consumers still need the HIP runtime to link and its headers to satisfy the
# public <cuda_runtime.h> include, so expose hip::host PUBLIC (link + headers, no
# "-x hip"). The library .cu sources are already LANGUAGE HIP, so they get the HIP
# compiler regardless of the link visibility.
target_link_libraries(popsift PRIVATE roc::rocthrust)
target_link_libraries(popsift PUBLIC hip::host)
else()
target_link_libraries(popsift PUBLIC CUDA::cudart)
endif()

set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d")
set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
if(USE_HIP)
# CMAKE_HIP_ARCHITECTURES is already resolved by project(... LANGUAGES CXX HIP)
# in the top-level CMakeLists: it auto-detects the host GPU arch, honors an
# explicit -DCMAKE_HIP_ARCHITECTURES, and errors on a host with no GPU. Pass
# -DCMAKE_HIP_ARCHITECTURES to override.
set_target_properties(popsift PROPERTIES
HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
# USE_HIP selects the HIP branch of cuda_to_hip.h. __HIP_PLATFORM_AMD__ is
# only defined once the HIP runtime header is included, so it cannot gate the
# include of that very header; the explicit define breaks the cycle.
target_compile_definitions(popsift PRIVATE USE_HIP)
# Force-include the compat header into every HIP translation unit so no
# per-file #include edits are needed (minimal footprint).
# -fgpu-rdc keeps device code relocatable so the cross-TU __device__/__constant__
# globals (d_consts, d_gauss, dct/dbuf/dobuf -- declared extern in headers,
# defined in one .cu, used in many) are resolved by the final --hip-link step.
# Without it each .o is device-linked standalone and those references are
# undefined. The matching -fgpu-rdc on the link rule (below) performs the
# device link.
target_compile_options(popsift PRIVATE
$<$<COMPILE_LANGUAGE:HIP>:-include${CMAKE_CURRENT_SOURCE_DIR}/popsift/cuda_to_hip.h>
$<$<COMPILE_LANGUAGE:HIP>:-fgpu-rdc>)
target_link_options(popsift PRIVATE -fgpu-rdc --hip-link)
# hip_compat/ supplies a <cuda_runtime.h> shim for the sources that include it
# directly; HIP build only. PRIVATE so the public popsift.h <cuda_runtime.h>
# include still resolves to the real CUDA header for downstream NVIDIA users.
target_include_directories(popsift PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/popsift/hip_compat)
else()
set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
endif()
if(MSVC)
target_compile_options(popsift PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/Zc:preprocessor>)
endif()
Expand Down
31 changes: 29 additions & 2 deletions src/application/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,24 @@ endif()

add_executable(popsift-demo main.cpp pgmread.cpp pgmread.h)

set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
if(USE_HIP)
# main.cpp/pgmread.cpp have NO device code; they consume popsift's host API only.
# popsift now links the HIP device-compile interface (roc::rocthrust -> hip::device,
# which injects "-x hip") PRIVATE and exposes only hip::host PUBLIC, so these
# sources compile as plain C++ (no "-x hip") and just link the HIP runtime. They
# still need the hip_compat <cuda_runtime.h> shim and the cuda_to_hip.h force-include
# because popsift's public headers (device_prop.h) #include <cuda_runtime.h>; the
# library keeps those PRIVATE, so the consumer adds them explicitly here. USE_HIP
# selects the HIP branch of cuda_to_hip.h. C++17 for the compat header.
set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 17)
target_compile_definitions(popsift-demo PRIVATE USE_HIP)
target_include_directories(popsift-demo PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat)
target_compile_options(popsift-demo PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>)
else()
set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
endif()

target_compile_options(popsift-demo PRIVATE ${PD_COMPILE_OPTIONS} )
target_include_directories(popsift-demo PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS})
Expand All @@ -84,7 +101,17 @@ target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS})

add_executable(popsift-match match.cpp pgmread.cpp pgmread.h)

set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
if(USE_HIP)
# See popsift-demo above: plain-C++ host consumer, no device code, no "-x hip".
set_property(TARGET popsift-match PROPERTY CXX_STANDARD 17)
target_compile_definitions(popsift-match PRIVATE USE_HIP)
target_include_directories(popsift-match PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat)
target_compile_options(popsift-match PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>)
else()
set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
endif()

target_compile_options(popsift-match PRIVATE ${PD_COMPILE_OPTIONS} )
target_include_directories(popsift-match PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS})
Expand Down
Loading