Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,5 @@ oxford

# Downloaded archives for tests.
*.tgz

build-hip/
170 changes: 103 additions & 67 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,26 +1,38 @@
# First-order language CUDA requires at least CMake 3.18
cmake_minimum_required(VERSION 3.24)

# Default seletion of CUDA Compute Capabilities.
# This must be called before project() or cmake sets it to the oldest non-deprecated CC
# "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
if(EXISTS "/etc/nv_tegra_release")
# The CC list for Tegras and Jetson will require manual updates
set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
CACHE
STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
else()
# The CC list for discrete GPUs will require CMake updates
set(CMAKE_CUDA_ARCHITECTURES "all-major"
CACHE
STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
# USE_HIP routes the .cu translation units through the HIP toolchain for AMD
# GPUs. Everything else keeps its plain
# CUDA spelling; cuda_to_hip.h (force-included for HIP in src/CMakeLists.txt)
# does the symbol translation. The NVIDIA path (USE_HIP=OFF) is unchanged.
option(USE_HIP "Build with HIP for AMD GPUs instead of CUDA" OFF)

if(NOT USE_HIP)
# Default seletion of CUDA Compute Capabilities.
# This must be called before project() or cmake sets it to the oldest non-deprecated CC
# "all" and "all-major" work for Intel and perhaps for ARM with discrete GPUs, but not Tegra and Jetson.
if(EXISTS "/etc/nv_tegra_release")
# The CC list for Tegras and Jetson will require manual updates
set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87"
CACHE
STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
else()
# The CC list for discrete GPUs will require CMake updates
set(CMAKE_CUDA_ARCHITECTURES "all-major"
CACHE
STRING "Which CUDA CCs to support: native, all, all-major or an explicit list delimited by semicolons")
endif()
endif()

project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA)
if(USE_HIP)
project(PopSift VERSION 0.10.1 LANGUAGES CXX HIP)
else()
project(PopSift VERSION 0.10.1 LANGUAGES CXX CUDA)

# Policy to support CUDA as a first-order language for CMake.
# Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
cmake_policy(SET CMP0104 NEW)
# Policy to support CUDA as a first-order language for CMake.
# Since CMake 3.18. See https://cmake.org/cmake/help/latest/policy/CMP0104.html
cmake_policy(SET CMP0104 NEW)
endif()

# Set build path as a folder named as the platform (linux, windows, darwin...) plus the processor type
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/${CMAKE_SYSTEM_NAME}-${CMAKE_SYSTEM_PROCESSOR}")
Expand Down Expand Up @@ -75,70 +87,94 @@ if(BUILD_SHARED_LIBS)
# Auto-build dll exports on Windows
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)

set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
if(NOT USE_HIP)
set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
endif()
else()
message(STATUS "BUILD_SHARED_LIBS OFF")

set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
if(NOT USE_HIP)
set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
endif()
endif()

# Require threads because of std::thread.
find_package(Threads REQUIRED)

###################
# CUDA
###################
include(CheckLanguage)
check_language(CUDA)

# Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
# target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
find_package(CUDAToolkit)

message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
set(CUDA_VERSION ${CUDAToolkit_VERSION})

if(PopSift_ERRCHK_AFTER_KERNEL)
message(STATUS "Synchronizing and checking errors after every kernel call")
list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
endif()

# This may not be required any more.
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

# default stream per-thread implies that each host thread has one non-synchronizing 0-stream
# currently, the code requires legacy mode
list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--default-stream;per-thread")
if(USE_HIP)
###################
# HIP (AMD)
###################
# rocThrust supplies the thrust::cuda::par compatibility namespace, so the
# Thrust code in s_filtergrid.cu compiles unchanged on ROCm.
find_package(rocthrust REQUIRED CONFIG)

set(PopSift_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_HIP_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_HIP_STANDARD_REQUIRED ON)
set(CMAKE_HIP_SEPARABLE_COMPILATION ON)

# HIP has __shfl_down etc. unconditionally; assist.h selects the _sync
# spelling on this flag and cuda_to_hip.h maps _sync to the mask-free builtin.
set(PopSift_HAVE_SHFL_DOWN_SYNC 1)
else()
###################
# CUDA
###################
include(CheckLanguage)
check_language(CUDA)

# Use this if necessary: "cmake -DCUDAToolkit_ROOT=/some/path"
# target_link_libraries(binary_linking_to_cudart PRIVATE CUDA::cudart)
find_package(CUDAToolkit)

message(STATUS "CUDA Version is ${CUDAToolkit_VERSION}")
set(CUDA_VERSION ${CUDAToolkit_VERSION})

if(PopSift_ERRCHK_AFTER_KERNEL)
message(STATUS "Synchronizing and checking errors after every kernel call")
list(APPEND CUDA_NVCC_FLAGS "-DERRCHK_AFTER_KERNEL")
endif()

if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
if(PopSift_NVCC_WARNINGS)
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
# This may not be required any more.
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

# default stream per-thread implies that each host thread has one non-synchronizing 0-stream
# currently, the code requires legacy mode
list(APPEND CUDA_NVCC_FLAGS "--default-stream;legacy")
# set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};--default-stream;per-thread")

if(CUDA_VERSION VERSION_GREATER_EQUAL "7.5")
if(PopSift_NVCC_WARNINGS)
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-lmem-usage")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;-warn-spills")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-local-memory-usage")
list(APPEND CUDA_NVCC_FLAGS_RELEASE "-Xptxas;--warn-on-spills")
endif()
endif()
endif()

set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
set(PopSift_CXX_STANDARD 11)
endif()
set(PopSift_CXX_STANDARD 17) # Thrust/CUB requires C++14 starting with CUDA SDK 11
if(CUDA_VERSION_MAJOR LESS_EQUAL 8)
set(PopSift_CXX_STANDARD 11)
endif()

if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
endif()
set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
if(NOT MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${PopSift_CXX_STANDARD}")
list(APPEND CUDA_NVCC_FLAGS "-std=c++${PopSift_CXX_STANDARD}")
endif()
set(CMAKE_CXX_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD ${PopSift_CXX_STANDARD})
set(CMAKE_CUDA_STANDARD_REQUIRED ON)


if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
set(PopSift_HAVE_SHFL_DOWN_SYNC 1)
else()
set(PopSift_HAVE_SHFL_DOWN_SYNC 0)
if(CUDA_VERSION VERSION_GREATER_EQUAL "9.0")
set(PopSift_HAVE_SHFL_DOWN_SYNC 1)
else()
set(PopSift_HAVE_SHFL_DOWN_SYNC 0)
endif()
endif()

if(NOT PopSift_USE_GRID_FILTER)
Expand Down
35 changes: 35 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ PopSift compiles and works with NVidia cards of compute capability >= 3.0 (inclu

CUDA SDK 11 does no longer support compute capability 3.0. 3.5 is still supported with deprecation warnings.

PopSift also runs on AMD GPUs through ROCm/HIP (validated on gfx90a and gfx1100, and on Windows gfx1151); see the `USE_HIP` build option below.

## Dependencies

PopSift depends on:
Expand Down Expand Up @@ -46,6 +48,39 @@ Some build options are available:

* `BUILD_SHARED_LIBS` (default: `ON`) controls the type of library to build (`ON` for shared libraries, `OFF` for static)

* `USE_HIP` (default: `OFF`) builds for AMD GPUs through ROCm/HIP instead of CUDA (requires a ROCm installation); see "Building for AMD GPUs (ROCm/HIP)" below.

### Building for AMD GPUs (ROCm/HIP)

With a ROCm installation, configure with `-DUSE_HIP=ON` to build the GPU code through HIP instead of CUDA. The CUDA build path is unchanged when `USE_HIP=OFF` (the default).

```shell
cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx90a \
-DCMAKE_HIP_COMPILER=/opt/rocm/llvm/bin/clang++
make
```

Set the target architecture with `-DCMAKE_HIP_ARCHITECTURES` (e.g. `gfx90a`, `gfx1100`); pass a semicolon-separated list such as `"gfx90a;gfx1100"` to emit a fat binary.

#### Windows (gfx1151 / TheRock ROCm)

Tested with the ROCm wheels from [TheRock](https://github.com/ROCm/TheRock) (see its README for installing the Windows wheels). The steps below assume `<rocm>` is that install prefix and that `clang` from it is on `PATH`:

```shell
cmake .. -DUSE_HIP=ON -DCMAKE_HIP_ARCHITECTURES=gfx1151 ^
-DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang -DCMAKE_HIP_COMPILER=clang ^
-DCMAKE_PREFIX_PATH=<rocm> -DPopSift_BUILD_EXAMPLES=OFF ^
-DCMAKE_LINKER_TYPE=LLDFIX ^
-DCMAKE_C_USING_LINKER_LLDFIX=-fuse-ld=lld ^
-DCMAKE_CXX_USING_LINKER_LLDFIX=-fuse-ld=lld ^
-DCMAKE_HIP_USING_LINKER_LLDFIX=-fuse-ld=lld
cmake --build . --config Release
```

The `CMAKE_LINKER_TYPE=LLDFIX` block works around the `-fgpu-rdc` device link: CMake otherwise emits `-fuse-ld=lld-link`, which AMD clang rejects under `--hip-link`. At runtime, copy a matching `amdhip64` and `amd_comgr` from `<rocm>` next to the executable (a System32 Adrenalin-driver runtime can be device-library mismatched).

On the gfx1151 APU, `maxTexture2DLayered` is smaller than on gfx90a, so the default upscale can overflow for large images; use native-resolution downsampling (`Config::setDownsampling(0)`) or cap the input size.

## Usage

The main artifact created is `libpopsift`.
Expand Down
60 changes: 58 additions & 2 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,70 @@ add_library(popsift
popsift/common/vec_macros.h
popsift/common/clamp.h)

if(USE_HIP)
# Collect the .cu translation units so the HIP build can mark them LANGUAGE HIP.
# Under USE_HIP, .cu would otherwise default to CUDA (a language not enabled in
# the HIP build), so the language must be set explicitly. This is only needed on
# the HIP path, so the collection lives inside the guard.
get_target_property(_popsift_sources popsift SOURCES)
set(_popsift_cu_sources "")
foreach(_src ${_popsift_sources})
if(_src MATCHES "\\.cu$")
list(APPEND _popsift_cu_sources "${_src}")
endif()
endforeach()
set_source_files_properties(${_popsift_cu_sources} PROPERTIES LANGUAGE HIP)
endif()

target_link_libraries(popsift
PUBLIC
CUDA::cudart
Threads::Threads)
if(USE_HIP)
# rocThrust transitively links hip::device, whose INTERFACE_COMPILE_OPTIONS inject
# "-x hip" for COMPILE_LANGUAGE:CXX. Keep that device-compile interface PRIVATE so
# it does not leak "-x hip" onto plain-C++ consumers of popsift (the example apps).
# Consumers still need the HIP runtime to link and its headers to satisfy the
# public <cuda_runtime.h> include, so expose hip::host PUBLIC (link + headers, no
# "-x hip"). The library .cu sources are already LANGUAGE HIP, so they get the HIP
# compiler regardless of the link visibility.
target_link_libraries(popsift PRIVATE roc::rocthrust)
target_link_libraries(popsift PUBLIC hip::host)
else()
target_link_libraries(popsift PUBLIC CUDA::cudart)
endif()

set_target_properties(popsift PROPERTIES VERSION ${PROJECT_VERSION})
set_target_properties(popsift PROPERTIES DEBUG_POSTFIX "d")
set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
if(USE_HIP)
if(NOT DEFINED CMAKE_HIP_ARCHITECTURES OR CMAKE_HIP_ARCHITECTURES STREQUAL "")
set(CMAKE_HIP_ARCHITECTURES "gfx90a")
endif()
set_target_properties(popsift PROPERTIES
HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}")
# USE_HIP selects the HIP branch of cuda_to_hip.h. __HIP_PLATFORM_AMD__ is
# only defined once the HIP runtime header is included, so it cannot gate the
# include of that very header; the explicit define breaks the cycle.
target_compile_definitions(popsift PRIVATE USE_HIP)
# Force-include the compat header into every HIP translation unit so no
# per-file #include edits are needed (minimal footprint).
# -fgpu-rdc keeps device code relocatable so the cross-TU __device__/__constant__
# globals (d_consts, d_gauss, dct/dbuf/dobuf -- declared extern in headers,
# defined in one .cu, used in many) are resolved by the final --hip-link step.
# Without it each .o is device-linked standalone and those references are
# undefined. The matching -fgpu-rdc on the link rule (below) performs the
# device link.
target_compile_options(popsift PRIVATE
$<$<COMPILE_LANGUAGE:HIP>:-include${CMAKE_CURRENT_SOURCE_DIR}/popsift/cuda_to_hip.h>
$<$<COMPILE_LANGUAGE:HIP>:-fgpu-rdc>)
target_link_options(popsift PRIVATE -fgpu-rdc --hip-link)
# hip_compat/ supplies a <cuda_runtime.h> shim for the sources that include it
# directly; HIP build only. PRIVATE so the public popsift.h <cuda_runtime.h>
# include still resolves to the real CUDA header for downstream NVIDIA users.
target_include_directories(popsift PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/popsift/hip_compat)
else()
set_target_properties(popsift PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
endif()
if(MSVC)
target_compile_options(popsift PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xcompiler=/Zc:preprocessor>)
endif()
Expand Down
31 changes: 29 additions & 2 deletions src/application/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,24 @@ endif()

add_executable(popsift-demo main.cpp pgmread.cpp pgmread.h)

set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
if(USE_HIP)
# main.cpp/pgmread.cpp have NO device code; they consume popsift's host API only.
# popsift now links the HIP device-compile interface (roc::rocthrust -> hip::device,
# which injects "-x hip") PRIVATE and exposes only hip::host PUBLIC, so these
# sources compile as plain C++ (no "-x hip") and just link the HIP runtime. They
# still need the hip_compat <cuda_runtime.h> shim and the cuda_to_hip.h force-include
# because popsift's public headers (device_prop.h) #include <cuda_runtime.h>; the
# library keeps those PRIVATE, so the consumer adds them explicitly here. USE_HIP
# selects the HIP branch of cuda_to_hip.h. C++17 for the compat header.
set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 17)
target_compile_definitions(popsift-demo PRIVATE USE_HIP)
target_include_directories(popsift-demo PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat)
target_compile_options(popsift-demo PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>)
else()
set_property(TARGET popsift-demo PROPERTY CXX_STANDARD 11)
endif()

target_compile_options(popsift-demo PRIVATE ${PD_COMPILE_OPTIONS} )
target_include_directories(popsift-demo PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS})
Expand All @@ -84,7 +101,17 @@ target_link_libraries(popsift-demo PUBLIC PopSift::popsift ${PD_LINK_LIBS})

add_executable(popsift-match match.cpp pgmread.cpp pgmread.h)

set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
if(USE_HIP)
# See popsift-demo above: plain-C++ host consumer, no device code, no "-x hip".
set_property(TARGET popsift-match PROPERTY CXX_STANDARD 17)
target_compile_definitions(popsift-match PRIVATE USE_HIP)
target_include_directories(popsift-match PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../popsift/hip_compat)
target_compile_options(popsift-match PRIVATE
$<$<COMPILE_LANGUAGE:CXX>:-include${CMAKE_CURRENT_SOURCE_DIR}/../popsift/cuda_to_hip.h>)
else()
set_property(TARGET popsift-match PROPERTY CXX_STANDARD 11)
endif()

target_compile_options(popsift-match PRIVATE ${PD_COMPILE_OPTIONS} )
target_include_directories(popsift-match PUBLIC PopSift::popsift ${PD_INCLUDE_DIRS})
Expand Down
Loading