Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -682,9 +682,26 @@ if get_option('build_backends')
deps += cc.find_library('mkl_core', required: true)
deps += cc.find_library('OpenCL', required: true)
elif get_option('sycl') == 'amd'
error('Building SYCL for AMD backend not yet supported')
deps += cc.find_library('hipblas', required: true)
deps += cc.find_library('amdhip64', required: true)
add_project_arguments('-DUSE_HIPBLAS=ON', language : 'cpp')
add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp')
if get_option('amd_gfx') == ''
error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. 90a, 1100 or similar)')
endif
add_project_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp')
add_project_link_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp')
else
error('Building SYCL for the NVIDIA backend not yet supported')
deps += cc.find_library('cublas', required: true)
deps += cc.find_library('cudart', required: true)
add_project_arguments('-DUSE_CUBLAS=ON', language : 'cpp')
if get_option('cc_cuda') != ''
sycl_nvidia_target = 'nvidia_gpu_sm_' + get_option('cc_cuda')
else
sycl_nvidia_target = 'nvptx64-nvidia-cuda'
endif
add_project_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp')
add_project_link_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp')
endif
if host_machine.system() == 'windows'
# For sycl under windows we need to link using icx to generate the device code.
Expand Down
5 changes: 5 additions & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,11 @@ option('cc_cuda',
value: '',
description: 'Build for a specific cuda CC, e.g. -Dcc_cuda=35 for CC 3.5')

option('amd_gfx',
type: 'string',
value: '',
description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=90a for gfx90a')

option('onnx_libdir',
type: 'string',
value: '',
Expand Down
16 changes: 11 additions & 5 deletions src/neural/backends/sycl/common_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@
#include "winograd_helper.h"
#include <cmath>

#if defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
Copy link

Copilot AI Jun 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The SYCL_SUB_GROUP_SIZE macro is defined locally here; since it’s duplicated in multiple files, consider extracting it to a shared header to avoid divergence and improve maintainability.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good point, moved to sycl_common.h

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it possible that we may also need to set SYCL_SUB_GROUP_SIZE for future architectures?
Then it may make sense to use this pattern:

#if __has_include("params_override.h")
#include "params_override.h"
#endif

#ifndef SYCL_SUB_GROUP_SIZE
#if defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
#define SYCL_SUB_GROUP_SIZE 64
#else
#define SYCL_SUB_GROUP_SIZE 32
#endif
#endif

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you'd like an interesting read, the full context for this very specific definition is ROCm/ROCm#4121 where an AMD engineer recommended this claiming it "will work without needing revisiting in the foreseeable future". I also can't imagine the other major vendors not supporting a sub-group size of 32 any time soon. Supporting other SYCL devices than AMD/Intel/NVIDIA GPUs would require big changes to the code as it is currently, so I'm quite confident this won't be needed for now. I would suggest implementing this pattern if/when a need for this comes up, but if you think it's useful I don't mind adding it.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems fine then.

#define SYCL_SUB_GROUP_SIZE 64
#else
#define SYCL_SUB_GROUP_SIZE 32
#endif

namespace lczero {
namespace sycldnn_backend {
namespace {
Expand Down Expand Up @@ -936,7 +942,7 @@ void globalAvgPool(int N, int C, T* output, const T* input,
sycl::nd_range<3>(
sycl::range<3>(1, 1, blocks) * sycl::range<3>(1, 1, kBlockSize),
sycl::range<3>(1, 1, kBlockSize)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
globalAvgPool_kernel(output, input, prevLayerBias, N * C * kPlaneSize,
N * C, C, item_ct1);
});
Expand Down Expand Up @@ -1070,7 +1076,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
sycl::range<3>(1, 1, C)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
OutputTransform_SE_relu_InputTransform_kernel<float, activation,
use_bias, use_skip>(
N, C, se_K, output, input, (float*)skip, bias, w1, b1, w2, b2,
Expand Down Expand Up @@ -1218,7 +1224,7 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que
sycl::nd_range<3>(
sycl::range<3>(1, 1, blocks) * sycl::range<3>(1, 1, kBlockSize),
sycl::range<3>(1, 1, kBlockSize)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
softmax_opt_64_kernel<T>(output, input, input2, size, item_ct1);
});
}
Expand All @@ -1235,7 +1241,7 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
sycl::range<3>(1, 1, C)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
softmax_kernel<T>(output, input, input2, item_ct1, sum_acc_ct1,
maxval_acc_ct1);
});
Expand Down Expand Up @@ -1461,7 +1467,7 @@ void LayerNorm(int N, int C, T* output, const T* input, const T* bias,

cgh.parallel_for(
sycl::nd_range<3>(gridDim * blockDim, blockDim),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
layer_norm_kernel<T>(N, C, output, input, bias, skip, gammas, betas,
ep, alpha, act, item_ct1, sum_acc_ct1);
});
Expand Down
10 changes: 8 additions & 2 deletions src/neural/backends/sycl/fp16_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,12 @@
#endif
#include "winograd_helper.h"

#if defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
Copy link

Copilot AI Jun 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] This defines SYCL_SUB_GROUP_SIZE again; extracting it to a common header would reduce duplication and ensure consistency across kernels.

Copilot uses AI. Check for mistakes.
#define SYCL_SUB_GROUP_SIZE 64
#else
#define SYCL_SUB_GROUP_SIZE 32
#endif

namespace lczero {
namespace sycldnn_backend {

Expand Down Expand Up @@ -749,7 +755,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
sycl::range<3>(1, 1, C)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
32)]] {
SYCL_SUB_GROUP_SIZE)]] {
OutputInputTransformKernel_fp16_shmem_board<activation,
use_bias, use_skip>(
N, C, se_K, (sycl::half*)output, (const sycl::half*)input,
Expand Down Expand Up @@ -798,7 +804,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
sycl::range<3>(1, 1, C)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
OutputTransform_SE_relu_InputTransform_kernel<
sycl::half, activation, use_bias, use_skip>(
N, C, se_K, output, input, (sycl::half*)skip, bias, w1, b1, w2,
Expand Down
82 changes: 62 additions & 20 deletions src/neural/backends/sycl/layers.cc.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
half alpha = one_h;
half beta = zero_h;

#elif defined(USE_HIPBLAS)
hipblasHalf alpha{1};
hipblasHalf beta{0};

#else
sycl::half alpha = 1;
sycl::half beta = 0;
Expand Down Expand Up @@ -393,10 +397,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);

hipblasSgemm(handle, transpose_type_transpose,
hipblasHgemm(handle, transpose_type_transpose,
transpose_type_notranspose,numFc1Out_, N, C, &alpha,
((const sycl::half *)w1_), C, ((const sycl::half *)op2), C,
&beta, ((sycl::half *)op1), numFc1Out_);
((const hipblasHalf *)w1_), C, ((const hipblasHalf *)op2), C,
&beta, ((hipblasHalf *)op1), numFc1Out_);

hipStreamSynchronize(hipStreamHandle);
});
Expand Down Expand Up @@ -436,10 +440,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);

hipblasSgemm(
hipblasHgemm(
handle, transpose_type_transpose, transpose_type_notranspose, 2 * C,
N, numFc1Out_, &alpha,((const sycl::half *)w2_), numFc1Out_,
((const sycl::half *)op1), numFc1Out_, &beta, ((sycl::half *)op2),
N, numFc1Out_, &alpha,((const hipblasHalf *)w2_), numFc1Out_,
((const hipblasHalf *)op1), numFc1Out_, &beta, ((hipblasHalf *)op2),
2 * C);

hipStreamSynchronize(hipStreamHandle);
Expand Down Expand Up @@ -544,6 +548,10 @@ template <>
half alpha = one_h;
half beta = zero_h;

#elif defined(USE_HIPBLAS)
hipblasHalf alpha{1};
hipblasHalf beta{0};

#else
sycl::half alpha = 1;
sycl::half beta = 0;
Expand Down Expand Up @@ -576,11 +584,11 @@ template <>
sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);

hipblasSgemm(
hipblasHgemm(
handle, transpose_type_transpose, transpose_type_notranspose,
num_outputs, N, num_inputs, &alpha, ((const sycl::half *)weights_),
num_inputs, ((const sycl::half *)input_tensor), num_inputs, &beta,
((sycl::half *)output_tensor), num_outputs);
num_outputs, N, num_inputs, &alpha, ((const hipblasHalf *)weights_),
num_inputs, ((const hipblasHalf *)input_tensor), num_inputs, &beta,
((hipblasHalf *)output_tensor), num_outputs);

hipStreamSynchronize(hipStreamHandle);
});
Expand Down Expand Up @@ -964,7 +972,7 @@ template <>

hipStreamSynchronize(hipStreamHandle);
});
);
});
#else
int64_t M_ = M;
int64_t N_ = N;
Expand Down Expand Up @@ -1807,7 +1815,20 @@ static void cublasXgemm(transpose_type transa,
});
}
#elif defined(USE_HIPBLAS)
hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
if (fp16) {
Copy link

Copilot AI Jun 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The if (fp16) blocks across multiple GEMM routines duplicate conversion and submission logic; extracting common fp16-path code into a helper could reduce repetition and simplify future updates.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactoring the code structure is beyond the scope of this PR which brings minimal changes required to enable the backends while keeping everything else untouched. The new blocks in the HIP path follow the same style as the existing blocks for the CUDA path.

unsigned short alpha_h = FP32toFP16(alpha);
unsigned short beta_h = FP32toFP16(beta);
sycl_queue.submit([&](sycl::handler &cgh) {
cgh.host_task([=](sycl::interop_handle ih) {
auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);
hipblasHgemm(handle, transa, transb, m, n, k, &alpha_h, (const hipblasHalf*)A,
lda, (const hipblasHalf*)B, ldb, &beta_h, (hipblasHalf*)C, ldc);
hipStreamSynchronize(hipStreamHandle);
});
});
} else {
sycl_queue.submit([&](sycl::handler &cgh) {
cgh.host_task([=](sycl::interop_handle ih) {
auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
Expand All @@ -1816,6 +1837,7 @@ static void cublasXgemm(transpose_type transa,
hipStreamSynchronize(hipStreamHandle);
});
});
}
#else
oneapi::mkl::blas::column_major::gemm(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda,
(const DataType *)B, ldb, beta, (DataType *)C, ldc);
Expand Down Expand Up @@ -1873,9 +1895,29 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran
});
}
#elif defined(USE_HIPBLAS)
hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
if (fp16) {
unsigned short alpha_h = FP32toFP16(alpha);
unsigned short beta_h = FP32toFP16(beta);

sycl_queue.submit([&](sycl::handler &cgh) {

cgh.host_task([=](sycl::interop_handle ih) {

auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);

hipblasGemmStridedBatchedEx(
handle, transa, transb, m, n, k, &alpha_h, A, HIPBLAS_R_16F, lda, strideA, B,
HIPBLAS_R_16F, ldb, strideB, &beta_h, C, HIPBLAS_R_16F, ldc, strideC,
batchCount, HIPBLAS_R_16F, HIPBLAS_GEMM_DEFAULT);

hipStreamSynchronize(hipStreamHandle);

sycl_queue.submit([&](sycl::handler &cgh) {
});
});
} else {
sycl_queue.submit([&](sycl::handler &cgh) {

cgh.host_task([=](sycl::interop_handle ih) {

Expand All @@ -1891,9 +1933,10 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran

});
});
#else
oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount);
#endif
}
#else
oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount);
#endif
}

template <typename DataType>
Expand Down Expand Up @@ -1962,8 +2005,8 @@ static void cublasXGemmBatched(transpose_type transa,
hipblasSetStream(handle, hipStreamHandle);

hipblasHgemmBatched(
handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda,
(half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount);
handle, transa, transb, m, n, k, (const hipblasHalf*)&alpha_h, (hipblasHalf**)A, lda,
(hipblasHalf**)B, ldb, (const hipblasHalf*)&beta_h, (hipblasHalf**)C, ldc, batchCount);

hipStreamSynchronize(hipStreamHandle);

Expand Down Expand Up @@ -2507,7 +2550,6 @@ template <typename DataType>
AttentionBody<DataType>::~AttentionBody() {
sycl::free(ip_emb_w_, sycl_queue_);
sycl::free(ip_emb_b_, sycl_queue_);
sycl::free(pos_encoding_, sycl_queue_);
if (is_pe_dense_embedding_) {
sycl::free(ip_emb_pre_w_, sycl_queue_);
sycl::free(ip_emb_pre_b_, sycl_queue_);
Expand Down