Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -682,9 +682,26 @@ if get_option('build_backends')
deps += cc.find_library('mkl_core', required: true)
deps += cc.find_library('OpenCL', required: true)
elif get_option('sycl') == 'amd'
error('Building SYCL for AMD backend not yet supported')
deps += cc.find_library('hipblas', required: true)
deps += cc.find_library('amdhip64', required: true)
add_project_arguments('-DUSE_HIPBLAS=ON', language : 'cpp')
add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp')
if get_option('amd_gfx') == ''
error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. 90a, 1100 or similar)')
endif
add_project_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp')
add_project_link_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp')
else
error('Building SYCL for the NVIDIA backend not yet supported')
deps += cc.find_library('cublas', required: true)
deps += cc.find_library('cudart', required: true)
add_project_arguments('-DUSE_CUBLAS=ON', language : 'cpp')
if get_option('cc_cuda') != ''
sycl_nvidia_target = 'nvidia_gpu_sm_' + get_option('cc_cuda')
else
sycl_nvidia_target = 'nvptx64-nvidia-cuda'
endif
add_project_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp')
add_project_link_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp')
endif
if host_machine.system() == 'windows'
# For sycl under windows we need to link using icx to generate the device code.
Expand Down
5 changes: 5 additions & 0 deletions meson_options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,11 @@ option('cc_cuda',
value: '',
description: 'Build for a specific cuda CC, e.g. -Dcc_cuda=35 for CC 3.5')

option('amd_gfx',
type: 'string',
value: '',
description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=90a for gfx90a')

option('onnx_libdir',
type: 'string',
value: '',
Expand Down
10 changes: 5 additions & 5 deletions src/neural/backends/sycl/common_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -936,7 +936,7 @@ void globalAvgPool(int N, int C, T* output, const T* input,
sycl::nd_range<3>(
sycl::range<3>(1, 1, blocks) * sycl::range<3>(1, 1, kBlockSize),
sycl::range<3>(1, 1, kBlockSize)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
globalAvgPool_kernel(output, input, prevLayerBias, N * C * kPlaneSize,
N * C, C, item_ct1);
});
Expand Down Expand Up @@ -1070,7 +1070,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
sycl::range<3>(1, 1, C)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
OutputTransform_SE_relu_InputTransform_kernel<float, activation,
use_bias, use_skip>(
N, C, se_K, output, input, (float*)skip, bias, w1, b1, w2, b2,
Expand Down Expand Up @@ -1218,7 +1218,7 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que
sycl::nd_range<3>(
sycl::range<3>(1, 1, blocks) * sycl::range<3>(1, 1, kBlockSize),
sycl::range<3>(1, 1, kBlockSize)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
softmax_opt_64_kernel<T>(output, input, input2, size, item_ct1);
});
}
Expand All @@ -1235,7 +1235,7 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
sycl::range<3>(1, 1, C)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
softmax_kernel<T>(output, input, input2, item_ct1, sum_acc_ct1,
maxval_acc_ct1);
});
Expand Down Expand Up @@ -1461,7 +1461,7 @@ void LayerNorm(int N, int C, T* output, const T* input, const T* bias,

cgh.parallel_for(
sycl::nd_range<3>(gridDim * blockDim, blockDim),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
layer_norm_kernel<T>(N, C, output, input, bias, skip, gammas, betas,
ep, alpha, act, item_ct1, sum_acc_ct1);
});
Expand Down
4 changes: 2 additions & 2 deletions src/neural/backends/sycl/fp16_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -749,7 +749,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
sycl::range<3>(1, 1, C)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(
32)]] {
SYCL_SUB_GROUP_SIZE)]] {
OutputInputTransformKernel_fp16_shmem_board<activation,
use_bias, use_skip>(
N, C, se_K, (sycl::half*)output, (const sycl::half*)input,
Expand Down Expand Up @@ -798,7 +798,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input,
cgh.parallel_for(
sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C),
sycl::range<3>(1, 1, C)),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] {
OutputTransform_SE_relu_InputTransform_kernel<
sycl::half, activation, use_bias, use_skip>(
N, C, se_K, output, input, (sycl::half*)skip, bias, w1, b1, w2,
Expand Down
82 changes: 62 additions & 20 deletions src/neural/backends/sycl/layers.cc.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
half alpha = one_h;
half beta = zero_h;

#elif defined(USE_HIPBLAS)
hipblasHalf alpha{1};
hipblasHalf beta{0};

#else
sycl::half alpha = 1;
sycl::half beta = 0;
Expand Down Expand Up @@ -393,10 +397,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);

hipblasSgemm(handle, transpose_type_transpose,
hipblasHgemm(handle, transpose_type_transpose,
transpose_type_notranspose,numFc1Out_, N, C, &alpha,
((const sycl::half *)w1_), C, ((const sycl::half *)op2), C,
&beta, ((sycl::half *)op1), numFc1Out_);
((const hipblasHalf *)w1_), C, ((const hipblasHalf *)op2), C,
&beta, ((hipblasHalf *)op1), numFc1Out_);

hipStreamSynchronize(hipStreamHandle);
});
Expand Down Expand Up @@ -436,10 +440,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu
sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);

hipblasSgemm(
hipblasHgemm(
handle, transpose_type_transpose, transpose_type_notranspose, 2 * C,
N, numFc1Out_, &alpha,((const sycl::half *)w2_), numFc1Out_,
((const sycl::half *)op1), numFc1Out_, &beta, ((sycl::half *)op2),
N, numFc1Out_, &alpha,((const hipblasHalf *)w2_), numFc1Out_,
((const hipblasHalf *)op1), numFc1Out_, &beta, ((hipblasHalf *)op2),
2 * C);

hipStreamSynchronize(hipStreamHandle);
Expand Down Expand Up @@ -544,6 +548,10 @@ template <>
half alpha = one_h;
half beta = zero_h;

#elif defined(USE_HIPBLAS)
hipblasHalf alpha{1};
hipblasHalf beta{0};

#else
sycl::half alpha = 1;
sycl::half beta = 0;
Expand Down Expand Up @@ -576,11 +584,11 @@ template <>
sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);

hipblasSgemm(
hipblasHgemm(
handle, transpose_type_transpose, transpose_type_notranspose,
num_outputs, N, num_inputs, &alpha, ((const sycl::half *)weights_),
num_inputs, ((const sycl::half *)input_tensor), num_inputs, &beta,
((sycl::half *)output_tensor), num_outputs);
num_outputs, N, num_inputs, &alpha, ((const hipblasHalf *)weights_),
num_inputs, ((const hipblasHalf *)input_tensor), num_inputs, &beta,
((hipblasHalf *)output_tensor), num_outputs);

hipStreamSynchronize(hipStreamHandle);
});
Expand Down Expand Up @@ -964,7 +972,7 @@ template <>

hipStreamSynchronize(hipStreamHandle);
});
);
});
#else
int64_t M_ = M;
int64_t N_ = N;
Expand Down Expand Up @@ -1807,7 +1815,20 @@ static void cublasXgemm(transpose_type transa,
});
}
#elif defined(USE_HIPBLAS)
hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
if (fp16) {
Copy link

Copilot AI Jun 19, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[nitpick] The if (fp16) blocks across multiple GEMM routines duplicate conversion and submission logic; extracting common fp16-path code into a helper could reduce repetition and simplify future updates.

Copilot uses AI. Check for mistakes.
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Refactoring the code structure is beyond the scope of this PR which brings minimal changes required to enable the backends while keeping everything else untouched. The new blocks in the HIP path follow the same style as the existing blocks for the CUDA path.

unsigned short alpha_h = FP32toFP16(alpha);
unsigned short beta_h = FP32toFP16(beta);
sycl_queue.submit([&](sycl::handler &cgh) {
cgh.host_task([=](sycl::interop_handle ih) {
auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);
hipblasHgemm(handle, transa, transb, m, n, k, &alpha_h, (const hipblasHalf*)A,
lda, (const hipblasHalf*)B, ldb, &beta_h, (hipblasHalf*)C, ldc);
hipStreamSynchronize(hipStreamHandle);
});
});
} else {
sycl_queue.submit([&](sycl::handler &cgh) {
cgh.host_task([=](sycl::interop_handle ih) {
auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
Expand All @@ -1816,6 +1837,7 @@ static void cublasXgemm(transpose_type transa,
hipStreamSynchronize(hipStreamHandle);
});
});
}
#else
oneapi::mkl::blas::column_major::gemm(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda,
(const DataType *)B, ldb, beta, (DataType *)C, ldc);
Expand Down Expand Up @@ -1873,9 +1895,29 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran
});
}
#elif defined(USE_HIPBLAS)
hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t();
if (fp16) {
unsigned short alpha_h = FP32toFP16(alpha);
unsigned short beta_h = FP32toFP16(beta);

sycl_queue.submit([&](sycl::handler &cgh) {

cgh.host_task([=](sycl::interop_handle ih) {

auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue);
hipblasSetStream(handle, hipStreamHandle);

hipblasGemmStridedBatchedEx(
handle, transa, transb, m, n, k, &alpha_h, A, HIPBLAS_R_16F, lda, strideA, B,
HIPBLAS_R_16F, ldb, strideB, &beta_h, C, HIPBLAS_R_16F, ldc, strideC,
batchCount, HIPBLAS_R_16F, HIPBLAS_GEMM_DEFAULT);

hipStreamSynchronize(hipStreamHandle);

sycl_queue.submit([&](sycl::handler &cgh) {
});
});
} else {
sycl_queue.submit([&](sycl::handler &cgh) {

cgh.host_task([=](sycl::interop_handle ih) {

Expand All @@ -1891,9 +1933,10 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran

});
});
#else
oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount);
#endif
}
#else
oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount);
#endif
}

template <typename DataType>
Expand Down Expand Up @@ -1962,8 +2005,8 @@ static void cublasXGemmBatched(transpose_type transa,
hipblasSetStream(handle, hipStreamHandle);

hipblasHgemmBatched(
handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda,
(half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount);
handle, transa, transb, m, n, k, (const hipblasHalf*)&alpha_h, (hipblasHalf**)A, lda,
(hipblasHalf**)B, ldb, (const hipblasHalf*)&beta_h, (hipblasHalf**)C, ldc, batchCount);

hipStreamSynchronize(hipStreamHandle);

Expand Down Expand Up @@ -2507,7 +2550,6 @@ template <typename DataType>
AttentionBody<DataType>::~AttentionBody() {
sycl::free(ip_emb_w_, sycl_queue_);
sycl::free(ip_emb_b_, sycl_queue_);
sycl::free(pos_encoding_, sycl_queue_);
if (is_pe_dense_embedding_) {
sycl::free(ip_emb_pre_w_, sycl_queue_);
sycl::free(ip_emb_pre_b_, sycl_queue_);
Expand Down
6 changes: 6 additions & 0 deletions src/neural/backends/sycl/sycl_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,12 @@

#include "utils/exception.h"

#if defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__))
#define SYCL_SUB_GROUP_SIZE 64
#else
#define SYCL_SUB_GROUP_SIZE 32
#endif

namespace lczero {
namespace sycldnn_backend {

Expand Down