diff --git a/meson.build b/meson.build index c43cce0395..fe10b0e977 100644 --- a/meson.build +++ b/meson.build @@ -682,9 +682,26 @@ if get_option('build_backends') deps += cc.find_library('mkl_core', required: true) deps += cc.find_library('OpenCL', required: true) elif get_option('sycl') == 'amd' - error('Building SYCL for AMD backend not yet supported') + deps += cc.find_library('hipblas', required: true) + deps += cc.find_library('amdhip64', required: true) + add_project_arguments('-DUSE_HIPBLAS=ON', language : 'cpp') + add_project_arguments('-D__HIP_PLATFORM_AMD__', language : 'cpp') + if get_option('amd_gfx') == '' + error('-Dsycl=amd requires specifying -Damd_gfx architecture identifier (e.g. 90a, 1100 or similar)') + endif + add_project_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp') + add_project_link_arguments('-fsycl-targets=amd_gpu_gfx'+get_option('amd_gfx'), language : 'cpp') else - error('Building SYCL for the NVIDIA backend not yet supported') + deps += cc.find_library('cublas', required: true) + deps += cc.find_library('cudart', required: true) + add_project_arguments('-DUSE_CUBLAS=ON', language : 'cpp') + if get_option('cc_cuda') != '' + sycl_nvidia_target = 'nvidia_gpu_sm_' + get_option('cc_cuda') + else + sycl_nvidia_target = 'nvptx64-nvidia-cuda' + endif + add_project_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp') + add_project_link_arguments('-fsycl-targets='+sycl_nvidia_target, language : 'cpp') endif if host_machine.system() == 'windows' # For sycl under windows we need to link using icx to generate the device code. diff --git a/meson_options.txt b/meson_options.txt index 8b7f425459..6f941d0c42 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -178,6 +178,11 @@ option('cc_cuda', value: '', description: 'Build for a specific cuda CC, e.g. -Dcc_cuda=35 for CC 3.5') +option('amd_gfx', + type: 'string', + value: '', + description: 'Build for a specific AMD GPU architecture, e.g. -Damd_gfx=90a for gfx90a') + option('onnx_libdir', type: 'string', value: '', diff --git a/src/neural/backends/sycl/common_kernels.dp.cpp b/src/neural/backends/sycl/common_kernels.dp.cpp index 0c01ad8a6f..65335e5e6a 100644 --- a/src/neural/backends/sycl/common_kernels.dp.cpp +++ b/src/neural/backends/sycl/common_kernels.dp.cpp @@ -936,7 +936,7 @@ void globalAvgPool(int N, int C, T* output, const T* input, sycl::nd_range<3>( sycl::range<3>(1, 1, blocks) * sycl::range<3>(1, 1, kBlockSize), sycl::range<3>(1, 1, kBlockSize)), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] { globalAvgPool_kernel(output, input, prevLayerBias, N * C * kPlaneSize, N * C, C, item_ct1); }); @@ -1070,7 +1070,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, cgh.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C), sycl::range<3>(1, 1, C)), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] { OutputTransform_SE_relu_InputTransform_kernel( N, C, se_K, output, input, (float*)skip, bias, w1, b1, w2, b2, @@ -1218,7 +1218,7 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que sycl::nd_range<3>( sycl::range<3>(1, 1, blocks) * sycl::range<3>(1, 1, kBlockSize), sycl::range<3>(1, 1, kBlockSize)), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] { softmax_opt_64_kernel(output, input, input2, size, item_ct1); }); } @@ -1235,7 +1235,7 @@ void Softmax(int N, int C, T* output, const T* input, const T* input2, sycl::que cgh.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C), sycl::range<3>(1, 1, C)), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] { softmax_kernel(output, input, input2, item_ct1, sum_acc_ct1, maxval_acc_ct1); }); @@ -1461,7 +1461,7 @@ void LayerNorm(int N, int C, T* output, const T* input, const T* bias, cgh.parallel_for( sycl::nd_range<3>(gridDim * blockDim, blockDim), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] { layer_norm_kernel(N, C, output, input, bias, skip, gammas, betas, ep, alpha, act, item_ct1, sum_acc_ct1); }); diff --git a/src/neural/backends/sycl/fp16_kernels.dp.cpp b/src/neural/backends/sycl/fp16_kernels.dp.cpp index b0c2da139e..bb89b65a97 100644 --- a/src/neural/backends/sycl/fp16_kernels.dp.cpp +++ b/src/neural/backends/sycl/fp16_kernels.dp.cpp @@ -749,7 +749,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C), sycl::range<3>(1, 1, C)), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size( - 32)]] { + SYCL_SUB_GROUP_SIZE)]] { OutputInputTransformKernel_fp16_shmem_board( N, C, se_K, (sycl::half*)output, (const sycl::half*)input, @@ -798,7 +798,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, cgh.parallel_for( sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C), sycl::range<3>(1, 1, C)), - [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] { OutputTransform_SE_relu_InputTransform_kernel< sycl::half, activation, use_bias, use_skip>( N, C, se_K, output, input, (sycl::half*)skip, bias, w1, b1, w2, diff --git a/src/neural/backends/sycl/layers.cc.dp.cpp b/src/neural/backends/sycl/layers.cc.dp.cpp index 79b4f50ca5..fa49425a0e 100644 --- a/src/neural/backends/sycl/layers.cc.dp.cpp +++ b/src/neural/backends/sycl/layers.cc.dp.cpp @@ -358,6 +358,10 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu half alpha = one_h; half beta = zero_h; + #elif defined(USE_HIPBLAS) + hipblasHalf alpha{1}; + hipblasHalf beta{0}; + #else sycl::half alpha = 1; sycl::half beta = 0; @@ -393,10 +397,10 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); - hipblasSgemm(handle, transpose_type_transpose, + hipblasHgemm(handle, transpose_type_transpose, transpose_type_notranspose,numFc1Out_, N, C, &alpha, - ((const sycl::half *)w1_), C, ((const sycl::half *)op2), C, - &beta, ((sycl::half *)op1), numFc1Out_); + ((const hipblasHalf *)w1_), C, ((const hipblasHalf *)op2), C, + &beta, ((hipblasHalf *)op1), numFc1Out_); hipStreamSynchronize(hipStreamHandle); }); @@ -436,10 +440,10 @@ void SELayer::Eval(int N, sycl::half* output, const sycl::half* inpu sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); - hipblasSgemm( + hipblasHgemm( handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, - N, numFc1Out_, &alpha,((const sycl::half *)w2_), numFc1Out_, - ((const sycl::half *)op1), numFc1Out_, &beta, ((sycl::half *)op2), + N, numFc1Out_, &alpha,((const hipblasHalf *)w2_), numFc1Out_, + ((const hipblasHalf *)op1), numFc1Out_, &beta, ((hipblasHalf *)op2), 2 * C); hipStreamSynchronize(hipStreamHandle); @@ -544,6 +548,10 @@ template <> half alpha = one_h; half beta = zero_h; + #elif defined(USE_HIPBLAS) + hipblasHalf alpha{1}; + hipblasHalf beta{0}; + #else sycl::half alpha = 1; sycl::half beta = 0; @@ -576,11 +584,11 @@ template <> sycl::get_native(sycl_queue); hipblasSetStream(handle, hipStreamHandle); - hipblasSgemm( + hipblasHgemm( handle, transpose_type_transpose, transpose_type_notranspose, - num_outputs, N, num_inputs, &alpha, ((const sycl::half *)weights_), - num_inputs, ((const sycl::half *)input_tensor), num_inputs, &beta, - ((sycl::half *)output_tensor), num_outputs); + num_outputs, N, num_inputs, &alpha, ((const hipblasHalf *)weights_), + num_inputs, ((const hipblasHalf *)input_tensor), num_inputs, &beta, + ((hipblasHalf *)output_tensor), num_outputs); hipStreamSynchronize(hipStreamHandle); }); @@ -964,7 +972,7 @@ template <> hipStreamSynchronize(hipStreamHandle); }); - ); + }); #else int64_t M_ = M; int64_t N_ = N; @@ -1807,7 +1815,20 @@ static void cublasXgemm(transpose_type transa, }); } #elif defined(USE_HIPBLAS) - hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); + hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); + if (fp16) { + unsigned short alpha_h = FP32toFP16(alpha); + unsigned short beta_h = FP32toFP16(beta); + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.host_task([=](sycl::interop_handle ih) { + auto hipStreamHandle = sycl::get_native(sycl_queue); + hipblasSetStream(handle, hipStreamHandle); + hipblasHgemm(handle, transa, transb, m, n, k, &alpha_h, (const hipblasHalf*)A, + lda, (const hipblasHalf*)B, ldb, &beta_h, (hipblasHalf*)C, ldc); + hipStreamSynchronize(hipStreamHandle); + }); + }); + } else { sycl_queue.submit([&](sycl::handler &cgh) { cgh.host_task([=](sycl::interop_handle ih) { auto hipStreamHandle = sycl::get_native(sycl_queue); @@ -1816,6 +1837,7 @@ static void cublasXgemm(transpose_type transa, hipStreamSynchronize(hipStreamHandle); }); }); + } #else oneapi::mkl::blas::column_major::gemm(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, (const DataType *)B, ldb, beta, (DataType *)C, ldc); @@ -1873,9 +1895,29 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran }); } #elif defined(USE_HIPBLAS) - hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); + hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); + if (fp16) { + unsigned short alpha_h = FP32toFP16(alpha); + unsigned short beta_h = FP32toFP16(beta); + + sycl_queue.submit([&](sycl::handler &cgh) { + + cgh.host_task([=](sycl::interop_handle ih) { + + auto hipStreamHandle = sycl::get_native(sycl_queue); + hipblasSetStream(handle, hipStreamHandle); + + hipblasGemmStridedBatchedEx( + handle, transa, transb, m, n, k, &alpha_h, A, HIPBLAS_R_16F, lda, strideA, B, + HIPBLAS_R_16F, ldb, strideB, &beta_h, C, HIPBLAS_R_16F, ldc, strideC, + batchCount, HIPBLAS_R_16F, HIPBLAS_GEMM_DEFAULT); + + hipStreamSynchronize(hipStreamHandle); - sycl_queue.submit([&](sycl::handler &cgh) { + }); + }); + } else { + sycl_queue.submit([&](sycl::handler &cgh) { cgh.host_task([=](sycl::interop_handle ih) { @@ -1891,9 +1933,10 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran }); }); - #else - oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount); - #endif + } + #else + oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount); + #endif } template @@ -1962,8 +2005,8 @@ static void cublasXGemmBatched(transpose_type transa, hipblasSetStream(handle, hipStreamHandle); hipblasHgemmBatched( - handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda, - (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount); + handle, transa, transb, m, n, k, (const hipblasHalf*)&alpha_h, (hipblasHalf**)A, lda, + (hipblasHalf**)B, ldb, (const hipblasHalf*)&beta_h, (hipblasHalf**)C, ldc, batchCount); hipStreamSynchronize(hipStreamHandle); @@ -2507,7 +2550,6 @@ template AttentionBody::~AttentionBody() { sycl::free(ip_emb_w_, sycl_queue_); sycl::free(ip_emb_b_, sycl_queue_); - sycl::free(pos_encoding_, sycl_queue_); if (is_pe_dense_embedding_) { sycl::free(ip_emb_pre_w_, sycl_queue_); sycl::free(ip_emb_pre_b_, sycl_queue_); diff --git a/src/neural/backends/sycl/sycl_common.h b/src/neural/backends/sycl/sycl_common.h index 3a7e3b21b0..b0241da5e4 100644 --- a/src/neural/backends/sycl/sycl_common.h +++ b/src/neural/backends/sycl/sycl_common.h @@ -27,6 +27,12 @@ #include "utils/exception.h" +#if defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__)) +#define SYCL_SUB_GROUP_SIZE 64 +#else +#define SYCL_SUB_GROUP_SIZE 32 +#endif + namespace lczero { namespace sycldnn_backend {