-
Notifications
You must be signed in to change notification settings - Fork 589
Enable SYCL NVIDIA and AMD backends #2192
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,12 @@ | |
| #endif | ||
| #include "winograd_helper.h" | ||
|
|
||
| #if defined(__HIP_PLATFORM_AMD__) && (defined(__GFX9__) || defined(__GFX8__)) | ||
|
||
| #define SYCL_SUB_GROUP_SIZE 64 | ||
| #else | ||
| #define SYCL_SUB_GROUP_SIZE 32 | ||
| #endif | ||
|
|
||
| namespace lczero { | ||
| namespace sycldnn_backend { | ||
|
|
||
|
|
@@ -749,7 +755,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, | |
| sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C), | ||
| sycl::range<3>(1, 1, C)), | ||
| [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size( | ||
| 32)]] { | ||
| SYCL_SUB_GROUP_SIZE)]] { | ||
| OutputInputTransformKernel_fp16_shmem_board<activation, | ||
| use_bias, use_skip>( | ||
| N, C, se_K, (sycl::half*)output, (const sycl::half*)input, | ||
|
|
@@ -798,7 +804,7 @@ void OutputInputTransform(int N, int C, int se_K, T* output, const T* input, | |
| cgh.parallel_for( | ||
| sycl::nd_range<3>(sycl::range<3>(1, 1, N) * sycl::range<3>(1, 1, C), | ||
| sycl::range<3>(1, 1, C)), | ||
| [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { | ||
| [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(SYCL_SUB_GROUP_SIZE)]] { | ||
| OutputTransform_SE_relu_InputTransform_kernel< | ||
| sycl::half, activation, use_bias, use_skip>( | ||
| N, C, se_K, output, input, (sycl::half*)skip, bias, w1, b1, w2, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -358,6 +358,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu | |
| half alpha = one_h; | ||
| half beta = zero_h; | ||
|
|
||
| #elif defined(USE_HIPBLAS) | ||
| hipblasHalf alpha{1}; | ||
| hipblasHalf beta{0}; | ||
|
|
||
| #else | ||
| sycl::half alpha = 1; | ||
| sycl::half beta = 0; | ||
|
|
@@ -393,10 +397,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu | |
| sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue); | ||
| hipblasSetStream(handle, hipStreamHandle); | ||
|
|
||
| hipblasSgemm(handle, transpose_type_transpose, | ||
| hipblasHgemm(handle, transpose_type_transpose, | ||
| transpose_type_notranspose,numFc1Out_, N, C, &alpha, | ||
| ((const sycl::half *)w1_), C, ((const sycl::half *)op2), C, | ||
| &beta, ((sycl::half *)op1), numFc1Out_); | ||
| ((const hipblasHalf *)w1_), C, ((const hipblasHalf *)op2), C, | ||
| &beta, ((hipblasHalf *)op1), numFc1Out_); | ||
|
|
||
| hipStreamSynchronize(hipStreamHandle); | ||
| }); | ||
|
|
@@ -436,10 +440,10 @@ void SELayer<sycl::half>::Eval(int N, sycl::half* output, const sycl::half* inpu | |
| sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue); | ||
| hipblasSetStream(handle, hipStreamHandle); | ||
|
|
||
| hipblasSgemm( | ||
| hipblasHgemm( | ||
| handle, transpose_type_transpose, transpose_type_notranspose, 2 * C, | ||
| N, numFc1Out_, &alpha,((const sycl::half *)w2_), numFc1Out_, | ||
| ((const sycl::half *)op1), numFc1Out_, &beta, ((sycl::half *)op2), | ||
| N, numFc1Out_, &alpha,((const hipblasHalf *)w2_), numFc1Out_, | ||
| ((const hipblasHalf *)op1), numFc1Out_, &beta, ((hipblasHalf *)op2), | ||
| 2 * C); | ||
|
|
||
| hipStreamSynchronize(hipStreamHandle); | ||
|
|
@@ -544,6 +548,10 @@ template <> | |
| half alpha = one_h; | ||
| half beta = zero_h; | ||
|
|
||
| #elif defined(USE_HIPBLAS) | ||
| hipblasHalf alpha{1}; | ||
| hipblasHalf beta{0}; | ||
|
|
||
| #else | ||
| sycl::half alpha = 1; | ||
| sycl::half beta = 0; | ||
|
|
@@ -576,11 +584,11 @@ template <> | |
| sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue); | ||
| hipblasSetStream(handle, hipStreamHandle); | ||
|
|
||
| hipblasSgemm( | ||
| hipblasHgemm( | ||
| handle, transpose_type_transpose, transpose_type_notranspose, | ||
| num_outputs, N, num_inputs, &alpha, ((const sycl::half *)weights_), | ||
| num_inputs, ((const sycl::half *)input_tensor), num_inputs, &beta, | ||
| ((sycl::half *)output_tensor), num_outputs); | ||
| num_outputs, N, num_inputs, &alpha, ((const hipblasHalf *)weights_), | ||
| num_inputs, ((const hipblasHalf *)input_tensor), num_inputs, &beta, | ||
| ((hipblasHalf *)output_tensor), num_outputs); | ||
|
|
||
| hipStreamSynchronize(hipStreamHandle); | ||
| }); | ||
|
|
@@ -964,7 +972,7 @@ template <> | |
|
|
||
| hipStreamSynchronize(hipStreamHandle); | ||
| }); | ||
| ); | ||
| }); | ||
| #else | ||
| int64_t M_ = M; | ||
| int64_t N_ = N; | ||
|
|
@@ -1807,7 +1815,20 @@ static void cublasXgemm(transpose_type transa, | |
| }); | ||
| } | ||
| #elif defined(USE_HIPBLAS) | ||
| hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); | ||
| hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); | ||
| if (fp16) { | ||
|
||
| unsigned short alpha_h = FP32toFP16(alpha); | ||
| unsigned short beta_h = FP32toFP16(beta); | ||
| sycl_queue.submit([&](sycl::handler &cgh) { | ||
| cgh.host_task([=](sycl::interop_handle ih) { | ||
| auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue); | ||
| hipblasSetStream(handle, hipStreamHandle); | ||
| hipblasHgemm(handle, transa, transb, m, n, k, &alpha_h, (const hipblasHalf*)A, | ||
| lda, (const hipblasHalf*)B, ldb, &beta_h, (hipblasHalf*)C, ldc); | ||
| hipStreamSynchronize(hipStreamHandle); | ||
| }); | ||
| }); | ||
| } else { | ||
| sycl_queue.submit([&](sycl::handler &cgh) { | ||
| cgh.host_task([=](sycl::interop_handle ih) { | ||
| auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue); | ||
|
|
@@ -1816,6 +1837,7 @@ static void cublasXgemm(transpose_type transa, | |
| hipStreamSynchronize(hipStreamHandle); | ||
| }); | ||
| }); | ||
| } | ||
| #else | ||
| oneapi::mkl::blas::column_major::gemm(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, | ||
| (const DataType *)B, ldb, beta, (DataType *)C, ldc); | ||
|
|
@@ -1873,9 +1895,29 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran | |
| }); | ||
| } | ||
| #elif defined(USE_HIPBLAS) | ||
| hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); | ||
| hipblasHandle_t handle = hipBlasContextManager::gethipBlasHandle_t(); | ||
| if (fp16) { | ||
| unsigned short alpha_h = FP32toFP16(alpha); | ||
| unsigned short beta_h = FP32toFP16(beta); | ||
|
|
||
| sycl_queue.submit([&](sycl::handler &cgh) { | ||
|
|
||
| cgh.host_task([=](sycl::interop_handle ih) { | ||
|
|
||
| auto hipStreamHandle = sycl::get_native<sycl::backend::ext_oneapi_hip>(sycl_queue); | ||
| hipblasSetStream(handle, hipStreamHandle); | ||
|
|
||
| hipblasGemmStridedBatchedEx( | ||
| handle, transa, transb, m, n, k, &alpha_h, A, HIPBLAS_R_16F, lda, strideA, B, | ||
| HIPBLAS_R_16F, ldb, strideB, &beta_h, C, HIPBLAS_R_16F, ldc, strideC, | ||
| batchCount, HIPBLAS_R_16F, HIPBLAS_GEMM_DEFAULT); | ||
|
|
||
| hipStreamSynchronize(hipStreamHandle); | ||
|
|
||
| sycl_queue.submit([&](sycl::handler &cgh) { | ||
| }); | ||
| }); | ||
| } else { | ||
| sycl_queue.submit([&](sycl::handler &cgh) { | ||
|
|
||
| cgh.host_task([=](sycl::interop_handle ih) { | ||
|
|
||
|
|
@@ -1891,9 +1933,10 @@ static void cublasXGemmStridedBatched(transpose_type transa, transpose_type tran | |
|
|
||
| }); | ||
| }); | ||
| #else | ||
| oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount); | ||
| #endif | ||
| } | ||
| #else | ||
| oneapi::mkl::blas::column_major::gemm_batch(sycl_queue, transa, transb, m, n, k, alpha, (const DataType *)A, lda, strideA, (const DataType *)B, ldb, strideB, beta, (DataType *)C, ldc, strideC, batchCount); | ||
| #endif | ||
| } | ||
|
|
||
| template <typename DataType> | ||
|
|
@@ -1962,8 +2005,8 @@ static void cublasXGemmBatched(transpose_type transa, | |
| hipblasSetStream(handle, hipStreamHandle); | ||
|
|
||
| hipblasHgemmBatched( | ||
| handle, transa, transb, m, n, k, (const half*)&alpha_h, (half**)A, lda, | ||
| (half**)B, ldb, (const half*)&beta_h, (half**)C, ldc, batchCount); | ||
| handle, transa, transb, m, n, k, (const hipblasHalf*)&alpha_h, (hipblasHalf**)A, lda, | ||
| (hipblasHalf**)B, ldb, (const hipblasHalf*)&beta_h, (hipblasHalf**)C, ldc, batchCount); | ||
|
|
||
| hipStreamSynchronize(hipStreamHandle); | ||
|
|
||
|
|
@@ -2507,7 +2550,6 @@ template <typename DataType> | |
| AttentionBody<DataType>::~AttentionBody() { | ||
| sycl::free(ip_emb_w_, sycl_queue_); | ||
| sycl::free(ip_emb_b_, sycl_queue_); | ||
| sycl::free(pos_encoding_, sycl_queue_); | ||
| if (is_pe_dense_embedding_) { | ||
| sycl::free(ip_emb_pre_w_, sycl_queue_); | ||
| sycl::free(ip_emb_pre_b_, sycl_queue_); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] The
SYCL_SUB_GROUP_SIZEmacro is defined locally here; since it’s duplicated in multiple files, consider extracting it to a shared header to avoid divergence and improve maintainability.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good point, moved to
sycl_common.hThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it possible that we may also need to set
SYCL_SUB_GROUP_SIZEfor future architectures?Then it may make sense to use this pattern:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you'd like an interesting read, the full context for this very specific definition is ROCm/ROCm#4121 where an AMD engineer recommended this claiming it "will work without needing revisiting in the foreseeable future". I also can't imagine the other major vendors not supporting a sub-group size of 32 any time soon. Supporting other SYCL devices than AMD/Intel/NVIDIA GPUs would require big changes to the code as it is currently, so I'm quite confident this won't be needed for now. I would suggest implementing this pattern if/when a need for this comes up, but if you think it's useful I don't mind adding it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Seems fine then.