diff --git a/cupy_backends/cuda/libs/cudnn.pxd b/cupy_backends/cuda/libs/cudnn.pxd index 5f7430ab3f6..9b2df6ca5bd 100644 --- a/cupy_backends/cuda/libs/cudnn.pxd +++ b/cupy_backends/cuda/libs/cudnn.pxd @@ -6,19 +6,9 @@ from libc.stdint cimport intptr_t ############################################################################### cpdef enum: - CUDNN_DATA_FLOAT = 0 - CUDNN_DATA_DOUBLE = 1 - CUDNN_DATA_HALF = 2 - - CUDNN_DEFAULT_MATH = 0 - CUDNN_TENSOR_OP_MATH = 1 - CUDNN_NOT_PROPAGATE_NAN = 0 CUDNN_PROPAGATE_NAN = 1 - CUDNN_NON_DETERMINISTIC = 0 - CUDNN_DETERMINISTIC = 1 - CUDNN_TENSOR_NCHW = 0 CUDNN_TENSOR_NHWC = 1 @@ -26,8 +16,6 @@ cpdef enum: CUDNN_OP_TENSOR_MUL = 1 CUDNN_OP_TENSOR_MIN = 2 CUDNN_OP_TENSOR_MAX = 3 - CUDNN_OP_TENSOR_SQRT = 4 - CUDNN_OP_TENSOR_NOT = 5 CUDNN_REDUCE_TENSOR_ADD = 0 CUDNN_REDUCE_TENSOR_MUL = 1 @@ -37,7 +25,6 @@ cpdef enum: CUDNN_REDUCE_TENSOR_AVG = 5 CUDNN_REDUCE_TENSOR_NORM1 = 6 CUDNN_REDUCE_TENSOR_NORM2 = 7 - CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 CUDNN_REDUCE_TENSOR_NO_INDICES = 0 CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1 @@ -47,51 +34,10 @@ cpdef enum: CUDNN_16BIT_INDICES = 2 CUDNN_8BIT_INDICES = 3 - CUDNN_ADD_IMAGE = 0 - CUDNN_ADD_SAME_HW = 0 - CUDNN_ADD_FEATURE_MAP = 1 - CUDNN_ADD_SAME_CHW = 1 - CUDNN_ADD_SAME_C = 2 - CUDNN_ADD_FULL_TENSOR = 3 - + # TODO Confirm from miopen team CUDNN_CONVOLUTION = 0 CUDNN_CROSS_CORRELATION = 1 - CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0 - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1 - CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3 - CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4 - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7 - - CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4 - CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5 - - CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0 - CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1 - CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2 - - CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 - CUDNN_SOFTMAX_FAST = 0 CUDNN_SOFTMAX_ACCURATE = 1 CUDNN_SOFTMAX_LOG = 2 @@ -99,32 +45,10 @@ cpdef enum: CUDNN_SOFTMAX_MODE_INSTANCE = 0 CUDNN_SOFTMAX_MODE_CHANNEL = 1 - CUDNN_POOLING_MAX = 0 - CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 - CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 - CUDNN_POOLING_MAX_DETERMINISTIC = 3 - - CUDNN_ACTIVATION_SIGMOID = 0 - CUDNN_ACTIVATION_RELU = 1 - CUDNN_ACTIVATION_TANH = 2 - CUDNN_ACTIVATION_CLIPPED_RELU = 3 - CUDNN_ACTIVATION_ELU = 4 - CUDNN_ACTIVATION_IDENTITY = 5 - - CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0 - - CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 - CUDNN_BATCHNORM_PER_ACTIVATION = 0 CUDNN_BATCHNORM_SPATIAL = 1 - CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0 - CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 - - CUDNN_BATCHNORM_OPS_BN = 0 - CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 - CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 CUDNN_RNN_RELU = 0 CUDNN_RNN_TANH = 1 @@ -134,116 +58,226 @@ cpdef enum: CUDNN_UNIDIRECTIONAL = 0 CUDNN_BIDIRECTIONAL = 1 - CUDNN_RNN_ALGO_STANDARD = 0 - CUDNN_RNN_ALGO_PERSIST_STATIC = 1 - CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 - - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0 - CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 - CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 - CUDNN_RNN_PADDED_IO_DISABLED = 0 CUDNN_RNN_PADDED_IO_ENABLED = 1 CUDNN_LINEAR_INPUT = 0 CUDNN_SKIP_INPUT = 1 - CUDNN_SAMPLER_BILINEAR = 0 - CUDNN_STATUS_SUCCESS = 0 - CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 - CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 - CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 - - CUDNN_ERRQUERY_RAWCODE = 0 - CUDNN_ERRQUERY_NONBLOCKING = 1 - CUDNN_ERRQUERY_BLOCKING = 2 - - # cudnnFusedOps_t - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0 - CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2 - CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3 - CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4 - CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5 - CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6 - - # cudnnFusedOpsConstParamLabel_t - CUDNN_PARAM_XDESC = 0 - CUDNN_PARAM_XDATA_PLACEHOLDER = 1 - CUDNN_PARAM_BN_MODE = 2 - CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3 - CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4 - CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5 - CUDNN_PARAM_ACTIVATION_DESC = 6 - CUDNN_PARAM_CONV_DESC = 7 - CUDNN_PARAM_WDESC = 8 - CUDNN_PARAM_WDATA_PLACEHOLDER = 9 - CUDNN_PARAM_DWDESC = 10 - CUDNN_PARAM_DWDATA_PLACEHOLDER = 11 - CUDNN_PARAM_YDESC = 12 - CUDNN_PARAM_YDATA_PLACEHOLDER = 13 - CUDNN_PARAM_DYDESC = 14 - CUDNN_PARAM_DYDATA_PLACEHOLDER = 15 - CUDNN_PARAM_YSTATS_DESC = 16 - CUDNN_PARAM_YSUM_PLACEHOLDER = 17 - CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18 - CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19 - CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20 - CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21 - CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22 - CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23 - CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24 - CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25 - CUDNN_PARAM_ZDESC = 26 - CUDNN_PARAM_ZDATA_PLACEHOLDER = 27 - CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28 - CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29 - CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30 - CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31 - CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32 - CUDNN_PARAM_DXDESC = 33 - CUDNN_PARAM_DXDATA_PLACEHOLDER = 34 - CUDNN_PARAM_DZDESC = 35 - CUDNN_PARAM_DZDATA_PLACEHOLDER = 36 - CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37 - CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38 - - # cudnnFusedOpsPointerPlaceHolder_t - CUDNN_PTR_NULL = 0 - CUDNN_PTR_ELEM_ALIGNED = 1 - CUDNN_PTR_16B_ALIGNED = 2 - - # cudnnFusedOpsVariantParamLabel_t - CUDNN_PTR_XDATA = 0 - CUDNN_PTR_BN_EQSCALE = 1 - CUDNN_PTR_BN_EQBIAS = 2 - CUDNN_PTR_WDATA = 3 - CUDNN_PTR_DWDATA = 4 - CUDNN_PTR_YDATA = 5 - CUDNN_PTR_DYDATA = 6 - CUDNN_PTR_YSUM = 7 - CUDNN_PTR_YSQSUM = 8 - CUDNN_PTR_WORKSPACE = 9 - CUDNN_PTR_BN_SCALE = 10 - CUDNN_PTR_BN_BIAS = 11 - CUDNN_PTR_BN_SAVED_MEAN = 12 - CUDNN_PTR_BN_SAVED_INVSTD = 13 - CUDNN_PTR_BN_RUNNING_MEAN = 14 - CUDNN_PTR_BN_RUNNING_VAR = 15 - CUDNN_PTR_ZDATA = 16 - CUDNN_PTR_BN_Z_EQSCALE = 17 - CUDNN_PTR_BN_Z_EQBIAS = 18 - CUDNN_PTR_ACTIVATION_BITMASK = 19 - CUDNN_PTR_DXDATA = 20 - CUDNN_PTR_DZDATA = 21 - CUDNN_PTR_BN_DSCALE = 22 - CUDNN_PTR_BN_DBIAS = 23 - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100 - CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101 - CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 - CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 - +IF CUPY_HIP_VERSION > 0: + cpdef enum: + CUDNN_DATA_FLOAT = 1 + CUDNN_DATA_DOUBLE = 6 + CUDNN_DATA_HALF = 0 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 5 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 1 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 2 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 3 + + CUDNN_POOLING_MAX = 0 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 2 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 1 + + CUDNN_ACTIVATION_RELU = 3 + CUDNN_ACTIVATION_TANH = 2 + CUDNN_ACTIVATION_CLIPPED_RELU = 7 + CUDNN_ACTIVATION_ELU = 9 + CUDNN_ACTIVATION_IDENTITY = 0 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 1 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 2 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 3 +ELSE: + cpdef enum: + CUDNN_DATA_FLOAT = 0 + CUDNN_DATA_DOUBLE = 1 + CUDNN_DATA_HALF = 2 + + CUDNN_DEFAULT_MATH = 0 + CUDNN_TENSOR_OP_MATH = 1 + + CUDNN_NON_DETERMINISTIC = 0 + CUDNN_DETERMINISTIC = 1 + + CUDNN_OP_TENSOR_SQRT = 4 + CUDNN_OP_TENSOR_NOT = 5 + + CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8 + + CUDNN_ADD_IMAGE = 0 + CUDNN_ADD_SAME_HW = 0 + CUDNN_ADD_FEATURE_MAP = 1 + CUDNN_ADD_SAME_CHW = 1 + CUDNN_ADD_SAME_C = 2 + CUDNN_ADD_FULL_TENSOR = 3 + + CUDNN_CONVOLUTION = 0 + CUDNN_CROSS_CORRELATION = 1 + + CUDNN_CONVOLUTION_FWD_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_FWD_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0 + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1 + CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2 + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3 + CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4 + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6 + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7 + + CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4 + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5 + + CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE = 0 + CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST = 1 + CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT = 2 + + CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4 + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5 + + CUDNN_POOLING_MAX = 0 + CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1 + CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2 + CUDNN_POOLING_MAX_DETERMINISTIC = 3 + + CUDNN_ACTIVATION_SIGMOID = 0 + CUDNN_ACTIVATION_RELU = 1 + CUDNN_ACTIVATION_TANH = 2 + CUDNN_ACTIVATION_CLIPPED_RELU = 3 + CUDNN_ACTIVATION_ELU = 4 + CUDNN_ACTIVATION_IDENTITY = 5 + + CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0 + + CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0 + + CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2 + + CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 + + CUDNN_BATCHNORM_OPS_BN = 0 + CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1 + CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2 + + CUDNN_RNN_ALGO_STANDARD = 0 + CUDNN_RNN_ALGO_PERSIST_STATIC = 1 + CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2 + + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0 + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1 + CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2 + + CUDNN_SAMPLER_BILINEAR = 0 + + CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING = 11 + CUDNN_STATUS_RUNTIME_IN_PROGRESS = 12 + CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 13 + + CUDNN_ERRQUERY_RAWCODE = 0 + CUDNN_ERRQUERY_NONBLOCKING = 1 + CUDNN_ERRQUERY_BLOCKING = 2 + + # cudnnFusedOps_t + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS = 0 + CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD = 1 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING = 2 + CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE = 3 + CUDNN_FUSED_CONV_SCALE_BIAS_ADD_ACTIVATION = 4 + CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK = 5 + CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM = 6 + + # cudnnFusedOpsConstParamLabel_t + CUDNN_PARAM_XDESC = 0 + CUDNN_PARAM_XDATA_PLACEHOLDER = 1 + CUDNN_PARAM_BN_MODE = 2 + CUDNN_PARAM_BN_EQSCALEBIAS_DESC = 3 + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER = 4 + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER = 5 + CUDNN_PARAM_ACTIVATION_DESC = 6 + CUDNN_PARAM_CONV_DESC = 7 + CUDNN_PARAM_WDESC = 8 + CUDNN_PARAM_WDATA_PLACEHOLDER = 9 + CUDNN_PARAM_DWDESC = 10 + CUDNN_PARAM_DWDATA_PLACEHOLDER = 11 + CUDNN_PARAM_YDESC = 12 + CUDNN_PARAM_YDATA_PLACEHOLDER = 13 + CUDNN_PARAM_DYDESC = 14 + CUDNN_PARAM_DYDATA_PLACEHOLDER = 15 + CUDNN_PARAM_YSTATS_DESC = 16 + CUDNN_PARAM_YSUM_PLACEHOLDER = 17 + CUDNN_PARAM_YSQSUM_PLACEHOLDER = 18 + CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC = 19 + CUDNN_PARAM_BN_SCALE_PLACEHOLDER = 20 + CUDNN_PARAM_BN_BIAS_PLACEHOLDER = 21 + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER = 22 + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER = 23 + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER = 24 + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER = 25 + CUDNN_PARAM_ZDESC = 26 + CUDNN_PARAM_ZDATA_PLACEHOLDER = 27 + CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC = 28 + CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER = 29 + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER = 30 + CUDNN_PARAM_ACTIVATION_BITMASK_DESC = 31 + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER = 32 + CUDNN_PARAM_DXDESC = 33 + CUDNN_PARAM_DXDATA_PLACEHOLDER = 34 + CUDNN_PARAM_DZDESC = 35 + CUDNN_PARAM_DZDATA_PLACEHOLDER = 36 + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER = 37 + CUDNN_PARAM_BN_DBIAS_PLACEHOLDER = 38 + + # cudnnFusedOpsPointerPlaceHolder_t + CUDNN_PTR_NULL = 0 + CUDNN_PTR_ELEM_ALIGNED = 1 + CUDNN_PTR_16B_ALIGNED = 2 + + # cudnnFusedOpsVariantParamLabel_t + CUDNN_PTR_XDATA = 0 + CUDNN_PTR_BN_EQSCALE = 1 + CUDNN_PTR_BN_EQBIAS = 2 + CUDNN_PTR_WDATA = 3 + CUDNN_PTR_DWDATA = 4 + CUDNN_PTR_YDATA = 5 + CUDNN_PTR_DYDATA = 6 + CUDNN_PTR_YSUM = 7 + CUDNN_PTR_YSQSUM = 8 + CUDNN_PTR_WORKSPACE = 9 + CUDNN_PTR_BN_SCALE = 10 + CUDNN_PTR_BN_BIAS = 11 + CUDNN_PTR_BN_SAVED_MEAN = 12 + CUDNN_PTR_BN_SAVED_INVSTD = 13 + CUDNN_PTR_BN_RUNNING_MEAN = 14 + CUDNN_PTR_BN_RUNNING_VAR = 15 + CUDNN_PTR_ZDATA = 16 + CUDNN_PTR_BN_Z_EQSCALE = 17 + CUDNN_PTR_BN_Z_EQBIAS = 18 + CUDNN_PTR_ACTIVATION_BITMASK = 19 + CUDNN_PTR_DXDATA = 20 + CUDNN_PTR_DZDATA = 21 + CUDNN_PTR_BN_DSCALE = 22 + CUDNN_PTR_BN_DBIAS = 23 + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES = 100 + CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT = 101 + CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR = 102 + CUDNN_SCALAR_DOUBLE_BN_EPSILON = 103 ############################################################################### # Class @@ -258,18 +292,11 @@ cdef class CuDNNAlgoPerf: int determinism int mathType - ############################################################################### # Version ############################################################################### cpdef size_t getVersion() except? 0 - -############################################################################### -# Runtime error checking -############################################################################### -cpdef queryRuntimeError(intptr_t handle, int mode) - ############################################################################### # Initialization and CUDA cooperation ############################################################################### @@ -279,7 +306,6 @@ cpdef destroy(intptr_t handle) cpdef setStream(intptr_t handle, size_t stream) cpdef size_t getStream(intptr_t handle) except? 0 - ############################################################################### # Tensor manipulation ############################################################################### @@ -291,280 +317,7 @@ cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride) cpdef tuple getTensor4dDescriptor(size_t tensorDesc) -cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, - size_t dimA, size_t strideA) cpdef destroyTensorDescriptor(size_t tensorDesc) -cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, - size_t b, size_t beta, size_t yDesc, size_t y) - - -############################################################################### -# Tensor operations -############################################################################### - -cpdef size_t createOpTensorDescriptor() except? 0 -cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, - int opTensorCompType, int opTensorNanOpt) -cpdef getOpTensorDescriptor(size_t opTensorDesc) -cpdef destroyOpTensorDescriptor(size_t opTensorDesc) -cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, - size_t aDesc, size_t A, size_t alpha2, size_t bDesc, - size_t B, size_t beta, size_t cDesc, size_t C) - - -############################################################################### -# Tensor reductions -############################################################################### - -cpdef size_t createReduceTensorDescriptor() except? 0 -cpdef setReduceTensorDescriptor( - size_t reduceTensorDesc, int reduceTensorOp, - int reduceTensorCompType, int reduceTensorNanOpt, - int reduceTensorIndices, int reduceTensorIndicesType) -cpdef getReduceTensorDescriptor(size_t reduceTensorDesc) -cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc) -cpdef size_t getReductionIndicesSize( - intptr_t handle, size_t reduceTensorDesc, size_t aDesc, - size_t cDesc) except? 0 -cpdef size_t getReductionWorkspaceSize( - intptr_t handle, size_t reduceTensorDesc, size_t aDesc, - size_t cDesc) except? 0 -cpdef reduceTensor( - intptr_t handle, size_t reduceTensorDesc, size_t indices, - size_t indicesSizeInBytes, size_t workspace, - size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, - size_t A, size_t beta, size_t cDesc, size_t C) -cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr) -cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha) - - -############################################################################### -# Filter manipulation -############################################################################### - -cpdef size_t createFilterDescriptor() except? 0 -cpdef setFilter4dDescriptor_v4( - size_t filterDesc, int dataType, int format, int k, int c, int h, int w) -cpdef setFilterNdDescriptor_v4( - size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA) -cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested) -cpdef destroyFilterDescriptor(size_t filterDesc) - - -############################################################################### -# Convolution -############################################################################### - -cpdef size_t createConvolutionDescriptor() except? 0 -cpdef setConvolutionMathType( - size_t convDesc, size_t mathType) -cpdef size_t getConvolutionMathType(size_t convDesc) except? 0 -cpdef setConvolutionGroupCount( - size_t convDesc, int groupCount) -cpdef int getConvolutionGroupCount(size_t convDesc) except? -1 -cpdef setConvolution2dDescriptor_v4( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode) -cpdef setConvolution2dDescriptor_v5( - size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, - int dilation_w, int mode, size_t computeType) -cpdef setConvolutionNdDescriptor_v3( - size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, - size_t dilationA, int mode, int dataType) -cpdef destroyConvolutionDescriptor(size_t convDesc) -cpdef findConvolutionForwardAlgorithm( - intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, size_t yDesc, - int requestedAlgoCount) -cpdef list findConvolutionForwardAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionForwardAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, - size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionForwardAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1 -cpdef list getConvolutionForwardAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int algo) except? -1 -cpdef convolutionForward( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t filterDesc, size_t filterData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t destDesc, size_t destData) -cpdef convolutionBackwardBias( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t beta, size_t destDesc, size_t destData) -cpdef findConvolutionBackwardFilterAlgorithm( - intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, - size_t dwDesc, int requestedAlgoCount) -cpdef list findConvolutionBackwardFilterAlgorithmEx( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( - intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, - size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionBackwardFilterAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int preference, size_t memoryLimitInbytes) except? -1 -cpdef list getConvolutionBackwardFilterAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( - intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, - size_t filterDesc, int algo) except? -1 -cpdef convolutionBackwardFilter_v3( - intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData) -cpdef findConvolutionBackwardDataAlgorithm( - intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, - size_t dxDesc, int requestedAlgoCount) -cpdef list findConvolutionBackwardDataAlgorithmEx( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) -cpdef list findConvolutionBackwardDataAlgorithmEx_v7( - intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, - size_t convDesc, size_t dxDesc, size_t dx, - int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) -cpdef int getConvolutionBackwardDataAlgorithm_v6( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, size_t preference, - size_t memoryLimitInbytes) except? -1 -cpdef list getConvolutionBackwardDataAlgorithm_v7( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int requestedAlgoCount) -cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( - intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, - size_t gradDesc, int algo) except? -1 -cpdef convolutionBackwardData_v3( - intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, - size_t diffDesc, size_t diffData, size_t convDesc, int algo, - size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, - size_t gradDesc, size_t gradData) - - -############################################################################### -# Pooling -############################################################################### - -cpdef size_t createPoolingDescriptor() except? 0 -cpdef setPooling2dDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, - int windowWidth, int verticalPadding, int horizontalPadding, - int verticalStride, int horizontalStride) -cpdef setPoolingNdDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, - size_t windowDimA, size_t paddingA, size_t strideA) -cpdef destroyPoolingDescriptor(size_t poolingDesc) -cpdef poolingForward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData) -cpdef poolingBackward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData) - -############################################################################### -# Batch Normalization -############################################################################### - -cpdef deriveBNTensorDescriptor( - size_t derivedBnDesc, size_t xDesc, int mode) - -cpdef batchNormalizationForwardTraining( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance) - -cpdef batchNormalizationForwardInference( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, size_t estimatedMean, size_t estimatedVariance, - double epsilon) - -cpdef batchNormalizationBackward( - intptr_t handle, int mode, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, size_t dyDesc, - size_t dy, size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, size_t bnScale, - size_t dBnScaleResult, size_t dBnBiasResult, - double epsilon, size_t savedMean, size_t savedInvVariance) - -cpdef batchNormalizationForwardTrainingEx( - intptr_t handle, int mode, int bnOps, - size_t alpha, size_t beta, - size_t xDesc, size_t x, - size_t zDesc, size_t z, - size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, - size_t bnScale, size_t bnBias, - double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - -cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t zDesc, - size_t yDesc, - size_t bnScaleBiasMeanVarDesc, - size_t activationDesc) except? 0 - -cpdef batchNormalizationBackwardEx( - intptr_t handle, int mode, int bnops, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, - size_t yDesc, size_t y, - size_t dyDesc, size_t dy, - size_t dzDesc, size_t dz, - size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, - size_t bnScaleData, size_t bnBiasData, - size_t dBnScaleData, size_t dBnBiasData, - double epsilon, - size_t savedMean, size_t savedInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - -cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t yDesc, - size_t dyDesc, - size_t dzDesc, - size_t dxDesc, - size_t dBnScaleBiasDesc, - size_t activationDesc) except? 0 - -cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( - intptr_t handle, int mode, int bnOps, - size_t activationDesc, - size_t xDesc) except? 0 - ############################################################################### # Activation @@ -591,188 +344,505 @@ cpdef activationBackward_v4( size_t destDiffData) -############################################################################### -# Dropout -############################################################################### -cpdef size_t createDropoutDescriptor() except? 0 -cpdef destroyDropoutDescriptor(size_t dropoutDesc) -cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1 -cpdef setDropoutDescriptor( - size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed) -cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0 -cpdef dropoutForward( - intptr_t handle, size_t dropoutDesc, - size_t srcDesc, size_t srcData, - size_t dstDesc, size_t dstData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef dropoutBackward( - intptr_t handle, size_t dropoutDesc, - size_t dyDesc, size_t dyData, - size_t dxtDesc, size_t dxData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - - -############################################################################### -# CTC -############################################################################### - -cpdef size_t createCTCLossDescriptor() except? 0 -cpdef destroyCTCLossDescriptor(size_t ctcLossDesc) -cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType) -cpdef getCTCLossDescriptor(size_t ctcLossDesc) -cpdef size_t getCTCLossWorkspaceSize( - intptr_t handle, size_t probsDesc, size_t gradientsDesc, - size_t labels, size_t labelLengths, size_t inputLengths, - int algo, size_t ctcLossDesc) except? 0 -cpdef CTCLoss( - intptr_t handle, size_t probsDesc, - size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, - size_t costs, size_t gradientsDesc, size_t gradients, int algo, - size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes) - - -############################################################################### -# RNN -############################################################################### - -cpdef size_t createRNNDescriptor() except? 0 -cpdef destroyRNNDescriptor(size_t rnnDesc) -cpdef size_t createPersistentRNNPlan( - size_t rnnDesc, int minibatch, int dataType) except? 0 -cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan) -cpdef destroyPersistentRNNPlan(size_t plan) -cpdef setRNNDescriptor_v5( - size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int dataType) -cpdef setRNNDescriptor_v6( - intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int algo, int dataType) -cpdef setRNNPaddingMode(size_t rnnDesc, int paddingMode) -cpdef getRNNPaddingMode(size_t rnnDesc) -cpdef size_t createRNNDataDescriptor() except? 0 -cpdef destroyRNNDataDescriptor(size_t RNNDataDesc) -cpdef setRNNDataDescriptor( - size_t RNNDataDesc, int dataType, size_t layout, - int maxSeqLength, int batchSize, int vectorSize, - size_t seqLengthArray, size_t paddingFill) -cpdef getRNNDataDescriptor( - size_t RNNDataDesc, size_t dataType, - size_t layout, size_t maxSeqLength, size_t batchSize, - size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, - size_t paddingFill) -cpdef getRNNWorkspaceSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) -cpdef getRNNTrainingReserveSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) -cpdef getRNNParamsSize( - intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType) -cpdef getRNNLinLayerMatrixParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat) -cpdef getRNNLinLayerBiasParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerBiasDesc, - size_t linLayerBias) -cpdef RNNForwardInference( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, - size_t x, size_t hxDesc, size_t hx, size_t cxDesc, - size_t cx, size_t wDesc, size_t w, size_t yDesc, - size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t workspace, size_t workSpaceSizeInBytes) -cpdef RNNForwardTraining( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t wDesc, size_t w, size_t yDesc, size_t y, - size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, - size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardData( - intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, - size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, - size_t dcxDesc, size_t dcx, size_t workspace, - size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardWeights( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, - size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNForwardInferenceEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes) -cpdef RNNForwardTrainingEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardDataEx( - intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, - size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, - size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, - size_t dkDesc, size_t dkeys, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) -cpdef RNNBackwardWeightsEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t dwDesc, size_t dw, - size_t reserveSpace, size_t reserveSpaceSizeInBytes) - - -############################################################################### -# Spatial Transformer -############################################################################### - -cpdef size_t createSpatialTransformerDescriptor() except? 0 -cpdef destroySpatialTransformerDescriptor(size_t stDesc) -cpdef setSpatialTransformerDescriptor( - size_t stDesc, size_t samplerType, int dataType, - int nbDims, size_t dimA) -cpdef spatialTfGridGeneratorForward( - intptr_t handle, size_t stDesc, size_t theta, size_t grid) -cpdef spatialTfGridGeneratorBackward( - intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta) -cpdef spatialTfSamplerForward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t grid, size_t beta, size_t yDesc, size_t y) -cpdef spatialTfSamplerBackward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, - size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid) - -############################################################################### -# Fused Ops -############################################################################### - -cpdef createFusedOpsConstParamPack(int ops) -cpdef destroyFusedOpsConstParamPack(size_t constPack) -cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param) -cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param) -cpdef createFusedOpsVariantParamPack(int ops) -cpdef destroyFusedOpsVariantParamPack(size_t varPack) -cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr) -cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr) -cpdef createFusedOpsPlan(int ops) -cpdef destroyFusedOpsPlan(size_t plan) -cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) -cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) +IF CUPY_HIP_VERSION == 0: + ############################################################################### + # Runtime error checking + ############################################################################### + cpdef queryRuntimeError(intptr_t handle, int mode) + + ############################################################################### + # Tensor manipulation + ############################################################################### + + cpdef size_t createTensorDescriptor() except? 0 + cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, + int n, int c, int h, int w) + cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, + int n, int c, int h, int w, int nStride, + int cStride, int hStride, int wStride) + cpdef tuple getTensor4dDescriptor(size_t tensorDesc) + cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, + size_t dimA, size_t strideA) + cpdef destroyTensorDescriptor(size_t tensorDesc) + cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, + size_t b, size_t beta, size_t yDesc, size_t y) + + + ############################################################################### + # Tensor operations + ############################################################################### + + cpdef size_t createOpTensorDescriptor() except? 0 + cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, + int opTensorCompType, int opTensorNanOpt) + cpdef getOpTensorDescriptor(size_t opTensorDesc) + cpdef destroyOpTensorDescriptor(size_t opTensorDesc) + cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, + size_t aDesc, size_t A, size_t alpha2, size_t bDesc, + size_t B, size_t beta, size_t cDesc, size_t C) + + + ############################################################################### + # Tensor reductions + ############################################################################### + + cpdef size_t createReduceTensorDescriptor() except? 0 + cpdef setReduceTensorDescriptor( + size_t reduceTensorDesc, int reduceTensorOp, + int reduceTensorCompType, int reduceTensorNanOpt, + int reduceTensorIndices, int reduceTensorIndicesType) + cpdef getReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc) + cpdef size_t getReductionIndicesSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef size_t getReductionWorkspaceSize( + intptr_t handle, size_t reduceTensorDesc, size_t aDesc, + size_t cDesc) except? 0 + cpdef reduceTensor( + intptr_t handle, size_t reduceTensorDesc, size_t indices, + size_t indicesSizeInBytes, size_t workspace, + size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, + size_t A, size_t beta, size_t cDesc, size_t C) + cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr) + cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha) + + + ############################################################################### + # Filter manipulation + ############################################################################### + + cpdef size_t createFilterDescriptor() except? 0 + cpdef setFilter4dDescriptor_v4( + size_t filterDesc, int dataType, int format, int k, int c, int h, int w) + cpdef setFilterNdDescriptor_v4( + size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA) + cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested) + cpdef destroyFilterDescriptor(size_t filterDesc) + + + ############################################################################### + # Convolution + ############################################################################### + + cpdef size_t createConvolutionDescriptor() except? 0 + cpdef setConvolutionMathType( + size_t convDesc, size_t mathType) + cpdef size_t getConvolutionMathType(size_t convDesc) except? 0 + cpdef setConvolutionGroupCount( + size_t convDesc, int groupCount) + cpdef int getConvolutionGroupCount(size_t convDesc) except? -1 + cpdef setConvolution2dDescriptor_v4( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode) + cpdef setConvolution2dDescriptor_v5( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode, size_t computeType) + cpdef setConvolutionNdDescriptor_v3( + size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, + size_t dilationA, int mode, int dataType) + cpdef destroyConvolutionDescriptor(size_t convDesc) + cpdef findConvolutionForwardAlgorithm( + intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, size_t yDesc, + int requestedAlgoCount) + cpdef list findConvolutionForwardAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionForwardAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionForwardAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionForwardAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int algo) except? -1 + cpdef convolutionForward( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t filterDesc, size_t filterData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t destDesc, size_t destData) + cpdef convolutionBackwardBias( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t beta, size_t destDesc, size_t destData) + cpdef findConvolutionBackwardFilterAlgorithm( + intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, + size_t dwDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardFilterAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardFilterAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int preference, size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionBackwardFilterAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int algo) except? -1 + cpdef convolutionBackwardFilter_v3( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + cpdef findConvolutionBackwardDataAlgorithm( + intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, + size_t dxDesc, int requestedAlgoCount) + cpdef list findConvolutionBackwardDataAlgorithmEx( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef list findConvolutionBackwardDataAlgorithmEx_v7( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes) + cpdef int getConvolutionBackwardDataAlgorithm_v6( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, size_t preference, + size_t memoryLimitInbytes) except? -1 + cpdef list getConvolutionBackwardDataAlgorithm_v7( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount) + cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int algo) except? -1 + cpdef convolutionBackwardData_v3( + intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData) + + + ############################################################################### + # Pooling + ############################################################################### + + cpdef size_t createPoolingDescriptor() except? 0 + cpdef setPooling2dDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, + int windowWidth, int verticalPadding, int horizontalPadding, + int verticalStride, int horizontalStride) + cpdef setPoolingNdDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, + size_t windowDimA, size_t paddingA, size_t strideA) + cpdef destroyPoolingDescriptor(size_t poolingDesc) + cpdef poolingForward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef poolingBackward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + ############################################################################### + # Batch Normalization + ############################################################################### + + cpdef deriveBNTensorDescriptor( + size_t derivedBnDesc, size_t xDesc, int mode) + + cpdef batchNormalizationForwardTraining( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance) + + cpdef batchNormalizationForwardInference( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, size_t estimatedMean, size_t estimatedVariance, + double epsilon) + + cpdef batchNormalizationBackward( + intptr_t handle, int mode, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, size_t dyDesc, + size_t dy, size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, size_t bnScale, + size_t dBnScaleResult, size_t dBnBiasResult, + double epsilon, size_t savedMean, size_t savedInvVariance) + + cpdef batchNormalizationForwardTrainingEx( + intptr_t handle, int mode, int bnOps, + size_t alpha, size_t beta, + size_t xDesc, size_t x, + size_t zDesc, size_t z, + size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, + size_t bnScale, size_t bnBias, + double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t zDesc, + size_t yDesc, + size_t bnScaleBiasMeanVarDesc, + size_t activationDesc) except? 0 + + cpdef batchNormalizationBackwardEx( + intptr_t handle, int mode, int bnops, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, + size_t yDesc, size_t y, + size_t dyDesc, size_t dy, + size_t dzDesc, size_t dz, + size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, + size_t bnScaleData, size_t bnBiasData, + size_t dBnScaleData, size_t dBnBiasData, + double epsilon, + size_t savedMean, size_t savedInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t yDesc, + size_t dyDesc, + size_t dzDesc, + size_t dxDesc, + size_t dBnScaleBiasDesc, + size_t activationDesc) except? 0 + + cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( + intptr_t handle, int mode, int bnOps, + size_t activationDesc, + size_t xDesc) except? 0 + + + ############################################################################### + # Activation + ############################################################################### + + cpdef size_t createActivationDescriptor() except? 0 + cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling) + cpdef destroyActivationDescriptor(size_t activationDesc) + cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData) + cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData) + cpdef activationBackward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData) + + + ############################################################################### + # Dropout + ############################################################################### + cpdef size_t createDropoutDescriptor() except? 0 + cpdef destroyDropoutDescriptor(size_t dropoutDesc) + cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1 + cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed) + cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0 + cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxtDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # CTC + ############################################################################### + + cpdef size_t createCTCLossDescriptor() except? 0 + cpdef destroyCTCLossDescriptor(size_t ctcLossDesc) + cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType) + cpdef getCTCLossDescriptor(size_t ctcLossDesc) + cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 0 + cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, int algo, + size_t ctcLossDesc, size_t workspace, size_t workSpaceSizeInBytes) + + + ############################################################################### + # RNN + ############################################################################### + + cpdef size_t createRNNDescriptor() except? 0 + cpdef destroyRNNDescriptor(size_t rnnDesc) + cpdef size_t createPersistentRNNPlan( + size_t rnnDesc, int minibatch, int dataType) except? 0 + cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan) + cpdef destroyPersistentRNNPlan(size_t plan) + cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType) + cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType) + cpdef setRNNPaddingMode(size_t rnnDesc, int paddingMode) + cpdef getRNNPaddingMode(size_t rnnDesc) + cpdef size_t createRNNDataDescriptor() except? 0 + cpdef destroyRNNDataDescriptor(size_t RNNDataDesc) + cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill) + cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill) + cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc) + cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType) + cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat) + cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias) + cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t workSpaceSizeInBytes) + cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes) + cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes) + + + ############################################################################### + # Spatial Transformer + ############################################################################### + + cpdef size_t createSpatialTransformerDescriptor() except? 0 + cpdef destroySpatialTransformerDescriptor(size_t stDesc) + cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA) + cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid) + cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta) + cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y) + cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid) + + ############################################################################### + # Fused Ops + ############################################################################### + + cpdef createFusedOpsConstParamPack(int ops) + cpdef destroyFusedOpsConstParamPack(size_t constPack) + cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param) + cpdef createFusedOpsVariantParamPack(int ops) + cpdef destroyFusedOpsVariantParamPack(size_t varPack) + cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr) + cpdef createFusedOpsPlan(int ops) + cpdef destroyFusedOpsPlan(size_t plan) + cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack) + cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack) diff --git a/cupy_backends/cuda/libs/cudnn.pyx b/cupy_backends/cuda/libs/cudnn.pyx index 464c59d8a00..a9b89b24353 100644 --- a/cupy_backends/cuda/libs/cudnn.pyx +++ b/cupy_backends/cuda/libs/cudnn.pyx @@ -4,740 +4,2598 @@ # NOTE: This wrapper does not cover all APIs of cuDNN v4. cimport cython # NOQA from libcpp cimport vector - +from libcpp cimport bool from cupy_backends.cuda.api cimport driver from cupy_backends.cuda.api cimport runtime from cupy_backends.cuda cimport stream as stream_module - -############################################################################### -# Extern -############################################################################### - -cdef extern from '../../cupy_cudnn.h' nogil: - # Types - ctypedef int ActivationMode 'cudnnActivationMode_t' - ctypedef int AddMode 'cudnnAddMode_t' - ctypedef int BatchNormMode 'cudnnBatchNormMode_t' - ctypedef int BatchNormOps 'cudnnBatchNormOps_t' - ctypedef int ConvolutionBwdDataAlgo 'cudnnConvolutionBwdDataAlgo_t' - ctypedef int ConvolutionBwdDataPreference \ - 'cudnnConvolutionBwdDataPreference_t' - ctypedef struct ConvolutionBwdDataAlgoPerf \ - 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ - 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionBwdFilterAlgo 'cudnnConvolutionBwdFilterAlgo_t' - ctypedef int ConvolutionBwdFilterPreference \ - 'cudnnConvolutionBwdFilterPreference_t' - ctypedef struct ConvolutionBwdFilterAlgoPerf \ - 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ - 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionFwdAlgo 'cudnnConvolutionFwdAlgo_t' - ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' - ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': - int algo - int status - float time - size_t memory - ctypedef struct ConvolutionFwdAlgoPerf_v7 \ - 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 - int algo - int status - float time - size_t memory - int determinism - int mathType - ctypedef int ConvolutionMode 'cudnnConvolutionMode_t' - ctypedef int DataType 'cudnnDataType_t' - ctypedef int MathType 'cudnnMathType_t' - ctypedef int DirectionMode 'cudnnDirectionMode_t' - ctypedef int NanPropagation 'cudnnNanPropagation_t' - ctypedef int PoolingMode 'cudnnPoolingMode_t' - ctypedef int RNNInputMode 'cudnnRNNInputMode_t' - ctypedef int CTCLossAlgo 'cudnnCTCLossAlgo_t' - ctypedef int RNNMode 'cudnnRNNMode_t' - ctypedef int RNNAlgo 'cudnnRNNAlgo_t' - ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' - ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' - ctypedef int SoftmaxAlgorithm 'cudnnSoftmaxAlgorithm_t' - ctypedef int SoftmaxMode 'cudnnSoftmaxMode_t' - ctypedef int Status 'cudnnStatus_t' - ctypedef int TensorFormat 'cudnnTensorFormat_t' - ctypedef int OpTensorOp 'cudnnOpTensorOp_t' - ctypedef int ReduceTensorOp 'cudnnReduceTensorOp_t' - ctypedef int ReduceTensorIndices 'cudnnReduceTensorIndices_t' - ctypedef int IndicesType 'cudnnIndicesType_t' - ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' - ctypedef int FusedOps 'cudnnFusedOps_t' - ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' - ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' - ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' - ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' - - ctypedef void* ActivationDescriptor 'cudnnActivationDescriptor_t' - ctypedef void* ConvolutionDescriptor 'cudnnConvolutionDescriptor_t' - ctypedef void* DropoutDescriptor 'cudnnDropoutDescriptor_t' - ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' - ctypedef void* Handle 'cudnnHandle_t' - ctypedef void* PoolingDescriptor 'cudnnPoolingDescriptor_t' - ctypedef void* CTCLossDescriptor 'cudnnCTCLossDescriptor_t' - ctypedef void* RNNDescriptor 'cudnnRNNDescriptor_t' - ctypedef void* RNNDataDescriptor 'cudnnRNNDataDescriptor_t' - ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' - ctypedef void* TensorDescriptor 'cudnnTensorDescriptor_t' - ctypedef void* OpTensorDescriptor 'cudnnOpTensorDescriptor_t' - ctypedef void* ReduceTensorDescriptor 'cudnnReduceTensorDescriptor_t' - ctypedef void* SpatialTransformerDescriptor \ - 'cudnnSpatialTransformerDescriptor_t' - ctypedef void* SamplerType 'cudnnSamplerType_t' - ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' - ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' - ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' - - # Error handling - const char* cudnnGetErrorString(Status status) - - # Version - size_t cudnnGetVersion() - - # Runtime error checking - int cudnnQueryRuntimeError(Handle handle, Status *rstatus, - ErrQueryMode mode, RuntimeTag *tag) - - # Initialization and CUDA cooperation - int cudnnCreate(Handle* handle) - int cudnnDestroy(Handle handle) - int cudnnSetStream(Handle handle, driver.Stream stream) - int cudnnGetStream(Handle handle, driver.Stream* stream) - +IF CUPY_HIP_VERSION != 0: + ############################################################################### + # Extern + ############################################################################### + + cdef extern from '../../cupy_cudnn.h' nogil: + # Types + ctypedef int ActivationMode 'miopenActivationMode_t' + ctypedef int BatchNormMode 'miopenBatchNormMode_t' + ctypedef int ConvolutionBwdDataAlgo 'miopenConvBwdDataAlgorithm_t' + ctypedef int ConvolutionBwdFilterAlgo 'miopenConvBwdWeightsAlgorithm_t' + ctypedef int ConvolutionFwdAlgo 'miopenConvFwdAlgorithm_t' + ctypedef int ConvolutionMode 'miopenConvolutionMode_t' + ctypedef struct ConvolutionFwdAlgoPerf 'miopenConvAlgoPerf_t': + int fwd_algo + int bwd_weights_algo + int bwd_data_algo + float time + size_t memory + ctypedef int DataType 'miopenDataType_t' + ctypedef int DirectionMode 'miopenRNNDirectionMode_t' + ctypedef int NanPropagation 'miopenNanPropagation_t' + ctypedef int PoolingMode 'miopenPoolingMode_t' + ctypedef int RNNInputMode 'miopenRNNInputMode_t' + ctypedef int CTCLossAlgo 'miopenCTCLossAlgo_t' + ctypedef int RNNMode 'miopenRNNMode_t' + ctypedef int RNNAlgo 'miopenRNNAlgo_t' + ctypedef int RNNDataLayout 'miopenRNNBaseLayout_t' + ctypedef int RNNPaddingMode 'miopenRNNPaddingMode_t' + ctypedef int SoftmaxAlgorithm 'miopenSoftmaxAlgorithm_t' + ctypedef int SoftmaxMode 'miopenSoftmaxMode_t' + ctypedef int Status 'miopenStatus_t' + ctypedef int TensorFormat 'miopenTensorLayout_t' + ctypedef int OpTensorOp 'miopenTensorOp_t' + ctypedef int RNGType_t 'miopenRNGType_t' + ctypedef int ReduceTensorOp 'miopenReduceTensorOp_t' + ctypedef int ReduceTensorIndices 'miopenReduceTensorIndices_t' + ctypedef int IndicesType 'miopenIndicesType_t' + ctypedef void* ActivationDescriptor 'miopenActivationDescriptor_t' + ctypedef void* ConvolutionDescriptor 'miopenConvolutionDescriptor_t' + ctypedef void* DropoutDescriptor 'miopenDropoutDescriptor_t' + ctypedef void* Handle 'miopenHandle_t' + ctypedef void* PoolingDescriptor 'miopenPoolingDescriptor_t' + ctypedef void* CTCLossDescriptor 'miopenCTCLossDescriptor_t' + ctypedef void* RNNDescriptor 'miopenRNNDescriptor_t' + ctypedef void* RNNDataDescriptor 'miopenRNNDataDescriptor_t' + ctypedef void* TensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* FilterDescriptor 'miopenTensorDescriptor_t' + ctypedef void* OpTensorDescriptor 'miopenTensorDescriptor_t' + ctypedef void* ReduceTensorDescriptor 'miopenReduceTensorDescriptor_t' + ctypedef void* Stream 'miopenAcceleratorQueue_t' + # Error handling + const char* miopenGetErrorString(Status status) + + # Version + #size_t miopenGetVersion() + + # Runtime error checking + #int cudnnQueryRuntimeError(Handle handle, Status *rstatus, + # ErrQueryMode mode, RuntimeTag *tag) + + # Initialization and CUDA cooperation + int miopenCreate(Handle* handle) + int miopenDestroy(Handle handle) + int miopenSetStream(Handle handle, driver.Stream stream) + int miopenGetStream(Handle handle, driver.Stream* stream) + + # Tensor manipulation + int miopenCreateTensorDescriptor(TensorDescriptor* descriptor) + int miopenSet4dTensorDescriptor( + TensorDescriptor tensorDesc, + DataType dataType, int n, int c, int h, int w) + int miopenSet4dTensorDescriptorEx( + TensorDescriptor tensorDesc, DataType dataType, + int n, int c, int h, int w, + int nStride, int cStride, int hStride, int wStride) + int miopenSetTensorDescriptor( + TensorDescriptor tensorDesc, DataType dataType, + int nbDims, const int filterDimA[], const int* stride) + int miopenGetTensorDescriptor( + FilterDescriptor wDesc, DataType* dataType, + int* nbDims, int filterDimA[], int* stride) + int miopenGet4dTensorDescriptor( + TensorDescriptor tensorDesc, DataType* dataType, + int* n, int* c, int* h, int* w, + int* nStride, int* cStride, int* hStride, int* wStride) + int miopenDestroyTensorDescriptor(TensorDescriptor tensorDesc) + + # Tensor operations + int miopenOpTensor( + Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, + TensorDescriptor aDesc, void* A, void* alpha2, + TensorDescriptor bDesc, void* B, void* beta, + TensorDescriptor cDesc, void* C) + + # Tensor reductions + int miopenCreateReduceTensorDescriptor( + ReduceTensorDescriptor* reduceTensorDesc) + int miopenSetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, + DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, + ReduceTensorIndices reduceTensorIndices, + IndicesType reduceTensorIndicesType) + int miopenGetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, + ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, + NanPropagation* reduceTensorNanOpt, + ReduceTensorIndices* reduceTensorIndices, + IndicesType* reduceTensorIndicesType) + int miopenDestroyReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc) + int miopenGetReductionIndicesSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int miopenGetReductionWorkspaceSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int miopenReduceTensor( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, + size_t indicesSizeInBytes, void* workspace, + size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, + void* A, void* beta, TensorDescriptor cDesc, void* c) + int miopenSetTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) + int miopenScaleTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* alpha) + + # Filter manipulation + + # Convolution + int miopenCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) + int miopenSetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int groupCount) + int miopenGetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int *groupCount) + int miopenInitConvolutionDescriptor(ConvolutionDescriptor convDesc, + ConvolutionMode mode, int pad_h, int pad_w, int stride_h, int stride_w, + int dilation_h, int dilation_w) + int miopenInitConvolutionNdDescriptor(ConvolutionDescriptor conDesc, int spatialDim, + const int* padA, const int* strideA,const int* dilationA, ConvolutionMode mode) + int miopenDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) + int miopenConvolutionForwardGetWorkSpaceSize( + Handle handle, TensorDescriptor srcDesc, + TensorDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, + size_t* sizeInBytes) + int miopenConvolutionBackwardDataGetWorkSpaceSize( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + size_t* sizeInBytes) + int miopenFindConvolutionForwardAlgorithm( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes, bool exhaustiveSearch) + int miopenConvolutionForward( + Handle handle, void* alpha, TensorDescriptor srcDesc, + void* srcData, FilterDescriptor filterDesc, void* filterData, + ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo, + void* beta, TensorDescriptor destDesc, void* destData, + void* workSpace, size_t workSpaceSizeInBytes) + + # Pooling + int miopenCreatePoolingDescriptor(PoolingDescriptor* desc) + int miopenDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) + # Batch Normalization + int miopenDeriveBNTensorDescriptor( + TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, + BatchNormMode mode) + int miopenBatchNormalizationForwardTraining( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, void* resultSaveMean, + void* resultSaveInvVariance) + int miopenBatchNormalizationForwardInference( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, void* estimatedMean, void* estimatedVariance, + double epsilon) + int miopenBatchNormalizationBackward( + Handle handle, BatchNormMode mode, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, void* bnScale, + void* dBnScaleResult, void* dBnBiasResult, + double epsilon, void* savedMean, void* savedInvVariance) + + + # Activation + int miopenCreateActivationDescriptor( + ActivationDescriptor* activationDesc) + int miopenSetActivationDescriptor( + ActivationDescriptor activationDesc, ActivationMode mode, double activAlpha, + double activBeta, + double activGamma) + int miopenDestroyActivationDescriptor( + ActivationDescriptor activationDesc) + int miopenSoftmaxForward( + Handle handle, + void* alpha, TensorDescriptor srcDesc, void* srcData, + void* beta, TensorDescriptor dstDesc, void* dstData) + int miopenSoftmaxBackward( + Handle handle, + void* alpha, TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + int miopenActivationForward( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int miopenActivationBackward( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + + # Dropout + int miopenCreateDropoutDescriptor(DropoutDescriptor* desc) + int miopenDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc) + int miopenDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) + int miopenDropoutGetReserveSpaceSize( + TensorDescriptor xDesc, size_t* sizeInBytes) + int miopenSetDropoutDescriptor( + DropoutDescriptor dropoutDesc, Handle handle, float dropout, + void* states, size_t stateSizeInBytes, unsigned long long seed, + bool use_mask, bool state_evo, RNGType_t rng_mode) + int miopenDropoutForward( + Handle handle, DropoutDescriptor dropoutDesc, TensorDescriptor noise_shape, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor dstDesc, void* dstData, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + # CTC + int miopenCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) + int miopenDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) + int miopenGetCTCLossWorkspaceSize( + Handle handle, TensorDescriptor probsDesc, + TensorDescriptor gradientsDesc, int* labels, + int* labelLengths, int* inputLengths, CTCLossAlgo algo, + CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes) + int miopenCTCLoss( + Handle handle, TensorDescriptor probsDesc, + void* probs, int* labels, int* labelLengths, int* inputLengths, + void* costs, TensorDescriptor gradientsDesc, void* gradients, + CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc, + void* workspace, size_t workSpaceSizeInBytes) + # RNN + int miopenCreateRNNDescriptor(RNNDescriptor* rnnDesc) + int miopenDestroyRNNDescriptor(RNNDescriptor rnnDesc) + int miopenGetRNNWorkspaceSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int miopenGetRNNTrainingReserveSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int miopenGetRNNParamsSize( + Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, + size_t* sizeInBytes, DataType dataType) + int miopenRNNForwardInference( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, + void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, + void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, + void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, + void* cy, void* workspace, size_t workSpaceSizeInBytes) + int miopenRNNForwardTraining( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, + TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, + FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, + TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, + void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) +ELSE: + ############################################################################### + # Extern + ############################################################################### + + cdef extern from '../../cupy_cudnn.h' nogil: + # Types + ctypedef int ActivationMode 'cudnnActivationMode_t' + ctypedef int AddMode 'cudnnAddMode_t' + ctypedef int BatchNormMode 'cudnnBatchNormMode_t' + ctypedef int BatchNormOps 'cudnnBatchNormOps_t' + ctypedef int ConvolutionBwdDataAlgo 'cudnnConvolutionBwdDataAlgo_t' + ctypedef int ConvolutionBwdDataPreference \ + 'cudnnConvolutionBwdDataPreference_t' + ctypedef struct ConvolutionBwdDataAlgoPerf \ + 'cudnnConvolutionBwdDataAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdDataAlgoPerf_v7 \ + 'cudnnConvolutionBwdDataAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionBwdFilterAlgo 'cudnnConvolutionBwdFilterAlgo_t' + ctypedef int ConvolutionBwdFilterPreference \ + 'cudnnConvolutionBwdFilterPreference_t' + ctypedef struct ConvolutionBwdFilterAlgoPerf \ + 'cudnnConvolutionBwdFilterAlgoPerf_t': # NOQA: E125 + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionBwdFilterAlgoPerf_v7 \ + 'cudnnConvolutionBwdFilterAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionFwdAlgo 'cudnnConvolutionFwdAlgo_t' + ctypedef int ConvolutionFwdPreference 'cudnnConvolutionFwdPreference_t' + ctypedef struct ConvolutionFwdAlgoPerf 'cudnnConvolutionFwdAlgoPerf_t': + int algo + int status + float time + size_t memory + ctypedef struct ConvolutionFwdAlgoPerf_v7 \ + 'cudnnConvolutionFwdAlgoPerf_v7_t': # NOQA: E125 + int algo + int status + float time + size_t memory + int determinism + int mathType + ctypedef int ConvolutionMode 'cudnnConvolutionMode_t' + ctypedef int DataType 'cudnnDataType_t' + ctypedef int MathType 'cudnnMathType_t' + ctypedef int DirectionMode 'cudnnDirectionMode_t' + ctypedef int NanPropagation 'cudnnNanPropagation_t' + ctypedef int PoolingMode 'cudnnPoolingMode_t' + ctypedef int RNNInputMode 'cudnnRNNInputMode_t' + ctypedef int CTCLossAlgo 'cudnnCTCLossAlgo_t' + ctypedef int RNNMode 'cudnnRNNMode_t' + ctypedef int RNNAlgo 'cudnnRNNAlgo_t' + ctypedef int RNNDataLayout 'cudnnRNNDataLayout_t' + ctypedef int RNNPaddingMode 'cudnnRNNPaddingMode_t' + ctypedef int SoftmaxAlgorithm 'cudnnSoftmaxAlgorithm_t' + ctypedef int SoftmaxMode 'cudnnSoftmaxMode_t' + ctypedef int Status 'cudnnStatus_t' + ctypedef int TensorFormat 'cudnnTensorFormat_t' + ctypedef int OpTensorOp 'cudnnOpTensorOp_t' + ctypedef int ReduceTensorOp 'cudnnReduceTensorOp_t' + ctypedef int ReduceTensorIndices 'cudnnReduceTensorIndices_t' + ctypedef int IndicesType 'cudnnIndicesType_t' + ctypedef int ErrQueryMode 'cudnnErrQueryMode_t' + ctypedef int FusedOps 'cudnnFusedOps_t' + ctypedef int FusedOpsConstParamLabel 'cudnnFusedOpsConstParamLabel_t' + ctypedef int FusedOpsPointerPlaceHolder 'cudnnFusedOpsPointerPlaceHolder_t' + ctypedef int FusedOpsVariantParamLabel 'cudnnFusedOpsVariantParamLabel_t' + ctypedef struct RuntimeTag 'cudnnRuntimeTag_t' + + ctypedef void* ActivationDescriptor 'cudnnActivationDescriptor_t' + ctypedef void* ConvolutionDescriptor 'cudnnConvolutionDescriptor_t' + ctypedef void* DropoutDescriptor 'cudnnDropoutDescriptor_t' + ctypedef void* FilterDescriptor 'cudnnFilterDescriptor_t' + ctypedef void* Handle 'cudnnHandle_t' + ctypedef void* PoolingDescriptor 'cudnnPoolingDescriptor_t' + ctypedef void* CTCLossDescriptor 'cudnnCTCLossDescriptor_t' + ctypedef void* RNNDescriptor 'cudnnRNNDescriptor_t' + ctypedef void* RNNDataDescriptor 'cudnnRNNDataDescriptor_t' + ctypedef void* PersistentRNNPlan 'cudnnPersistentRNNPlan_t' + ctypedef void* TensorDescriptor 'cudnnTensorDescriptor_t' + ctypedef void* OpTensorDescriptor 'cudnnOpTensorDescriptor_t' + ctypedef void* ReduceTensorDescriptor 'cudnnReduceTensorDescriptor_t' + ctypedef void* SpatialTransformerDescriptor \ + 'cudnnSpatialTransformerDescriptor_t' + ctypedef void* SamplerType 'cudnnSamplerType_t' + ctypedef void* FusedOpsConstParamPack 'cudnnFusedOpsConstParamPack_t' + ctypedef void* FusedOpsVariantParamPack 'cudnnFusedOpsVariantParamPack_t' + ctypedef void* FusedOpsPlan 'cudnnFusedOpsPlan_t' + + # Error handling + const char* cudnnGetErrorString(Status status) + + # Version + size_t cudnnGetVersion() + + # Runtime error checking + int cudnnQueryRuntimeError(Handle handle, Status *rstatus, + ErrQueryMode mode, RuntimeTag *tag) + + # Initialization and CUDA cooperation + int cudnnCreate(Handle* handle) + int cudnnDestroy(Handle handle) + int cudnnSetStream(Handle handle, driver.Stream stream) + int cudnnGetStream(Handle handle, driver.Stream* stream) + + # Tensor manipulation + int cudnnCreateTensorDescriptor(TensorDescriptor* descriptor) + int cudnnSetTensor4dDescriptor( + TensorDescriptor tensorDesc, TensorFormat format, + DataType dataType, int n, int c, int h, int w) + int cudnnSetTensor4dDescriptorEx( + TensorDescriptor tensorDesc, DataType dataType, + int n, int c, int h, int w, + int nStride, int cStride, int hStride, int wStride) + int cudnnGetTensor4dDescriptor( + TensorDescriptor tensorDesc, DataType* dataType, + int* n, int* c, int* h, int* w, + int* nStride, int* cStride, int* hStride, int* wStride) + int cudnnSetTensorNdDescriptor( + TensorDescriptor tensorDesc, DataType dataType, int nbDims, + int* dimA, int* strideA) + int cudnnDestroyTensorDescriptor(TensorDescriptor tensorDesc) + int cudnnAddTensor_v3( + Handle handle, void* alpha, TensorDescriptor bDesc, + void* b, void* beta, TensorDescriptor yDesc, void* y) + + # Tensor operations + int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc) + int cudnnSetOpTensorDescriptor( + OpTensorDescriptor opTensorDesc, OpTensorOp opTensorOp, + DataType opTensorCompType, NanPropagation opTensorNanOpt) + int cudnnGetOpTensorDescriptor( + OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp, + DataType* opTensorCompType, NanPropagation* opTensorNanOpt) + int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc) + int cudnnOpTensor( + Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, + TensorDescriptor aDesc, void* A, void* alpha2, + TensorDescriptor bDesc, void* B, void* beta, + TensorDescriptor cDesc, void* C) + + # Tensor reductions + int cudnnCreateReduceTensorDescriptor( + ReduceTensorDescriptor* reduceTensorDesc) + int cudnnSetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, + DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, + ReduceTensorIndices reduceTensorIndices, + IndicesType reduceTensorIndicesType) + int cudnnGetReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc, + ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, + NanPropagation* reduceTensorNanOpt, + ReduceTensorIndices* reduceTensorIndices, + IndicesType* reduceTensorIndicesType) + int cudnnDestroyReduceTensorDescriptor( + ReduceTensorDescriptor reduceTensorDesc) + int cudnnGetReductionIndicesSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int cudnnGetReductionWorkspaceSize( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, + TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) + int cudnnReduceTensor( + Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, + size_t indicesSizeInBytes, void* workspace, + size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, + void* A, void* beta, TensorDescriptor cDesc, void* c) + int cudnnSetTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) + int cudnnScaleTensor( + Handle handle, TensorDescriptor yDesc, void* y, void* alpha) + + # Filter manipulation + int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc) + int cudnnSetFilter4dDescriptor_v4( + FilterDescriptor filterDesc, DataType dataType, + TensorFormat format, int k, int c, int h, int w) + int cudnnSetFilterNdDescriptor_v4( + FilterDescriptor filterDesc, DataType dataType, + TensorFormat format, int nbDims, const int filterDimA[]) + int cudnnGetFilterNdDescriptor_v4( + FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType, + TensorFormat* format, int* nbDims, int filterDimA[]) + int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc) + + # Convolution + int cudnnCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) + int cudnnSetConvolutionMathType( + ConvolutionDescriptor convDesc, MathType mathType) + int cudnnGetConvolutionMathType( + ConvolutionDescriptor convDesc, MathType *mathType) + int cudnnSetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int groupCount) + int cudnnGetConvolutionGroupCount( + ConvolutionDescriptor convDesc, int *groupCount) + int cudnnSetConvolution2dDescriptor_v4( + ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, + int v, int dilation_h, int dilation_w, ConvolutionMode mode) + int cudnnSetConvolution2dDescriptor_v5( + ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, + int v, int dilation_h, int dilation_w, ConvolutionMode mode, + DataType computeType) + int cudnnSetConvolutionNdDescriptor_v3( + ConvolutionDescriptor convDesc, int arrayLength, int* padA, + int* filterStrideA, int* dilationA, ConvolutionMode mode, + DataType dataType) + int cudnnDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) + int cudnnFindConvolutionForwardAlgorithm( + Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc, + ConvolutionDescriptor convDesc, TensorDescriptor yDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionFwdAlgoPerf* perfResults) + int cudnnFindConvolutionForwardAlgorithmEx( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionForwardAlgorithmEx_v7( + Handle handle, TensorDescriptor xDesc, void* x, + FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, + TensorDescriptor yDesc, void* y, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionForwardAlgorithm_v6( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, ConvolutionFwdPreference preference, + size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo) + int cudnnGetConvolutionForwardAlgorithm_v7( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults) + int cudnnGetConvolutionForwardWorkspaceSize( + Handle handle, TensorDescriptor srcDesc, + FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, + TensorDescriptor destDesc, ConvolutionFwdAlgo algo, + size_t* sizeInBytes) + int cudnnConvolutionForward( + Handle handle, void* alpha, TensorDescriptor srcDesc, + void* srcData, FilterDescriptor filterDesc, void* filterData, + ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + TensorDescriptor destDesc, void* destData) + int cudnnConvolutionBackwardBias( + Handle handle, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor destDesc, void* destData) + int cudnnFindConvolutionBackwardFilterAlgorithm( + Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc, + ConvolutionDescriptor convDesc, FilterDescriptor dwDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdFilterAlgoPerf* perfResults) + int cudnnFindConvolutionBackwardFilterAlgorithmEx( + Handle handle, TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( + Handle handle, TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionBackwardFilterAlgorithm_v6( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, + ConvolutionBwdFilterPreference preference, + size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo) + int cudnnGetConvolutionBackwardFilterAlgorithm_v7( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdFilterAlgoPerf_v7* perfResults) + int cudnnGetConvolutionBackwardFilterWorkspaceSize( + Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, + ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes) + int cudnnConvolutionBackwardFilter_v3( + Handle handle, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor diffDesc, void* diffData, + ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + FilterDescriptor gradDesc, void* gradData) + int cudnnGetConvolutionBackwardDataAlgorithm_v6( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + ConvolutionBwdDataPreference preference, + size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo) + int cudnnGetConvolutionBackwardDataAlgorithm_v7( + Handle handle, TensorDescriptor filterDesc, TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdDataAlgoPerf_v7* perfResults) + int cudnnFindConvolutionBackwardDataAlgorithm( + Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc, + ConvolutionDescriptor convDesc, FilterDescriptor dxDesc, + int requestedAlgoCount, int* returnedAlgoCount, + ConvolutionBwdDataAlgoPerf* perfResults) + int cudnnFindConvolutionBackwardDataAlgorithmEx( + Handle handle, FilterDescriptor wDesc, void* w, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnFindConvolutionBackwardDataAlgorithmEx_v7( + Handle handle, FilterDescriptor wDesc, void* w, + TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, + TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, + int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnGetConvolutionBackwardDataWorkspaceSize( + Handle handle, FilterDescriptor filterDesc, + TensorDescriptor diffDesc, + ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, + ConvolutionBwdDataAlgo algo, size_t* sizeInBytes) + int cudnnConvolutionBackwardData_v3( + Handle handle, void* alpha, + FilterDescriptor filterDesc, void* filterData, + TensorDescriptor diffDesc, void* diffData, + ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo, + void* workSpace, size_t workSpaceSizeInBytes, void* beta, + TensorDescriptor gradDesc, void* gradData) + + # Pooling + int cudnnCreatePoolingDescriptor(PoolingDescriptor* desc) + int cudnnSetPooling2dDescriptor_v4( + PoolingDescriptor poolingDesc, PoolingMode mode, + NanPropagation maxpoolingNanOpt, int windowHeight, int windowWidth, + int verticalPadding, int horizontalPadding, int verticalStride, + int horizontalStride) + int cudnnSetPoolingNdDescriptor_v4( + PoolingDescriptor poolingDesc, PoolingMode mode, + NanPropagation maxpoolingNanOpt, int nbDims, + int* windowDimA, int* paddingA, int* strideA) + int cudnnDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) + int cudnnPoolingForward( + Handle handle, PoolingDescriptor poolingDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int cudnnPoolingBackward( + Handle handle, PoolingDescriptor poolingDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + # Batch Normalization + int cudnnDeriveBNTensorDescriptor( + TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, + BatchNormMode mode) + int cudnnBatchNormalizationForwardTraining( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, void* resultSaveMean, + void* resultSaveInvVariance) + int cudnnBatchNormalizationForwardInference( + Handle handle, BatchNormMode mode, + void* alpha, void* beta, TensorDescriptor xDesc, + void* x, TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, + void* bnBias, void* estimatedMean, void* estimatedVariance, + double epsilon) + int cudnnBatchNormalizationBackward( + Handle handle, BatchNormMode mode, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, void* bnScale, + void* dBnScaleResult, void* dBnBiasResult, + double epsilon, void* savedMean, void* savedInvVariance) + + int cudnnBatchNormalizationForwardTrainingEx( + Handle handle, + BatchNormMode mode, BatchNormOps bnOps, + void* alpha, void* beta, + TensorDescriptor xDesc, void* x, + TensorDescriptor zDesc, void* z, + TensorDescriptor yDesc, void* y, + TensorDescriptor bnScaleBiasMeanVarDesc, + void* bnScale, void* bnBias, + double exponentialAverageFactor, + void* resultRunningMean, void* resultRunningVariance, + double epsilon, + void* resultSaveMean, void* resultSaveInvVariance, + ActivationDescriptor activationDesc, + void* workspace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + Handle handle, + BatchNormMode mode, BatchNormOps bnOps, + TensorDescriptor xDesc, + TensorDescriptor zDesc, + TensorDescriptor yDesc, + TensorDescriptor bnScaleBiasMeanVarDesc, + ActivationDescriptor activationDesc, + size_t* sizeInBytes) + int cudnnBatchNormalizationBackwardEx( + Handle handle, + BatchNormMode mode, BatchNormOps bnops, + void* alphaDataDiff, void* betaDataDiff, + void* alphaParamDiff, void* betaParamDiff, + TensorDescriptor xDesc, void* x, + TensorDescriptor yDesc, void* y, + TensorDescriptor dyDesc, void* dy, + TensorDescriptor dzDesc, void* dz, + TensorDescriptor dxDesc, void* dx, + TensorDescriptor dBnScaleBiasDesc, + void* bnScaleData, void* bnBiasData, + void* dBnScaleData, void* dBnBiasData, + double epsilon, + void* savedMean, void* savedInvVariance, + ActivationDescriptor activationDesc, + void* workspace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnGetBatchNormalizationBackwardExWorkspaceSize( + Handle handle, + BatchNormMode mode, + BatchNormOps bnOps, + TensorDescriptor xDesc, + TensorDescriptor yDesc, + TensorDescriptor dyDesc, + TensorDescriptor dzDesc, + TensorDescriptor dxDesc, + TensorDescriptor dBnScaleBiasDesc, + ActivationDescriptor activationDesc, + size_t* sizeInBytes) + int cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + Handle handle, + BatchNormMode mode, + BatchNormOps bnOps, + ActivationDescriptor activationDesc, + TensorDescriptor xDesc, + size_t* sizeInBytes) + + # Activation + int cudnnCreateActivationDescriptor( + ActivationDescriptor* activationDesc) + int cudnnSetActivationDescriptor( + ActivationDescriptor activationDesc, ActivationMode mode, + NanPropagation reluNanOpt, double reluCeiling) + int cudnnDestroyActivationDescriptor( + ActivationDescriptor activationDesc) + int cudnnSoftmaxForward( + Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, + void* alpha, TensorDescriptor srcDesc, void* srcData, + void* beta, TensorDescriptor dstDesc, void* dstData) + int cudnnSoftmaxBackward( + Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, + void* alpha, TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + int cudnnActivationForward_v4( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, void* beta, + TensorDescriptor dstDesc, void* dstData) + int cudnnActivationBackward_v4( + Handle handle, ActivationDescriptor activationDesc, void* alpha, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor srcDiffDesc, void* srcDiffData, + TensorDescriptor destDesc, void* destData, void* beta, + TensorDescriptor destDiffDesc, void* destDiffData) + + # Dropout + int cudnnCreateDropoutDescriptor(DropoutDescriptor* desc) + int cudnnDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc) + int cudnnDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) + int cudnnDropoutGetReserveSpaceSize( + TensorDescriptor xDesc, size_t* sizeInBytes) + int cudnnSetDropoutDescriptor( + DropoutDescriptor dropoutDesc, Handle handle, float dropout, + void* states, size_t stateSizeInBytes, unsigned long long seed) + int cudnnDropoutForward( + Handle handle, DropoutDescriptor dropoutDesc, + TensorDescriptor srcDesc, void* srcData, + TensorDescriptor dstDesc, void* dstData, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnDropoutBackward( + Handle handle, DropoutDescriptor dropoutDesc, + TensorDescriptor dydesc, void* dy, TensorDescriptor dxdesc, + void* dx, void* reserveSpace, size_t reserveSpaceSizeInBytes) + + # CTC + int cudnnCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) + int cudnnDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) + int cudnnSetCTCLossDescriptor( + CTCLossDescriptor ctcLossDesc, DataType dataType) + int cudnnGetCTCLossDescriptor( + CTCLossDescriptor ctcLossDesc, DataType* dataType) + int cudnnGetCTCLossWorkspaceSize( + Handle handle, TensorDescriptor probsDesc, + TensorDescriptor gradientsDesc, int* labels, + int* labelLengths, int* inputLengths, CTCLossAlgo algo, + CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes) + int cudnnCTCLoss( + Handle handle, TensorDescriptor probsDesc, + void* probs, int* labels, int* labelLengths, int* inputLengths, + void* costs, TensorDescriptor gradientsDesc, void* gradients, + CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc, + void* workspace, size_t workSpaceSizeInBytes) + # RNN + int cudnnCreateRNNDescriptor(RNNDescriptor* rnnDesc) + int cudnnDestroyRNNDescriptor(RNNDescriptor rnnDesc) + int cudnnCreatePersistentRNNPlan( + RNNDescriptor rnnDesc, + const int minibatch, DataType dataType, + PersistentRNNPlan* plan) + int cudnnSetPersistentRNNPlan( + RNNDescriptor rnnDesc, PersistentRNNPlan plan) + int cudnnDestroyPersistentRNNPlan(PersistentRNNPlan plan) + int cudnnSetRNNDescriptor_v5( + RNNDescriptor rnnDesc, int hiddenSize, + int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, + DirectionMode direction, RNNMode mode, DataType dataType) + int cudnnSetRNNDescriptor_v6( + Handle handle, RNNDescriptor rnnDesc, int hiddenSize, + int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, + DirectionMode direction, RNNMode mode, RNNAlgo algo, DataType dataType) + int cudnnSetRNNPaddingMode( + RNNDescriptor rnnDesc, RNNPaddingMode paddingMode) + int cudnnGetRNNPaddingMode( + RNNDescriptor rnnDesc, RNNPaddingMode* paddingMode) + int cudnnCreateRNNDataDescriptor(RNNDataDescriptor* RNNDataDesc) + int cudnnDestroyRNNDataDescriptor(RNNDataDescriptor RNNDataDesc) + int cudnnSetRNNDataDescriptor( + RNNDataDescriptor RNNDataDesc, DataType dataType, RNNDataLayout layout, + int maxSeqLength, int batchSize, int vectorSize, + const int seqLengthArray[], void *paddingFill) + int cudnnGetRNNDataDescriptor( + RNNDataDescriptor RNNDataDesc, DataType* dataType, + RNNDataLayout* layout, int* maxSeqLength, int* batchSize, + int* vectorSize, int arrayLengthRequested, int seqLengthArray[], + void* paddingFill) + int cudnnGetRNNWorkspaceSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int cudnnGetRNNTrainingReserveSize( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, size_t* sizeInBytes) + int cudnnGetRNNParamsSize( + Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, + size_t* sizeInBytes, DataType dataType) + int cudnnGetRNNLinLayerMatrixParams( + Handle handle, RNNDescriptor rnnDesc, int layer, + TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, + int linLayerID, FilterDescriptor linLayerMatDesc, + void** linLayerMat) + int cudnnGetRNNLinLayerBiasParams( + Handle handle, RNNDescriptor rnnDesc, int layer, + TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, + int linLayerID, FilterDescriptor linLayerBiasDesc, + void** linLayerBias) + int cudnnRNNForwardInference( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, + void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, + void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, + void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, + void* cy, void* workspace, size_t workSpaceSizeInBytes) + int cudnnRNNForwardTraining( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, + TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, + FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, + TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, + void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardData( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* yDesc, void* y, + TensorDescriptor* dyDesc, void* dy, + TensorDescriptor dhyDesc, void* dhy, + TensorDescriptor dcyDesc, void* dcy, + FilterDescriptor wDesc, void* w, + TensorDescriptor hxDesc, void* hx, + TensorDescriptor cxDesc, void* cx, + TensorDescriptor* dxDesc, void* dx, + TensorDescriptor dhxDesc, void* dhx, + TensorDescriptor dcxDesc, void* dcx, void* workspace, + size_t workSpaceSizeInBytes, void* reserveSpace, + size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardWeights( + Handle handle, RNNDescriptor rnnDesc, int seqLength, + TensorDescriptor* xDesc, void* x, TensorDescriptor hxDesc, void* hx, + TensorDescriptor* yDesc, void* y, + void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc, + void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes) + + int cudnnRNNForwardInferenceEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + FilterDescriptor wDesc, const void* w, + RNNDataDescriptor yDesc, void* y, + TensorDescriptor hyDesc, void* hy, + TensorDescriptor cyDesc, void* cy, + RNNDataDescriptor kDesc, const void* keys, + RNNDataDescriptor cDesc, void* cAttn, + RNNDataDescriptor iDesc, void* iAttn, + RNNDataDescriptor qDesc, void* queries, + void* workSpace, size_t workSpaceSizeInBytes) + int cudnnRNNForwardTrainingEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + FilterDescriptor wDesc, const void* w, + RNNDataDescriptor yDesc, void* y, + TensorDescriptor hyDesc, void* hy, + TensorDescriptor cyDesc, void* cy, + RNNDataDescriptor kDesc, const void* keys, + RNNDataDescriptor cDesc, void* cAttn, + RNNDataDescriptor iDesc, void* iAttn, + RNNDataDescriptor qDesc, void* queries, + void* workSpace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardDataEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor yDesc, const void* y, + RNNDataDescriptor dyDesc, const void* dy, + RNNDataDescriptor dcDesc, const void* dcAttn, + TensorDescriptor dhyDesc, const void* dhy, + TensorDescriptor dcyDesc, const void* dcy, + FilterDescriptor wDesc, const void* w, + TensorDescriptor hxDesc, const void* hx, + TensorDescriptor cxDesc, const void* cx, + RNNDataDescriptor dxDesc, void* dx, + TensorDescriptor dhxDesc, void* dhx, + TensorDescriptor dcxDesc, void* dcx, + RNNDataDescriptor dkDesc, void* dkeys, + void* workSpace, size_t workSpaceSizeInBytes, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + int cudnnRNNBackwardWeightsEx( + Handle handle, RNNDescriptor rnnDesc, + RNNDataDescriptor xDesc, const void* x, + TensorDescriptor hxDesc, const void* hx, + RNNDataDescriptor yDesc, const void* y, + void* workSpace, size_t workSpaceSizeInBytes, + FilterDescriptor dwDesc, void* dw, + void* reserveSpace, size_t reserveSpaceSizeInBytes) + + # Spatial Transformer + int cudnnCreateSpatialTransformerDescriptor( + SpatialTransformerDescriptor* stDesc) + int cudnnDestroySpatialTransformerDescriptor( + SpatialTransformerDescriptor stDesc) + int cudnnSetSpatialTransformerNdDescriptor( + SpatialTransformerDescriptor stDesc, SamplerType samplerType, + DataType dataType, int nbDims, int dimA[]) + int cudnnSpatialTfGridGeneratorForward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* theta, void* grid) + int cudnnSpatialTfGridGeneratorBackward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* dgrid, void* dtheta) + int cudnnSpatialTfSamplerForward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* alpha, TensorDescriptor xDesc, void* x, + void* grid, void* beta, TensorDescriptor yDesc, void* y) + int cudnnSpatialTfSamplerBackward( + Handle handle, SpatialTransformerDescriptor stDesc, + void* alpha, TensorDescriptor xDesc, void* x, void* beta, + TensorDescriptor dxDesc, void* dx, void* alphaDgrid, + TensorDescriptor dyDesc, void* dy, void* grid, + void* betaDgrid, void* dgrid) + + # Fused Ops + int cudnnCreateFusedOpsConstParamPack( + FusedOpsConstParamPack* constPack, int ops) + int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack) + int cudnnSetFusedOpsConstParamPackAttribute( + FusedOpsConstParamPack constPack, FusedOpsConstParamLabel paramLabel, + const void *param) + int cudnnGetFusedOpsConstParamPackAttribute( + const FusedOpsConstParamPack constPack, + FusedOpsConstParamLabel paramLabel, void *param, int *isNULL) + int cudnnCreateFusedOpsVariantParamPack( + FusedOpsVariantParamPack *varPack, FusedOps ops) + int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack) + int cudnnSetFusedOpsVariantParamPackAttribute( + FusedOpsVariantParamPack varPack, FusedOpsVariantParamLabel paramLabel, + void *ptr) + int cudnnGetFusedOpsVariantParamPackAttribute( + const FusedOpsVariantParamPack varPack, + FusedOpsVariantParamLabel paramLabel, void *ptr) + int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops) + int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan) + int cudnnMakeFusedOpsPlan( + Handle handle, FusedOpsPlan plan, + const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes) + int cudnnFusedOpsExecute( + Handle handle, const FusedOpsPlan plan, + FusedOpsVariantParamPack varPack) + + # Build-time version + int CUDNN_VERSION + + # Constants + double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' + + cdef class CuDNNAlgoPerf: + + def __init__(self, algo, status, time, memory, determinism, mathType): + self.algo = algo + self.status = status + self.time = time + self.memory = memory + self.determinism = determinism + self.mathType = mathType + + cpdef queryRuntimeError(intptr_t handle, int mode): + cdef Status rstatus + with nogil: + status = cudnnQueryRuntimeError(handle, &rstatus, + mode, 0) + check_status(status) + return rstatus + + + ############################################################################### # Tensor manipulation - int cudnnCreateTensorDescriptor(TensorDescriptor* descriptor) - int cudnnSetTensor4dDescriptor( - TensorDescriptor tensorDesc, TensorFormat format, - DataType dataType, int n, int c, int h, int w) - int cudnnSetTensor4dDescriptorEx( - TensorDescriptor tensorDesc, DataType dataType, - int n, int c, int h, int w, - int nStride, int cStride, int hStride, int wStride) - int cudnnGetTensor4dDescriptor( - TensorDescriptor tensorDesc, DataType* dataType, - int* n, int* c, int* h, int* w, - int* nStride, int* cStride, int* hStride, int* wStride) - int cudnnSetTensorNdDescriptor( - TensorDescriptor tensorDesc, DataType dataType, int nbDims, - int* dimA, int* strideA) - int cudnnDestroyTensorDescriptor(TensorDescriptor tensorDesc) - int cudnnAddTensor_v3( - Handle handle, void* alpha, TensorDescriptor bDesc, - void* b, void* beta, TensorDescriptor yDesc, void* y) - + ############################################################################### + #TODO miopenSetNdTensorDescriptorWithLayout + cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, + size_t dimA, size_t strideA): + status = cudnnSetTensorNdDescriptor( + tensorDesc, dataType, nbDims, + dimA, strideA) + check_status(status) + + #TODO miopenOpTensor + cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, + size_t b, size_t beta, size_t yDesc, size_t y): + _setStream(handle) + with nogil: + status = cudnnAddTensor_v3( + handle, alpha, bDesc, + b, beta, yDesc, y) + check_status(status) + + + ############################################################################### # Tensor operations - int cudnnCreateOpTensorDescriptor(OpTensorDescriptor* opTensorDesc) - int cudnnSetOpTensorDescriptor( - OpTensorDescriptor opTensorDesc, OpTensorOp opTensorOp, - DataType opTensorCompType, NanPropagation opTensorNanOpt) - int cudnnGetOpTensorDescriptor( - OpTensorDescriptor opTensorDesc, OpTensorOp* opTensorOp, - DataType* opTensorCompType, NanPropagation* opTensorNanOpt) - int cudnnDestroyOpTensorDescriptor(OpTensorDescriptor opTensorDesc) - int cudnnOpTensor( - Handle handle, OpTensorDescriptor opTensorDesc, void* alpha1, - TensorDescriptor aDesc, void* A, void* alpha2, - TensorDescriptor bDesc, void* B, void* beta, - TensorDescriptor cDesc, void* C) - + ############################################################################### + + cpdef size_t createOpTensorDescriptor() except? 0: + cdef OpTensorDescriptor opTensorDesc + status = cudnnCreateOpTensorDescriptor(&opTensorDesc) + check_status(status) + return opTensorDesc + + + cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, + int opTensorCompType, int opTensorNanOpt): + status = cudnnSetOpTensorDescriptor( + opTensorDesc, opTensorOp, + opTensorCompType, opTensorNanOpt) + check_status(status) + + + cpdef getOpTensorDescriptor(size_t opTensorDesc): + cdef OpTensorOp opTensorOp + cdef DataType opTensorCompType + cdef NanPropagation opTensorNanOpt + status = cudnnGetOpTensorDescriptor( + opTensorDesc, &opTensorOp, &opTensorCompType, + &opTensorNanOpt) + check_status(status) + return opTensorOp, opTensorCompType, opTensorNanOpt + + + cpdef destroyOpTensorDescriptor(size_t opTensorDesc): + status = cudnnDestroyOpTensorDescriptor(opTensorDesc) + check_status(status) + + + cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, + size_t aDesc, size_t A, size_t alpha2, size_t bDesc, + size_t B, size_t beta, size_t cDesc, size_t C): + _setStream(handle) + with nogil: + status = cudnnOpTensor( + handle, opTensorDesc, alpha1, + aDesc, A, alpha2, + bDesc, B, beta, + cDesc, C) + check_status(status) + + + ############################################################################### # Tensor reductions - int cudnnCreateReduceTensorDescriptor( - ReduceTensorDescriptor* reduceTensorDesc) - int cudnnSetReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc, ReduceTensorOp reduceTensorOp, - DataType reduceTensorCompType, NanPropagation reduceTensorNanOpt, - ReduceTensorIndices reduceTensorIndices, - IndicesType reduceTensorIndicesType) - int cudnnGetReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc, - ReduceTensorOp* reduceTensorOp, DataType* reduceTensorCompType, - NanPropagation* reduceTensorNanOpt, - ReduceTensorIndices* reduceTensorIndices, - IndicesType* reduceTensorIndicesType) - int cudnnDestroyReduceTensorDescriptor( - ReduceTensorDescriptor reduceTensorDesc) - int cudnnGetReductionIndicesSize( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, - TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) - int cudnnGetReductionWorkspaceSize( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, - TensorDescriptor aDesc, TensorDescriptor cDesc, size_t* sizeInBytes) - int cudnnReduceTensor( - Handle handle, ReduceTensorDescriptor reduceTensorDesc, void* indices, - size_t indicesSizeInBytes, void* workspace, - size_t workspaceSizeInBytes, void* alpha, TensorDescriptor aDesc, - void* A, void* beta, TensorDescriptor cDesc, void* c) - int cudnnSetTensor( - Handle handle, TensorDescriptor yDesc, void* y, void* valuePtr) - int cudnnScaleTensor( - Handle handle, TensorDescriptor yDesc, void* y, void* alpha) - + ############################################################################### + + cpdef size_t createReduceTensorDescriptor() except? 0: + cdef ReduceTensorDescriptor reduceTensorDesc + status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) + check_status(status) + return reduceTensorDesc + + cpdef setReduceTensorDescriptor( + size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, + int reduceTensorNanOpt, int reduceTensorIndices, + int reduceTensorIndicesType): + status = cudnnSetReduceTensorDescriptor( + reduceTensorDesc, + reduceTensorOp, + reduceTensorCompType, reduceTensorNanOpt, + reduceTensorIndices, + reduceTensorIndicesType) + check_status(status) + + + cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): + cdef ReduceTensorOp redOp + cdef DataType redCompType + cdef NanPropagation redNanOpt + cdef ReduceTensorIndices redIndices + cdef IndicesType redIndicesType + status = cudnnGetReduceTensorDescriptor( + reduceTensorDesc, &redOp, + &redCompType, &redNanOpt, &redIndices, &redIndicesType) + check_status(status) + return redOp, redCompType, redNanOpt, redIndices, redIndicesType + + + cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): + status = cudnnDestroyReduceTensorDescriptor( + reduceTensorDesc) + check_status(status) + + + cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, + size_t aDesc, size_t cDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetReductionIndicesSize( + handle, reduceTensorDesc, + aDesc, cDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef size_t getReductionWorkspaceSize(intptr_t handle, + size_t reduceTensorDesc, + size_t aDesc, size_t cDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetReductionWorkspaceSize( + handle, reduceTensorDesc, + aDesc, cDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, + size_t indicesSizeInBytes, size_t workspace, + size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, + size_t A, size_t beta, size_t cDesc, size_t C): + _setStream(handle) + with nogil: + status = cudnnReduceTensor( + handle, reduceTensorDesc, + indices, indicesSizeInBytes, workspace, + workspaceSizeInBytes, alpha, aDesc, + A, beta, cDesc, C) + check_status(status) + + + cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): + _setStream(handle) + with nogil: + status = cudnnSetTensor( + handle, yDesc, y, + valuePtr) + check_status(status) + + + cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): + _setStream(handle) + with nogil: + status = cudnnScaleTensor( + handle, yDesc, y, + alpha) + check_status(status) + + + ############################################################################### # Filter manipulation - int cudnnCreateFilterDescriptor(FilterDescriptor* filterDesc) - int cudnnSetFilter4dDescriptor_v4( - FilterDescriptor filterDesc, DataType dataType, - TensorFormat format, int k, int c, int h, int w) - int cudnnSetFilterNdDescriptor_v4( - FilterDescriptor filterDesc, DataType dataType, - TensorFormat format, int nbDims, const int filterDimA[]) - int cudnnGetFilterNdDescriptor_v4( - FilterDescriptor wDesc, int nbDimsRequested, DataType* dataType, - TensorFormat* format, int* nbDims, int filterDimA[]) - int cudnnDestroyFilterDescriptor(FilterDescriptor filterDesc) - + ############################################################################### + + cpdef size_t createFilterDescriptor() except? 0: + cdef FilterDescriptor desc + status = cudnnCreateFilterDescriptor(&desc) + check_status(status) + return desc + + + cpdef setFilter4dDescriptor_v4( + size_t filterDesc, int dataType, + int format, int k, int c, int h, int w): + status = cudnnSetFilter4dDescriptor_v4( + filterDesc, dataType, + format, k, c, h, w) + check_status(status) + + + cpdef setFilterNdDescriptor_v4( + size_t filterDesc, int dataType, + int format, int nbDims, size_t filterDimA): + status = cudnnSetFilterNdDescriptor_v4( + filterDesc, dataType, + format, nbDims, filterDimA) + check_status(status) + + + cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested): + cdef DataType dataType + cdef TensorFormat format + cdef int nbDims + cdef vector.vector[int] filterDimA + filterDimA.resize(nbDimsRequested) + + status = cudnnGetFilterNdDescriptor_v4( + wDesc, nbDimsRequested, &dataType, + &format, &nbDims, filterDimA.data()) + check_status(status) + return dataType, format, nbDims, tuple(filterDimA) + + + cpdef destroyFilterDescriptor(size_t filterDesc): + status = cudnnDestroyFilterDescriptor(filterDesc) + check_status(status) + + + ############################################################################### # Convolution - int cudnnCreateConvolutionDescriptor(ConvolutionDescriptor* convDesc) - int cudnnSetConvolutionMathType( - ConvolutionDescriptor convDesc, MathType mathType) - int cudnnGetConvolutionMathType( - ConvolutionDescriptor convDesc, MathType *mathType) - int cudnnSetConvolutionGroupCount( - ConvolutionDescriptor convDesc, int groupCount) - int cudnnGetConvolutionGroupCount( - ConvolutionDescriptor convDesc, int *groupCount) - int cudnnSetConvolution2dDescriptor_v4( - ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, - int v, int dilation_h, int dilation_w, ConvolutionMode mode) - int cudnnSetConvolution2dDescriptor_v5( - ConvolutionDescriptor convDesc, int pad_h, int pad_w, int u, - int v, int dilation_h, int dilation_w, ConvolutionMode mode, - DataType computeType) - int cudnnSetConvolutionNdDescriptor_v3( - ConvolutionDescriptor convDesc, int arrayLength, int* padA, - int* filterStrideA, int* dilationA, ConvolutionMode mode, - DataType dataType) - int cudnnDestroyConvolutionDescriptor(ConvolutionDescriptor conDesc) - int cudnnFindConvolutionForwardAlgorithm( - Handle handle, TensorDescriptor xDesc, FilterDescriptor wDesc, - ConvolutionDescriptor convDesc, TensorDescriptor yDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionFwdAlgoPerf* perfResults) - int cudnnFindConvolutionForwardAlgorithmEx( - Handle handle, TensorDescriptor xDesc, void* x, - FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, - TensorDescriptor yDesc, void* y, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionForwardAlgorithmEx_v7( - Handle handle, TensorDescriptor xDesc, void* x, - FilterDescriptor wDesc, void* w, ConvolutionDescriptor convDesc, - TensorDescriptor yDesc, void* y, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionForwardAlgorithm_v6( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, ConvolutionFwdPreference preference, - size_t memoryLimitInbytes, ConvolutionFwdAlgo* algo) - int cudnnGetConvolutionForwardAlgorithm_v7( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionFwdAlgoPerf_v7* perfResults) - int cudnnGetConvolutionForwardWorkspaceSize( - Handle handle, TensorDescriptor srcDesc, - FilterDescriptor filterDesc, ConvolutionDescriptor convDesc, - TensorDescriptor destDesc, ConvolutionFwdAlgo algo, - size_t* sizeInBytes) - int cudnnConvolutionForward( - Handle handle, void* alpha, TensorDescriptor srcDesc, - void* srcData, FilterDescriptor filterDesc, void* filterData, - ConvolutionDescriptor convDesc, ConvolutionFwdAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - TensorDescriptor destDesc, void* destData) - int cudnnConvolutionBackwardBias( - Handle handle, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor destDesc, void* destData) - int cudnnFindConvolutionBackwardFilterAlgorithm( - Handle handle, TensorDescriptor xDesc, TensorDescriptor dyDesc, - ConvolutionDescriptor convDesc, FilterDescriptor dwDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdFilterAlgoPerf* perfResults) - int cudnnFindConvolutionBackwardFilterAlgorithmEx( - Handle handle, TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( - Handle handle, TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - FilterDescriptor dwDesc, void* dw, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdFilterAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionBackwardFilterAlgorithm_v6( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, - ConvolutionBwdFilterPreference preference, - size_t memoryLimitInbytes, ConvolutionBwdFilterAlgo* algo) - int cudnnGetConvolutionBackwardFilterAlgorithm_v7( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdFilterAlgoPerf_v7* perfResults) - int cudnnGetConvolutionBackwardFilterWorkspaceSize( - Handle handle, TensorDescriptor srcDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor filterDesc, - ConvolutionBwdFilterAlgo algo, size_t* sizeInBytes) - int cudnnConvolutionBackwardFilter_v3( - Handle handle, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor diffDesc, void* diffData, - ConvolutionDescriptor convDesc, ConvolutionBwdFilterAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - FilterDescriptor gradDesc, void* gradData) - int cudnnGetConvolutionBackwardDataAlgorithm_v6( - Handle handle, FilterDescriptor filterDesc, - TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, - ConvolutionBwdDataPreference preference, - size_t memoryLimitInbytes, ConvolutionBwdDataAlgo* algo) - int cudnnGetConvolutionBackwardDataAlgorithm_v7( - Handle handle, TensorDescriptor filterDesc, TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, FilterDescriptor gradDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdDataAlgoPerf_v7* perfResults) - int cudnnFindConvolutionBackwardDataAlgorithm( - Handle handle, TensorDescriptor wDesc, TensorDescriptor dyDesc, - ConvolutionDescriptor convDesc, FilterDescriptor dxDesc, - int requestedAlgoCount, int* returnedAlgoCount, - ConvolutionBwdDataAlgoPerf* perfResults) - int cudnnFindConvolutionBackwardDataAlgorithmEx( - Handle handle, FilterDescriptor wDesc, void* w, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnFindConvolutionBackwardDataAlgorithmEx_v7( - Handle handle, FilterDescriptor wDesc, void* w, - TensorDescriptor dyDesc, void* dy, ConvolutionDescriptor convDesc, - TensorDescriptor dxDesc, void* dx, int requestedAlgoCount, - int* returnedAlgoCount, ConvolutionBwdDataAlgoPerf_v7* perfResults, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnGetConvolutionBackwardDataWorkspaceSize( - Handle handle, FilterDescriptor filterDesc, - TensorDescriptor diffDesc, - ConvolutionDescriptor convDesc, TensorDescriptor gradDesc, - ConvolutionBwdDataAlgo algo, size_t* sizeInBytes) - int cudnnConvolutionBackwardData_v3( - Handle handle, void* alpha, - FilterDescriptor filterDesc, void* filterData, - TensorDescriptor diffDesc, void* diffData, - ConvolutionDescriptor convDesc, ConvolutionBwdDataAlgo algo, - void* workSpace, size_t workSpaceSizeInBytes, void* beta, - TensorDescriptor gradDesc, void* gradData) - + ############################################################################### + + cpdef size_t createConvolutionDescriptor() except? 0: + cdef ConvolutionDescriptor desc + status = cudnnCreateConvolutionDescriptor(&desc) + check_status(status) + return desc + + + cpdef setConvolutionMathType(size_t convDesc, size_t mathType): + status = cudnnSetConvolutionMathType( + convDesc, mathType) + check_status(status) + + + cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: + cdef MathType mathType + status = cudnnGetConvolutionMathType( + convDesc, &mathType) + check_status(status) + return mathType + + + cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): + status = cudnnSetConvolutionGroupCount( + convDesc, groupCount) + check_status(status) + + + cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: + cdef int groupCount + status = cudnnGetConvolutionGroupCount( + convDesc, &groupCount) + check_status(status) + return groupCount + + + cpdef setConvolution2dDescriptor_v4( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode): + status = cudnnSetConvolution2dDescriptor_v4( + convDesc, pad_h, pad_w, u, v, dilation_h, + dilation_w, mode) + check_status(status) + + + cpdef setConvolution2dDescriptor_v5( + size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, + int dilation_w, int mode, size_t computeType): + status = cudnnSetConvolution2dDescriptor_v5( + convDesc, pad_h, pad_w, u, v, dilation_h, + dilation_w, mode, computeType) + check_status(status) + + + cpdef setConvolutionNdDescriptor_v3( + size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, + size_t dilationA, int mode, int dataType): + status = cudnnSetConvolutionNdDescriptor_v3( + convDesc, arrayLength, padA, + filterStrideA, dilationA, mode, + dataType) + check_status(status) + + + cpdef destroyConvolutionDescriptor(size_t convDesc): + status = cudnnDestroyConvolutionDescriptor( + convDesc) + check_status(status) + + + cpdef findConvolutionForwardAlgorithm( + intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, + size_t yDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithm( + handle, xDesc, wDesc, + convDesc, yDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return perfResults + + + cpdef list findConvolutionForwardAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithmEx( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) + for p in perfResults] + + + cpdef list findConvolutionForwardAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, + size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionForwardAlgorithmEx_v7( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + cpdef int getConvolutionForwardAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1: + cdef ConvolutionFwdAlgo algo + status = cudnnGetConvolutionForwardAlgorithm_v6( + handle, srcDesc, + filterDesc, convDesc, + destDesc, preference, + memoryLimitInbytes, &algo) + check_status(status) + return algo + + cpdef list getConvolutionForwardAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnGetConvolutionForwardAlgorithm_v7( + handle, srcDesc, + filterDesc, convDesc, + destDesc, requestedAlgoCount, + &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, + size_t destDesc, int algo) except? -1: + cdef size_t sizeInBytes + status = cudnnGetConvolutionForwardWorkspaceSize( + handle, srcDesc, + filterDesc, convDesc, + destDesc, algo, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef convolutionForward( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t filterDesc, size_t filterData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t destDesc, size_t destData): + _setStream(handle) + with nogil: + status = cudnnConvolutionForward( + handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + destDesc, destData) + check_status(status) + + + cpdef convolutionBackwardBias( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t beta, size_t destDesc, size_t destData): + _setStream(handle) + with nogil: + status = cudnnConvolutionBackwardBias( + handle, alpha, + srcDesc, srcData, beta, + destDesc, destData) + check_status(status) + + + cpdef findConvolutionBackwardFilterAlgorithm( + intptr_t handle, size_t xDesc, size_t dyDesc, size_t convDesc, + size_t dwDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithm( + handle, xDesc, dyDesc, + convDesc, dwDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return perfResults + + cpdef list findConvolutionBackwardFilterAlgorithmEx( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithmEx( + handle, xDesc, x, + dyDesc, dy, convDesc, + dwDesc, dw, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) + for p in perfResults] + + + cpdef list findConvolutionBackwardFilterAlgorithmEx_v7( + intptr_t handle, size_t xDesc, size_t x, size_t dyDesc, size_t dy, + size_t convDesc, size_t dwDesc, size_t dw, int requestedAlgoCount, + size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardFilterAlgorithmEx_v7( + handle, xDesc, x, + dyDesc, dy, convDesc, + dwDesc, dw, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + cpdef int getConvolutionBackwardFilterAlgorithm_v6( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int preference, + size_t memoryLimitInbytes) except? -1: + cdef ConvolutionBwdFilterAlgo algo + status = cudnnGetConvolutionBackwardFilterAlgorithm_v6( + handle, srcDesc, + diffDesc, convDesc, + filterDesc, + preference, + memoryLimitInbytes, &algo) + check_status(status) + return algo + + cpdef list getConvolutionBackwardFilterAlgorithm_v7( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdFilterAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnGetConvolutionBackwardFilterAlgorithm_v7( + handle, srcDesc, diffDesc, + convDesc, gradDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + cpdef Py_ssize_t getConvolutionBackwardFilterWorkspaceSize( + intptr_t handle, size_t srcDesc, size_t diffDesc, size_t convDesc, + size_t filterDesc, int algo) except? -1: + cdef size_t sizeInBytes + status = cudnnGetConvolutionBackwardFilterWorkspaceSize( + handle, srcDesc, + diffDesc, convDesc, + filterDesc, algo, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef convolutionBackwardFilter_v3( + intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData): + _setStream(handle) + with nogil: + status = cudnnConvolutionBackwardFilter_v3( + handle, alpha, + srcDesc, srcData, + diffDesc, diffData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + gradDesc, gradData) + check_status(status) + + + cpdef findConvolutionBackwardDataAlgorithm( + intptr_t handle, size_t wDesc, size_t dyDesc, size_t convDesc, + size_t dxDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardDataAlgorithm( + handle, wDesc, dyDesc, + convDesc, dxDesc, + requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return perfResults + + cpdef list findConvolutionBackwardDataAlgorithmEx( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdDataAlgoPerf] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardDataAlgorithmEx( + handle, wDesc, w, + dyDesc, dy, convDesc, + dxDesc, dx, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) + for p in perfResults] + + + cpdef list findConvolutionBackwardDataAlgorithmEx_v7( + intptr_t handle, size_t wDesc, size_t w, size_t dyDesc, size_t dy, + size_t convDesc, size_t dxDesc, size_t dx, + int requestedAlgoCount, size_t workSpace, size_t workSpaceSizeInBytes): + cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnFindConvolutionBackwardDataAlgorithmEx_v7( + handle, wDesc, w, + dyDesc, dy, convDesc, + dxDesc, dx, + requestedAlgoCount, &returnedAlgoCount, perfResults.data(), + workSpace, workSpaceSizeInBytes) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + cpdef int getConvolutionBackwardDataAlgorithm_v6( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, size_t preference, + size_t memoryLimitInbytes) except? -1: + cdef ConvolutionBwdDataAlgo algo + status = cudnnGetConvolutionBackwardDataAlgorithm_v6( + handle, filterDesc, + diffDesc, convDesc, + gradDesc, preference, + memoryLimitInbytes, &algo) + check_status(status) + return algo + + cpdef list getConvolutionBackwardDataAlgorithm_v7( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int requestedAlgoCount): + cdef vector.vector[ConvolutionBwdDataAlgoPerf_v7] perfResults + cdef int returnedAlgoCount + perfResults.resize(requestedAlgoCount) + status = cudnnGetConvolutionBackwardDataAlgorithm_v7( + handle, filterDesc, + diffDesc, convDesc, + gradDesc, requestedAlgoCount, + &returnedAlgoCount, perfResults.data()) + check_status(status) + perfResults.resize(returnedAlgoCount) + return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, + p.determinism, p.mathType) + for p in perfResults] + + cpdef Py_ssize_t getConvolutionBackwardDataWorkspaceSize( + intptr_t handle, size_t filterDesc, size_t diffDesc, size_t convDesc, + size_t gradDesc, int algo) except? -1: + cdef size_t sizeInBytes + status = cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, filterDesc, + diffDesc, + convDesc, gradDesc, + algo, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef convolutionBackwardData_v3( + intptr_t handle, size_t alpha, size_t filterDesc, size_t filterData, + size_t diffDesc, size_t diffData, size_t convDesc, int algo, + size_t workSpace, size_t workSpaceSizeInBytes, size_t beta, + size_t gradDesc, size_t gradData): + _setStream(handle) + with nogil: + status = cudnnConvolutionBackwardData_v3( + handle, alpha, + filterDesc, filterData, + diffDesc, diffData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + gradDesc, gradData) + check_status(status) + + ############################################################################### # Pooling - int cudnnCreatePoolingDescriptor(PoolingDescriptor* desc) - int cudnnSetPooling2dDescriptor_v4( - PoolingDescriptor poolingDesc, PoolingMode mode, - NanPropagation maxpoolingNanOpt, int windowHeight, int windowWidth, - int verticalPadding, int horizontalPadding, int verticalStride, - int horizontalStride) - int cudnnSetPoolingNdDescriptor_v4( - PoolingDescriptor poolingDesc, PoolingMode mode, - NanPropagation maxpoolingNanOpt, int nbDims, - int* windowDimA, int* paddingA, int* strideA) - int cudnnDestroyPoolingDescriptor(PoolingDescriptor poolingDesc) - int cudnnPoolingForward( - Handle handle, PoolingDescriptor poolingDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor dstDesc, void* dstData) - int cudnnPoolingBackward( - Handle handle, PoolingDescriptor poolingDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor srcDiffDesc, void* srcDiffData, - TensorDescriptor destDesc, void* destData, void* beta, - TensorDescriptor destDiffDesc, void* destDiffData) - + ############################################################################### + + cpdef size_t createPoolingDescriptor() except? 0: + cdef PoolingDescriptor desc + status = cudnnCreatePoolingDescriptor(&desc) + check_status(status) + return desc + + + cpdef setPooling2dDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, + int windowWidth, int verticalPadding, int horizontalPadding, + int verticalStride, int horizontalStride): + status = cudnnSetPooling2dDescriptor_v4( + poolingDesc, mode, + maxpoolingNanOpt, windowHeight, windowWidth, + verticalPadding, horizontalPadding, verticalStride, horizontalStride) + check_status(status) + + + cpdef setPoolingNdDescriptor_v4( + size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, + size_t windowDimA, size_t paddingA, size_t strideA): + status = cudnnSetPoolingNdDescriptor_v4( + poolingDesc, mode, + maxpoolingNanOpt, nbDims, + windowDimA, paddingA, strideA) + check_status(status) + + + cpdef destroyPoolingDescriptor(size_t poolingDesc): + status = cudnnDestroyPoolingDescriptor(poolingDesc) + check_status(status) + + + cpdef poolingForward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): + _setStream(handle) + with nogil: + status = cudnnPoolingForward( + handle, poolingDesc, alpha, + srcDesc, srcData, beta, + dstDesc, dstData) + check_status(status) + + + cpdef poolingBackward( + intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData): + _setStream(handle) + with nogil: + status = cudnnPoolingBackward( + handle, poolingDesc, alpha, + srcDesc, srcData, + srcDiffDesc, srcDiffData, + destDesc, destData, beta, + destDiffDesc, destDiffData) + check_status(status) + + ############################################################################### # Batch Normalization - int cudnnDeriveBNTensorDescriptor( - TensorDescriptor derivedBnDesc, TensorDescriptor xDesc, - BatchNormMode mode) - int cudnnBatchNormalizationForwardTraining( - Handle handle, BatchNormMode mode, - void* alpha, void* beta, TensorDescriptor xDesc, - void* x, TensorDescriptor yDesc, void* y, - TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, - void* bnBias, double exponentialAverageFactor, - void* resultRunningMean, void* resultRunningVariance, - double epsilon, void* resultSaveMean, - void* resultSaveInvVariance) - int cudnnBatchNormalizationForwardInference( - Handle handle, BatchNormMode mode, - void* alpha, void* beta, TensorDescriptor xDesc, - void* x, TensorDescriptor yDesc, void* y, - TensorDescriptor bnScaleBiasMeanVarDesc, void* bnScale, - void* bnBias, void* estimatedMean, void* estimatedVariance, - double epsilon) - int cudnnBatchNormalizationBackward( - Handle handle, BatchNormMode mode, - void* alphaDataDiff, void* betaDataDiff, - void* alphaParamDiff, void* betaParamDiff, - TensorDescriptor xDesc, void* x, - TensorDescriptor dyDesc, void* dy, - TensorDescriptor dxDesc, void* dx, - TensorDescriptor dBnScaleBiasDesc, void* bnScale, - void* dBnScaleResult, void* dBnBiasResult, - double epsilon, void* savedMean, void* savedInvVariance) - - int cudnnBatchNormalizationForwardTrainingEx( - Handle handle, - BatchNormMode mode, BatchNormOps bnOps, - void* alpha, void* beta, - TensorDescriptor xDesc, void* x, - TensorDescriptor zDesc, void* z, - TensorDescriptor yDesc, void* y, - TensorDescriptor bnScaleBiasMeanVarDesc, - void* bnScale, void* bnBias, - double exponentialAverageFactor, - void* resultRunningMean, void* resultRunningVariance, - double epsilon, - void* resultSaveMean, void* resultSaveInvVariance, - ActivationDescriptor activationDesc, - void* workspace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - Handle handle, - BatchNormMode mode, BatchNormOps bnOps, - TensorDescriptor xDesc, - TensorDescriptor zDesc, - TensorDescriptor yDesc, - TensorDescriptor bnScaleBiasMeanVarDesc, - ActivationDescriptor activationDesc, - size_t* sizeInBytes) - int cudnnBatchNormalizationBackwardEx( - Handle handle, - BatchNormMode mode, BatchNormOps bnops, - void* alphaDataDiff, void* betaDataDiff, - void* alphaParamDiff, void* betaParamDiff, - TensorDescriptor xDesc, void* x, - TensorDescriptor yDesc, void* y, - TensorDescriptor dyDesc, void* dy, - TensorDescriptor dzDesc, void* dz, - TensorDescriptor dxDesc, void* dx, - TensorDescriptor dBnScaleBiasDesc, - void* bnScaleData, void* bnBiasData, - void* dBnScaleData, void* dBnBiasData, - double epsilon, - void* savedMean, void* savedInvVariance, - ActivationDescriptor activationDesc, - void* workspace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnGetBatchNormalizationBackwardExWorkspaceSize( - Handle handle, - BatchNormMode mode, - BatchNormOps bnOps, - TensorDescriptor xDesc, - TensorDescriptor yDesc, - TensorDescriptor dyDesc, - TensorDescriptor dzDesc, - TensorDescriptor dxDesc, - TensorDescriptor dBnScaleBiasDesc, - ActivationDescriptor activationDesc, - size_t* sizeInBytes) - int cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - Handle handle, - BatchNormMode mode, - BatchNormOps bnOps, - ActivationDescriptor activationDesc, - TensorDescriptor xDesc, - size_t* sizeInBytes) - - # Activation - int cudnnCreateActivationDescriptor( - ActivationDescriptor* activationDesc) - int cudnnSetActivationDescriptor( - ActivationDescriptor activationDesc, ActivationMode mode, - NanPropagation reluNanOpt, double reluCeiling) - int cudnnDestroyActivationDescriptor( - ActivationDescriptor activationDesc) - int cudnnSoftmaxForward( - Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, - void* alpha, TensorDescriptor srcDesc, void* srcData, - void* beta, TensorDescriptor dstDesc, void* dstData) - int cudnnSoftmaxBackward( - Handle handle, SoftmaxAlgorithm algorithm, SoftmaxMode mode, - void* alpha, TensorDescriptor srcDesc, void* srcData, - TensorDescriptor srcDiffDesc, void* srcDiffData, void* beta, - TensorDescriptor destDiffDesc, void* destDiffData) - int cudnnActivationForward_v4( - Handle handle, ActivationDescriptor activationDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, void* beta, - TensorDescriptor dstDesc, void* dstData) - int cudnnActivationBackward_v4( - Handle handle, ActivationDescriptor activationDesc, void* alpha, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor srcDiffDesc, void* srcDiffData, - TensorDescriptor destDesc, void* destData, void* beta, - TensorDescriptor destDiffDesc, void* destDiffData) - + ############################################################################### + + CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON + + cpdef deriveBNTensorDescriptor( + size_t derivedBnDesc, size_t xDesc, int mode): + status = cudnnDeriveBNTensorDescriptor( + derivedBnDesc, xDesc, + mode) + check_status(status) + + + cpdef batchNormalizationForwardTraining( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationForwardTraining( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, exponentialAverageFactor, + resultRunningMean, resultRunningVariance, + epsilon, resultSaveMean, resultSaveInvVariance) + check_status(status) + + + cpdef batchNormalizationForwardInference( + intptr_t handle, int mode, + size_t alpha, size_t beta, size_t xDesc, + size_t x, size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, size_t bnScale, + size_t bnBias, size_t estimatedMean, size_t estimatedVariance, + double epsilon): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationForwardInference( + handle, mode, + alpha, beta, xDesc, + x, yDesc, y, + bnScaleBiasMeanVarDesc, bnScale, + bnBias, estimatedMean, estimatedVariance, + epsilon) + check_status(status) + + + cpdef batchNormalizationBackward( + intptr_t handle, int mode, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, size_t dyDesc, + size_t dy, size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, size_t bnScale, + size_t dBnScaleResult, size_t dBnBiasResult, + double epsilon, size_t savedMean, size_t savedInvVariance): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationBackward( + handle, mode, + alphaDataDiff, betaDataDiff, + alphaParamDiff, betaParamDiff, + xDesc, x, + dyDesc, dy, + dxDesc, dx, + dBnScaleBiasDesc, bnScale, + dBnScaleResult, dBnBiasResult, + epsilon, savedMean, savedInvVariance) + check_status(status) + + + cpdef batchNormalizationForwardTrainingEx( + intptr_t handle, int mode, int bnOps, + size_t alpha, size_t beta, + size_t xDesc, size_t x, + size_t zDesc, size_t z, + size_t yDesc, size_t y, + size_t bnScaleBiasMeanVarDesc, + size_t bnScale, size_t bnBias, + double exponentialAverageFactor, + size_t resultRunningMean, size_t resultRunningVariance, + double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationForwardTrainingEx( + handle, mode, bnOps, + alpha, beta, + xDesc, x, + zDesc, z, + yDesc, y, + bnScaleBiasMeanVarDesc, + bnScale, bnBias, + exponentialAverageFactor, + resultRunningMean, resultRunningVariance, + epsilon, resultSaveMean, resultSaveInvVariance, + activationDesc, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t zDesc, + size_t yDesc, + size_t bnScaleBiasMeanVarDesc, + size_t activationDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + handle, + mode, bnOps, + xDesc, + zDesc, + yDesc, + bnScaleBiasMeanVarDesc, + activationDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef batchNormalizationBackwardEx( + intptr_t handle, int mode, int bnops, + size_t alphaDataDiff, size_t betaDataDiff, + size_t alphaParamDiff, size_t betaParamDiff, + size_t xDesc, size_t x, + size_t yDesc, size_t y, + size_t dyDesc, size_t dy, + size_t dzDesc, size_t dz, + size_t dxDesc, size_t dx, + size_t dBnScaleBiasDesc, + size_t bnScaleData, size_t bnBiasData, + size_t dBnScaleData, size_t dBnBiasData, + double epsilon, + size_t savedMean, size_t savedInvVariance, + size_t activationDesc, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnBatchNormalizationBackwardEx( + handle, + mode, bnops, + alphaDataDiff, betaDataDiff, + alphaParamDiff, betaParamDiff, + xDesc, x, + yDesc, y, + dyDesc, dy, + dzDesc, dz, + dxDesc, dx, + dBnScaleBiasDesc, + bnScaleData, bnBiasData, + dBnScaleData, dBnBiasData, + epsilon, + savedMean, savedInvVariance, + activationDesc, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( + intptr_t handle, int mode, int bnOps, + size_t xDesc, + size_t yDesc, + size_t dyDesc, + size_t dzDesc, + size_t dxDesc, + size_t dBnScaleBiasDesc, + size_t activationDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetBatchNormalizationBackwardExWorkspaceSize( + handle, + mode, + bnOps, + xDesc, + yDesc, + dyDesc, + dzDesc, + dxDesc, + dBnScaleBiasDesc, + activationDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( + intptr_t handle, int mode, int bnOps, + size_t activationDesc, + size_t xDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + handle, + mode, + bnOps, + activationDesc, + xDesc, + &sizeInBytes) + check_status(status) + return sizeInBytes + + + ############################################################################### # Dropout - int cudnnCreateDropoutDescriptor(DropoutDescriptor* desc) - int cudnnDestroyDropoutDescriptor(DropoutDescriptor dropoutDesc) - int cudnnDropoutGetStatesSize(Handle handle, size_t* sizeInBytes) - int cudnnDropoutGetReserveSpaceSize( - TensorDescriptor xDesc, size_t* sizeInBytes) - int cudnnSetDropoutDescriptor( - DropoutDescriptor dropoutDesc, Handle handle, float dropout, - void* states, size_t stateSizeInBytes, unsigned long long seed) - int cudnnDropoutForward( - Handle handle, DropoutDescriptor dropoutDesc, - TensorDescriptor srcDesc, void* srcData, - TensorDescriptor dstDesc, void* dstData, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnDropoutBackward( - Handle handle, DropoutDescriptor dropoutDesc, - TensorDescriptor dydesc, void* dy, TensorDescriptor dxdesc, - void* dx, void* reserveSpace, size_t reserveSpaceSizeInBytes) - + ############################################################################### + + cpdef size_t createDropoutDescriptor() except? 0: + cdef DropoutDescriptor desc + status = cudnnCreateDropoutDescriptor(&desc) + check_status(status) + return desc + + + cpdef destroyDropoutDescriptor(size_t dropoutDesc): + status = cudnnDestroyDropoutDescriptor(dropoutDesc) + check_status(status) + + + cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1: + cdef size_t sizeInBytes + status = cudnnDropoutGetStatesSize( + handle, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef setDropoutDescriptor( + size_t dropoutDesc, intptr_t handle, float dropout, + size_t states, size_t stateSizeInBytes, unsigned long long seed): + status = cudnnSetDropoutDescriptor( + dropoutDesc, handle, dropout, + states, stateSizeInBytes, seed) + check_status(status) + + + cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnDropoutGetReserveSpaceSize( + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef dropoutForward( + intptr_t handle, size_t dropoutDesc, + size_t srcDesc, size_t srcData, + size_t dstDesc, size_t dstData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnDropoutForward( + handle, dropoutDesc, + srcDesc, srcData, + dstDesc, dstData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef dropoutBackward( + intptr_t handle, size_t dropoutDesc, + size_t dyDesc, size_t dyData, + size_t dxDesc, size_t dxData, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnDropoutBackward( + handle, dropoutDesc, + dyDesc, dyData, + dxDesc, dxData, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + ############################################################################### # CTC - int cudnnCreateCTCLossDescriptor(CTCLossDescriptor* ctcLossDesc) - int cudnnDestroyCTCLossDescriptor(CTCLossDescriptor ctcLossDesc) - int cudnnSetCTCLossDescriptor( - CTCLossDescriptor ctcLossDesc, DataType dataType) - int cudnnGetCTCLossDescriptor( - CTCLossDescriptor ctcLossDesc, DataType* dataType) - int cudnnGetCTCLossWorkspaceSize( - Handle handle, TensorDescriptor probsDesc, - TensorDescriptor gradientsDesc, int* labels, - int* labelLengths, int* inputLengths, CTCLossAlgo algo, - CTCLossDescriptor ctcLossDesc, size_t* sizeInBytes) - int cudnnCTCLoss( - Handle handle, TensorDescriptor probsDesc, - void* probs, int* labels, int* labelLengths, int* inputLengths, - void* costs, TensorDescriptor gradientsDesc, void* gradients, - CTCLossAlgo algo, CTCLossDescriptor ctcLossDesc, - void* workspace, size_t workSpaceSizeInBytes) + ############################################################################### + cpdef size_t createCTCLossDescriptor() except? 0: + cdef CTCLossDescriptor desc + status = cudnnCreateCTCLossDescriptor(&desc) + check_status(status) + return desc + + cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): + status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) + check_status(status) + + cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): + status = cudnnSetCTCLossDescriptor( + ctcLossDesc, dataType) + check_status(status) + + cpdef getCTCLossDescriptor(size_t ctcLossDesc): + cdef DataType compType + status = cudnnGetCTCLossDescriptor( + ctcLossDesc, &compType) + check_status(status) + return compType + + cpdef size_t getCTCLossWorkspaceSize( + intptr_t handle, size_t probsDesc, size_t gradientsDesc, + size_t labels, size_t labelLengths, size_t inputLengths, + int algo, size_t ctcLossDesc) except? 0: + cdef size_t sizeInBytes + status = cudnnGetCTCLossWorkspaceSize( + handle, probsDesc, + gradientsDesc, + labels, labelLengths, inputLengths, + algo, ctcLossDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + cpdef CTCLoss( + intptr_t handle, size_t probsDesc, + size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, + size_t costs, size_t gradientsDesc, size_t gradients, + int algo, size_t ctcLossDesc, + size_t workspace, size_t workSpaceSizeInBytes): + status = cudnnCTCLoss( + handle, probsDesc, probs, + labels, labelLengths, inputLengths, + costs, gradientsDesc, gradients, + algo, ctcLossDesc, + workspace, workSpaceSizeInBytes) + check_status(status) + + + ############################################################################### # RNN - int cudnnCreateRNNDescriptor(RNNDescriptor* rnnDesc) - int cudnnDestroyRNNDescriptor(RNNDescriptor rnnDesc) - int cudnnCreatePersistentRNNPlan( - RNNDescriptor rnnDesc, - const int minibatch, DataType dataType, - PersistentRNNPlan* plan) - int cudnnSetPersistentRNNPlan( - RNNDescriptor rnnDesc, PersistentRNNPlan plan) - int cudnnDestroyPersistentRNNPlan(PersistentRNNPlan plan) - int cudnnSetRNNDescriptor_v5( - RNNDescriptor rnnDesc, int hiddenSize, - int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, - DirectionMode direction, RNNMode mode, DataType dataType) - int cudnnSetRNNDescriptor_v6( - Handle handle, RNNDescriptor rnnDesc, int hiddenSize, - int numLayers, DropoutDescriptor dropoutDesc, RNNInputMode inputMode, - DirectionMode direction, RNNMode mode, RNNAlgo algo, DataType dataType) - int cudnnSetRNNPaddingMode( - RNNDescriptor rnnDesc, RNNPaddingMode paddingMode) - int cudnnGetRNNPaddingMode( - RNNDescriptor rnnDesc, RNNPaddingMode* paddingMode) - int cudnnCreateRNNDataDescriptor(RNNDataDescriptor* RNNDataDesc) - int cudnnDestroyRNNDataDescriptor(RNNDataDescriptor RNNDataDesc) - int cudnnSetRNNDataDescriptor( - RNNDataDescriptor RNNDataDesc, DataType dataType, RNNDataLayout layout, - int maxSeqLength, int batchSize, int vectorSize, - const int seqLengthArray[], void *paddingFill) - int cudnnGetRNNDataDescriptor( - RNNDataDescriptor RNNDataDesc, DataType* dataType, - RNNDataLayout* layout, int* maxSeqLength, int* batchSize, - int* vectorSize, int arrayLengthRequested, int seqLengthArray[], - void* paddingFill) - int cudnnGetRNNWorkspaceSize( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, size_t* sizeInBytes) - int cudnnGetRNNTrainingReserveSize( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, size_t* sizeInBytes) - int cudnnGetRNNParamsSize( - Handle handle, RNNDescriptor rnnDesc, TensorDescriptor xDesc, - size_t* sizeInBytes, DataType dataType) - int cudnnGetRNNLinLayerMatrixParams( - Handle handle, RNNDescriptor rnnDesc, int layer, - TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, - int linLayerID, FilterDescriptor linLayerMatDesc, - void** linLayerMat) - int cudnnGetRNNLinLayerBiasParams( - Handle handle, RNNDescriptor rnnDesc, int layer, - TensorDescriptor xDesc, FilterDescriptor wDesc, void* w, - int linLayerID, FilterDescriptor linLayerBiasDesc, - void** linLayerBias) - int cudnnRNNForwardInference( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, - void* x, TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, - void* cx, FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, - void* y, TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, - void* cy, void* workspace, size_t workSpaceSizeInBytes) - int cudnnRNNForwardTraining( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, void* x, - TensorDescriptor hxDesc, void* hx, TensorDescriptor cxDesc, void* cx, - FilterDescriptor wDesc, void* w, TensorDescriptor* yDesc, void* y, - TensorDescriptor hyDesc, void* hy, TensorDescriptor cyDesc, void* cy, - void* workspace, size_t workSpaceSizeInBytes, void* reserveSpace, - size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardData( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* yDesc, void* y, - TensorDescriptor* dyDesc, void* dy, - TensorDescriptor dhyDesc, void* dhy, - TensorDescriptor dcyDesc, void* dcy, - FilterDescriptor wDesc, void* w, - TensorDescriptor hxDesc, void* hx, - TensorDescriptor cxDesc, void* cx, - TensorDescriptor* dxDesc, void* dx, - TensorDescriptor dhxDesc, void* dhx, - TensorDescriptor dcxDesc, void* dcx, void* workspace, - size_t workSpaceSizeInBytes, void* reserveSpace, - size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardWeights( - Handle handle, RNNDescriptor rnnDesc, int seqLength, - TensorDescriptor* xDesc, void* x, TensorDescriptor hxDesc, void* hx, - TensorDescriptor* yDesc, void* y, - void* workspace, size_t workSpaceSizeInBytes, FilterDescriptor dwDesc, - void* dw, void* reserveSpace, size_t reserveSpaceSizeInBytes) - - int cudnnRNNForwardInferenceEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - FilterDescriptor wDesc, const void* w, - RNNDataDescriptor yDesc, void* y, - TensorDescriptor hyDesc, void* hy, - TensorDescriptor cyDesc, void* cy, - RNNDataDescriptor kDesc, const void* keys, - RNNDataDescriptor cDesc, void* cAttn, - RNNDataDescriptor iDesc, void* iAttn, - RNNDataDescriptor qDesc, void* queries, - void* workSpace, size_t workSpaceSizeInBytes) - int cudnnRNNForwardTrainingEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - FilterDescriptor wDesc, const void* w, - RNNDataDescriptor yDesc, void* y, - TensorDescriptor hyDesc, void* hy, - TensorDescriptor cyDesc, void* cy, - RNNDataDescriptor kDesc, const void* keys, - RNNDataDescriptor cDesc, void* cAttn, - RNNDataDescriptor iDesc, void* iAttn, - RNNDataDescriptor qDesc, void* queries, - void* workSpace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardDataEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor yDesc, const void* y, - RNNDataDescriptor dyDesc, const void* dy, - RNNDataDescriptor dcDesc, const void* dcAttn, - TensorDescriptor dhyDesc, const void* dhy, - TensorDescriptor dcyDesc, const void* dcy, - FilterDescriptor wDesc, const void* w, - TensorDescriptor hxDesc, const void* hx, - TensorDescriptor cxDesc, const void* cx, - RNNDataDescriptor dxDesc, void* dx, - TensorDescriptor dhxDesc, void* dhx, - TensorDescriptor dcxDesc, void* dcx, - RNNDataDescriptor dkDesc, void* dkeys, - void* workSpace, size_t workSpaceSizeInBytes, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - int cudnnRNNBackwardWeightsEx( - Handle handle, RNNDescriptor rnnDesc, - RNNDataDescriptor xDesc, const void* x, - TensorDescriptor hxDesc, const void* hx, - RNNDataDescriptor yDesc, const void* y, - void* workSpace, size_t workSpaceSizeInBytes, - FilterDescriptor dwDesc, void* dw, - void* reserveSpace, size_t reserveSpaceSizeInBytes) - + ############################################################################### + + cpdef size_t createRNNDescriptor() except? 0: + cdef RNNDescriptor desc + status = cudnnCreateRNNDescriptor(&desc) + check_status(status) + return desc + + + cpdef destroyRNNDescriptor(size_t rnnDesc): + status = cudnnDestroyRNNDescriptor(rnnDesc) + check_status(status) + + + cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, + int dataType) except? 0: + cdef PersistentRNNPlan plan + status = cudnnCreatePersistentRNNPlan( + rnnDesc, + minibatch, dataType, &plan) + check_status(status) + return plan + + + cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): + status = cudnnSetPersistentRNNPlan( + rnnDesc, plan) + check_status(status) + + + cpdef destroyPersistentRNNPlan(size_t plan): + status = cudnnDestroyPersistentRNNPlan(plan) + check_status(status) + + + cpdef setRNNDescriptor_v5( + size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int dataType): + status = cudnnSetRNNDescriptor_v5( + rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, dataType) + check_status(status) + + + cpdef setRNNDescriptor_v6( + intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, + size_t dropoutDesc, int inputMode, int direction, int mode, + int algo, int dataType): + status = cudnnSetRNNDescriptor_v6( + handle, rnnDesc, hiddenSize, numLayers, + dropoutDesc, inputMode, + direction, mode, algo, + dataType) + check_status(status) + + + cpdef setRNNPaddingMode( + size_t rnnDesc, int paddingMode): + status = cudnnSetRNNPaddingMode( + rnnDesc, paddingMode) + check_status(status) + + + cpdef getRNNPaddingMode(size_t rnnDesc): + cdef RNNPaddingMode paddingMode + status = cudnnGetRNNPaddingMode( + rnnDesc, &paddingMode) + check_status(status) + return paddingMode + + + cpdef size_t createRNNDataDescriptor() except? 0: + cdef RNNDataDescriptor desc + status = cudnnCreateRNNDataDescriptor(&desc) + check_status(status) + return desc + + + cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): + status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) + check_status(status) + + + cpdef setRNNDataDescriptor( + size_t RNNDataDesc, int dataType, size_t layout, + int maxSeqLength, int batchSize, int vectorSize, + size_t seqLengthArray, size_t paddingFill): + status = cudnnSetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, vectorSize, + seqLengthArray, paddingFill) + check_status(status) + + + cpdef getRNNDataDescriptor( + size_t RNNDataDesc, size_t dataType, + size_t layout, size_t maxSeqLength, size_t batchSize, + size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, + size_t paddingFill): + status = cudnnGetRNNDataDescriptor( + RNNDataDesc, dataType, + layout, maxSeqLength, batchSize, + vectorSize, arrayLengthRequested, seqLengthArray, + paddingFill) + check_status(status) + + + cpdef getRNNWorkspaceSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = cudnnGetRNNWorkspaceSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef getRNNTrainingReserveSize( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): + cdef size_t sizeInBytes + status = cudnnGetRNNTrainingReserveSize( + handle, rnnDesc, seqLength, + xDesc, &sizeInBytes) + check_status(status) + return sizeInBytes + + + cpdef getRNNParamsSize( + intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): + cdef size_t sizeInBytes + status = cudnnGetRNNParamsSize( + handle, rnnDesc, xDesc, + &sizeInBytes, dataType) + check_status(status) + return sizeInBytes + + + cpdef getRNNLinLayerMatrixParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): + status = cudnnGetRNNLinLayerMatrixParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerMatDesc, linLayerMat) + check_status(status) + + + cpdef getRNNLinLayerBiasParams( + intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, + size_t w, int linLayerID, size_t linLayerBiasDesc, + size_t linLayerBias): + status = cudnnGetRNNLinLayerBiasParams( + handle, rnnDesc, layer, + xDesc, wDesc, w, + linLayerID, linLayerBiasDesc, linLayerBias) + check_status(status) + + + cpdef RNNForwardInference( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, + size_t x, size_t hxDesc, size_t hx, size_t cxDesc, + size_t cx, size_t wDesc, size_t w, size_t yDesc, + size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t workspace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardInference( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardTraining( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t wDesc, size_t w, size_t yDesc, size_t y, + size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, + size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardTraining( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardData( + intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, + size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, + size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, + size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, + size_t dcxDesc, size_t dcx, size_t workspace, + size_t workSpaceSizeInBytes, size_t reserveSpace, + size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardData( + handle, rnnDesc, seqLength, + yDesc, y, + dyDesc, dy, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + workspace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardWeights( + intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, + size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeights( + handle, rnnDesc, seqLength, + xDesc, x, + hxDesc, hx, + yDesc, y, + workspace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardInferenceEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardInferenceEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes) + check_status(status) + + + cpdef RNNForwardTrainingEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, + size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, + size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, + size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNForwardTrainingEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + cxDesc, cx, + wDesc, w, + yDesc, y, + hyDesc, hy, + cyDesc, cy, + kDesc, keys, + cDesc, cAttn, + iDesc, iAttn, + qDesc, queries, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardDataEx( + intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, + size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, + size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, + size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, + size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, + size_t dkDesc, size_t dkeys, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardDataEx( + handle, rnnDesc, + yDesc, y, + dyDesc, dy, + dcDesc, dcAttn, + dhyDesc, dhy, + dcyDesc, dcy, + wDesc, w, + hxDesc, hx, + cxDesc, cx, + dxDesc, dx, + dhxDesc, dhx, + dcxDesc, dcx, + dkDesc, dkeys, + workSpace, workSpaceSizeInBytes, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + cpdef RNNBackwardWeightsEx( + intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, + size_t hxDesc, size_t hx, size_t yDesc, size_t y, + size_t workSpace, size_t workSpaceSizeInBytes, + size_t dwDesc, size_t dw, + size_t reserveSpace, size_t reserveSpaceSizeInBytes): + _setStream(handle) + with nogil: + status = cudnnRNNBackwardWeightsEx( + handle, rnnDesc, + xDesc, x, + hxDesc, hx, + yDesc, y, + workSpace, workSpaceSizeInBytes, + dwDesc, dw, + reserveSpace, reserveSpaceSizeInBytes) + check_status(status) + + + ############################################################################### # Spatial Transformer - int cudnnCreateSpatialTransformerDescriptor( - SpatialTransformerDescriptor* stDesc) - int cudnnDestroySpatialTransformerDescriptor( - SpatialTransformerDescriptor stDesc) - int cudnnSetSpatialTransformerNdDescriptor( - SpatialTransformerDescriptor stDesc, SamplerType samplerType, - DataType dataType, int nbDims, int dimA[]) - int cudnnSpatialTfGridGeneratorForward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* theta, void* grid) - int cudnnSpatialTfGridGeneratorBackward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* dgrid, void* dtheta) - int cudnnSpatialTfSamplerForward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* alpha, TensorDescriptor xDesc, void* x, - void* grid, void* beta, TensorDescriptor yDesc, void* y) - int cudnnSpatialTfSamplerBackward( - Handle handle, SpatialTransformerDescriptor stDesc, - void* alpha, TensorDescriptor xDesc, void* x, void* beta, - TensorDescriptor dxDesc, void* dx, void* alphaDgrid, - TensorDescriptor dyDesc, void* dy, void* grid, - void* betaDgrid, void* dgrid) - + ############################################################################### + + cpdef size_t createSpatialTransformerDescriptor() except? 0: + cdef SpatialTransformerDescriptor stDesc + status = cudnnCreateSpatialTransformerDescriptor(&stDesc) + check_status(status) + return stDesc + + + cpdef destroySpatialTransformerDescriptor(size_t stDesc): + status = cudnnDestroySpatialTransformerDescriptor( + stDesc) + check_status(status) + + + cpdef setSpatialTransformerDescriptor( + size_t stDesc, size_t samplerType, int dataType, + int nbDims, size_t dimA): + status = cudnnSetSpatialTransformerNdDescriptor( + stDesc, samplerType, + dataType, nbDims, dimA) + check_status(status) + + + cpdef spatialTfGridGeneratorForward( + intptr_t handle, size_t stDesc, size_t theta, size_t grid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorForward( + handle, stDesc, + theta, grid) + check_status(status) + + + cpdef spatialTfGridGeneratorBackward( + intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): + _setStream(handle) + with nogil: + status = cudnnSpatialTfGridGeneratorBackward( + handle, stDesc, + dgrid, dtheta) + check_status(status) + + + cpdef spatialTfSamplerForward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerForward( + handle, stDesc, + alpha, xDesc, x, grid, + beta, yDesc, y) + check_status(status) + + + cpdef spatialTfSamplerBackward( + intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, + size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, + size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): + _setStream(handle) + with nogil: + status = cudnnSpatialTfSamplerBackward( + handle, stDesc, + alpha, xDesc, x, beta, + dxDesc, dx, alphaDgrid, + dyDesc, dy, grid, + betaDgrid, dgrid) + check_status(status) + + ############################################################################### # Fused Ops - int cudnnCreateFusedOpsConstParamPack( - FusedOpsConstParamPack* constPack, int ops) - int cudnnDestroyFusedOpsConstParamPack(FusedOpsConstParamPack constPack) - int cudnnSetFusedOpsConstParamPackAttribute( - FusedOpsConstParamPack constPack, FusedOpsConstParamLabel paramLabel, - const void *param) - int cudnnGetFusedOpsConstParamPackAttribute( - const FusedOpsConstParamPack constPack, - FusedOpsConstParamLabel paramLabel, void *param, int *isNULL) - int cudnnCreateFusedOpsVariantParamPack( - FusedOpsVariantParamPack *varPack, FusedOps ops) - int cudnnDestroyFusedOpsVariantParamPack(FusedOpsVariantParamPack varPack) - int cudnnSetFusedOpsVariantParamPackAttribute( - FusedOpsVariantParamPack varPack, FusedOpsVariantParamLabel paramLabel, - void *ptr) - int cudnnGetFusedOpsVariantParamPackAttribute( - const FusedOpsVariantParamPack varPack, - FusedOpsVariantParamLabel paramLabel, void *ptr) - int cudnnCreateFusedOpsPlan(FusedOpsPlan *plan, FusedOps ops) - int cudnnDestroyFusedOpsPlan(FusedOpsPlan plan) - int cudnnMakeFusedOpsPlan( - Handle handle, FusedOpsPlan plan, - const FusedOpsConstParamPack constPack, size_t *workspaceSizeInBytes) - int cudnnFusedOpsExecute( - Handle handle, const FusedOpsPlan plan, - FusedOpsVariantParamPack varPack) - - # Build-time version - int CUDNN_VERSION - - # Constants - double _CUDNN_BN_MIN_EPSILON 'CUDNN_BN_MIN_EPSILON' - + ############################################################################### + + cpdef createFusedOpsConstParamPack(int ops): + cdef FusedOpsConstParamPack constPack + with nogil: + status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) + check_status(status) + return constPack + + cpdef destroyFusedOpsConstParamPack(size_t constPack): + with nogil: + status = cudnnDestroyFusedOpsConstParamPack( + constPack) + check_status(status) + + cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + with nogil: + status = cudnnSetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param) + check_status(status) + + cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, + size_t param): + cdef int isNULL = 0 + with nogil: + status = cudnnGetFusedOpsConstParamPackAttribute( + constPack, + paramLabel, param, &isNULL) + check_status(status) + return isNULL + + cpdef createFusedOpsVariantParamPack(int ops): + cdef FusedOpsVariantParamPack varPack + with nogil: + status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) + check_status(status) + return varPack + + cpdef destroyFusedOpsVariantParamPack(size_t varPack): + with nogil: + status = cudnnDestroyFusedOpsVariantParamPack( + varPack) + check_status(status) + + cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnSetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + + cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, + size_t ptr): + with nogil: + status = cudnnGetFusedOpsVariantParamPackAttribute( + varPack, + paramLabel, ptr) + check_status(status) + + cpdef createFusedOpsPlan(int ops): + cdef FusedOpsPlan plan + with nogil: + status = cudnnCreateFusedOpsPlan(&plan, ops) + check_status(status) + return plan + + cpdef destroyFusedOpsPlan(size_t plan): + with nogil: + status = cudnnDestroyFusedOpsPlan(plan) + check_status(status) + + cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): + cdef size_t workspaceSizeInBytes + _setStream(handle) + with nogil: + status = cudnnMakeFusedOpsPlan(handle, plan, + constPack, + &workspaceSizeInBytes) + check_status(status) + return workspaceSizeInBytes + + cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): + _setStream(handle) + with nogil: + status = cudnnFusedOpsExecute(handle, plan, + varPack) + check_status(status) cdef class CuDNNAlgoPerf: @@ -748,9 +2606,7 @@ cdef class CuDNNAlgoPerf: self.memory = memory self.determinism = determinism self.mathType = mathType - - -############################################################################### +############################################################################ # Error handling ############################################################################### @@ -758,7 +2614,10 @@ class CuDNNError(RuntimeError): def __init__(self, int status): self.status = status - msg = cudnnGetErrorString(status) + IF CUPY_HIP_VERSION != 0: + msg = miopenGetErrorString(status) + ELSE: + msg = cudnnGetErrorString(status) super(CuDNNError, self).__init__( 'cuDNN Error: {}'.format(msg.decode())) self._infos = [] @@ -791,7 +2650,10 @@ cpdef inline check_status(int status): ############################################################################### def get_build_version(): - return CUDNN_VERSION + IF CUPY_HIP_VERSION != 0: + return CUPY_HIP_VERSION + ELSE: + return CUDNN_VERSION ############################################################################### @@ -799,20 +2661,24 @@ def get_build_version(): ############################################################################### cpdef size_t getVersion() except? 0: - return cudnnGetVersion() - - -############################################################################### -# Runtime error checking -############################################################################### - -cpdef queryRuntimeError(intptr_t handle, int mode): - cdef Status rstatus - with nogil: - status = cudnnQueryRuntimeError(handle, &rstatus, - mode, 0) - check_status(status) - return rstatus + IF CUPY_HIP_VERSION != 0: + return CUPY_HIP_VERSION + ELSE: + return cudnnGetVersion() + + +IF CUPY_HIP_VERSION == 0: + ############################################################################### + # Runtime error checking + ############################################################################### + + cpdef queryRuntimeError(intptr_t handle, int mode): + cdef Status rstatus + with nogil: + status = cudnnQueryRuntimeError(handle, &rstatus, + mode, 0) + check_status(status) + return rstatus ############################################################################### @@ -822,16 +2688,22 @@ cpdef queryRuntimeError(intptr_t handle, int mode): cpdef intptr_t create() except? 0: cdef Handle handle with nogil: - status = cudnnCreate(&handle) + IF CUPY_HIP_VERSION != 0: + status = miopenCreate(&handle) + ELSE: + status = cudnnCreate(&handle) check_status(status) return handle cpdef destroy(intptr_t handle): with nogil: - status = cudnnDestroy(handle) + IF CUPY_HIP_VERSION != 0: + status = miopenDestroy(handle) + ELSE: + status = cudnnDestroy(handle) check_status(status) - + cpdef setStream(intptr_t handle, size_t stream): # TODO(leofang): The support of stream capture is not mentioned at all in @@ -840,14 +2712,20 @@ cpdef setStream(intptr_t handle, size_t stream): raise NotImplementedError( 'calling cuDNN API during stream capture is currently ' 'unsupported') - - status = cudnnSetStream(handle, stream) + IF CUPY_HIP_VERSION != 0: + status = miopenSetStream(handle, stream) + ELSE: + status = cudnnSetStream(handle, stream) check_status(status) cpdef size_t getStream(intptr_t handle) except? 0: - cdef driver.Stream stream - status = cudnnGetStream(handle, &stream) + IF CUPY_HIP_VERSION != 0: + cdef Stream stream + status = miopenGetStream(handle, &stream) + ELSE: + cdef driver.Stream stream + status = cudnnGetStream(handle, &stream) check_status(status) return stream @@ -856,218 +2734,195 @@ cdef _setStream(intptr_t handle): """Set current stream""" setStream(handle, stream_module.get_current_stream_ptr()) + ############################################################################### # Tensor manipulation ############################################################################### cpdef size_t createTensorDescriptor() except? 0: cdef TensorDescriptor descriptor - status = cudnnCreateTensorDescriptor(&descriptor) + IF CUPY_HIP_VERSION != 0: + status = miopenCreateTensorDescriptor(&descriptor) + ELSE: + status = cudnnCreateTensorDescriptor(&descriptor) check_status(status) return descriptor cpdef setTensor4dDescriptor(size_t tensorDesc, int format, int dataType, int n, int c, int h, int w): - status = cudnnSetTensor4dDescriptor( - tensorDesc, format, - dataType, n, c, h, w) + IF CUPY_HIP_VERSION != 0: + status = miopenSet4dTensorDescriptor( + tensorDesc, + dataType, n, c, h, w) + ELSE: + status = cudnnSetTensor4dDescriptor( + tensorDesc, format, + dataType, n, c, h, w) check_status(status) cpdef setTensor4dDescriptorEx(size_t tensorDesc, int dataType, int n, int c, int h, int w, int nStride, int cStride, int hStride, int wStride): - status = cudnnSetTensor4dDescriptorEx( - tensorDesc, dataType, n, c, h, w, - nStride, cStride, hStride, wStride) + IF CUPY_HIP_VERSION != 0: + status = miopenSet4dTensorDescriptorEx( + tensorDesc, dataType, n, c, h, w, + nStride, cStride, hStride, wStride) + ELSE: + status = cudnnSetTensor4dDescriptorEx( + tensorDesc, dataType, n, c, h, w, + nStride, cStride, hStride, wStride) check_status(status) cpdef tuple getTensor4dDescriptor(size_t tensorDesc): cdef DataType dataType cdef int n, c, h, w, nStride, cStride, hStride, wStride - status = cudnnGetTensor4dDescriptor( - tensorDesc, &dataType, - &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) + IF CUPY_HIP_VERSION != 0: + status = miopenGet4dTensorDescriptor( + tensorDesc, &dataType, + &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) + ELSE: + status = cudnnGetTensor4dDescriptor( + tensorDesc, &dataType, + &n, &c, &h, &w, &nStride, &cStride, &hStride, &wStride) check_status(status) return dataType, n, c, h, w, nStride, cStride, hStride, wStride -cpdef setTensorNdDescriptor(size_t tensorDesc, int dataType, int nbDims, - size_t dimA, size_t strideA): - status = cudnnSetTensorNdDescriptor( - tensorDesc, dataType, nbDims, - dimA, strideA) - check_status(status) - - cpdef destroyTensorDescriptor(size_t tensorDesc): - status = cudnnDestroyTensorDescriptor(tensorDesc) + IF CUPY_HIP_VERSION != 0: + status = miopenDestroyTensorDescriptor(tensorDesc) + ELSE: + status = cudnnDestroyTensorDescriptor(tensorDesc) check_status(status) - -cpdef addTensor_v3(intptr_t handle, size_t alpha, size_t bDesc, - size_t b, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnAddTensor_v3( - handle, alpha, bDesc, - b, beta, yDesc, y) - check_status(status) - - ############################################################################### -# Tensor operations +# Activation ############################################################################### -cpdef size_t createOpTensorDescriptor() except? 0: - cdef OpTensorDescriptor opTensorDesc - status = cudnnCreateOpTensorDescriptor(&opTensorDesc) - check_status(status) - return opTensorDesc - - -cpdef setOpTensorDescriptor(size_t opTensorDesc, int opTensorOp, - int opTensorCompType, int opTensorNanOpt): - status = cudnnSetOpTensorDescriptor( - opTensorDesc, opTensorOp, - opTensorCompType, opTensorNanOpt) +cpdef size_t createActivationDescriptor() except? 0: + cdef ActivationDescriptor activationDesc + IF CUPY_HIP_VERSION != 0: + status = miopenCreateActivationDescriptor(&activationDesc) + ELSE: + status = cudnnCreateActivationDescriptor(&activationDesc) check_status(status) + return activationDesc -cpdef getOpTensorDescriptor(size_t opTensorDesc): - cdef OpTensorOp opTensorOp - cdef DataType opTensorCompType - cdef NanPropagation opTensorNanOpt - status = cudnnGetOpTensorDescriptor( - opTensorDesc, &opTensorOp, &opTensorCompType, - &opTensorNanOpt) +cpdef setActivationDescriptor( + size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): + IF CUPY_HIP_VERSION != 0: + status = miopenSetActivationDescriptor( + activationDesc, mode, 1.0, 0.0, 0.0) + ELSE: + status = cudnnSetActivationDescriptor( + activationDesc, mode, + reluNanOpt, reluCeiling) check_status(status) - return opTensorOp, opTensorCompType, opTensorNanOpt -cpdef destroyOpTensorDescriptor(size_t opTensorDesc): - status = cudnnDestroyOpTensorDescriptor(opTensorDesc) +cpdef destroyActivationDescriptor(size_t activationDesc): + IF CUPY_HIP_VERSION != 0: + status = miopenDestroyActivationDescriptor( + activationDesc) + ELSE: + status = cudnnDestroyActivationDescriptor( + activationDesc) check_status(status) -cpdef opTensor(intptr_t handle, size_t opTensorDesc, size_t alpha1, - size_t aDesc, size_t A, size_t alpha2, size_t bDesc, - size_t B, size_t beta, size_t cDesc, size_t C): +cpdef softmaxForward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): _setStream(handle) with nogil: - status = cudnnOpTensor( - handle, opTensorDesc, alpha1, - aDesc, A, alpha2, - bDesc, B, beta, - cDesc, C) - check_status(status) - - -############################################################################### -# Tensor reductions -############################################################################### - -cpdef size_t createReduceTensorDescriptor() except? 0: - cdef ReduceTensorDescriptor reduceTensorDesc - status = cudnnCreateReduceTensorDescriptor(&reduceTensorDesc) - check_status(status) - return reduceTensorDesc - -cpdef setReduceTensorDescriptor( - size_t reduceTensorDesc, int reduceTensorOp, int reduceTensorCompType, - int reduceTensorNanOpt, int reduceTensorIndices, - int reduceTensorIndicesType): - status = cudnnSetReduceTensorDescriptor( - reduceTensorDesc, - reduceTensorOp, - reduceTensorCompType, reduceTensorNanOpt, - reduceTensorIndices, - reduceTensorIndicesType) - check_status(status) - - -cpdef getReduceTensorDescriptor(size_t reduceTensorDesc): - cdef ReduceTensorOp redOp - cdef DataType redCompType - cdef NanPropagation redNanOpt - cdef ReduceTensorIndices redIndices - cdef IndicesType redIndicesType - status = cudnnGetReduceTensorDescriptor( - reduceTensorDesc, &redOp, - &redCompType, &redNanOpt, &redIndices, &redIndicesType) - check_status(status) - return redOp, redCompType, redNanOpt, redIndices, redIndicesType - - -cpdef destroyReduceTensorDescriptor(size_t reduceTensorDesc): - status = cudnnDestroyReduceTensorDescriptor( - reduceTensorDesc) - check_status(status) - - -cpdef size_t getReductionIndicesSize(intptr_t handle, size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetReductionIndicesSize( - handle, reduceTensorDesc, - aDesc, cDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getReductionWorkspaceSize(intptr_t handle, - size_t reduceTensorDesc, - size_t aDesc, size_t cDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetReductionWorkspaceSize( - handle, reduceTensorDesc, - aDesc, cDesc, - &sizeInBytes) + IF CUPY_HIP_VERSION != 0: + status = miopenSoftmaxForward( + handle, alpha, srcDesc, srcData, + beta, dstDesc, dstData) + ELSE: + status = cudnnSoftmaxForward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + beta, dstDesc, dstData) check_status(status) - return sizeInBytes -cpdef reduceTensor(intptr_t handle, size_t reduceTensorDesc, size_t indices, - size_t indicesSizeInBytes, size_t workspace, - size_t workspaceSizeInBytes, size_t alpha, size_t aDesc, - size_t A, size_t beta, size_t cDesc, size_t C): +cpdef softmaxBackward( + intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, + size_t destDiffDesc, size_t destDiffData): _setStream(handle) with nogil: - status = cudnnReduceTensor( - handle, reduceTensorDesc, - indices, indicesSizeInBytes, workspace, - workspaceSizeInBytes, alpha, aDesc, - A, beta, cDesc, C) + IF CUPY_HIP_VERSION != 0: + status = miopenSoftmaxBackward( + handle, alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) + ELSE: + status = cudnnSoftmaxBackward( + handle, algorithm, mode, + alpha, srcDesc, srcData, + srcDiffDesc, srcDiffData, beta, + destDiffDesc, destDiffData) check_status(status) -cpdef setTensor(intptr_t handle, size_t yDesc, size_t y, size_t valuePtr): +cpdef activationForward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t beta, size_t dstDesc, size_t dstData): _setStream(handle) with nogil: - status = cudnnSetTensor( - handle, yDesc, y, - valuePtr) + IF CUPY_HIP_VERSION != 0: + status = miopenActivationForward( + handle, activationDesc, alpha, + srcDesc, srcData, beta, + dstDesc, dstData) + ELSE: + status = cudnnActivationForward_v4( + handle, activationDesc, alpha, + srcDesc, srcData, beta, + dstDesc, dstData) check_status(status) -cpdef scaleTensor(intptr_t handle, size_t yDesc, size_t y, size_t alpha): +cpdef activationBackward_v4( + intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, + size_t srcData, size_t srcDiffDesc, size_t srcDiffData, + size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, + size_t destDiffData): _setStream(handle) with nogil: - status = cudnnScaleTensor( - handle, yDesc, y, - alpha) + IF CUPY_HIP_VERSION != 0: + status = miopenActivationBackward( + handle, activationDesc, alpha, + srcDesc, srcData, + srcDiffDesc, srcDiffData, + destDesc, destData, beta, + destDiffDesc, destDiffData) + ELSE: + status = cudnnActivationBackward_v4( + handle, activationDesc, alpha, + srcDesc, srcData, + srcDiffDesc, srcDiffData, + destDesc, destData, beta, + destDiffDesc, destDiffData) check_status(status) - ############################################################################### # Filter manipulation ############################################################################### cpdef size_t createFilterDescriptor() except? 0: - cdef FilterDescriptor desc - status = cudnnCreateFilterDescriptor(&desc) + IF CUPY_HIP_VERSION != 0: + cdef TensorDescriptor desc + status = miopenCreateTensorDescriptor(&desc) + ELSE: + cdef FilterDescriptor desc + status = cudnnCreateFilterDescriptor(&desc) check_status(status) return desc @@ -1075,127 +2930,141 @@ cpdef size_t createFilterDescriptor() except? 0: cpdef setFilter4dDescriptor_v4( size_t filterDesc, int dataType, int format, int k, int c, int h, int w): - status = cudnnSetFilter4dDescriptor_v4( - filterDesc, dataType, - format, k, c, h, w) + IF CUPY_HIP_VERSION != 0: + status = miopenSet4dTensorDescriptor( + filterDesc, dataType, + k, c, h, w) + ELSE: + status = cudnnSetFilter4dDescriptor_v4( + filterDesc, dataType, + format, k, c, h, w) check_status(status) cpdef setFilterNdDescriptor_v4( size_t filterDesc, int dataType, int format, int nbDims, size_t filterDimA): - status = cudnnSetFilterNdDescriptor_v4( - filterDesc, dataType, - format, nbDims, filterDimA) + IF CUPY_HIP_VERSION != 0: + status = miopenSetTensorDescriptor( + filterDesc, dataType, + nbDims, filterDimA, NULL) #TODO miopenSetTensorDescriptor takes stride as input now set to NULL, confirm the value of stride + ELSE: + status = cudnnSetFilterNdDescriptor_v4( + filterDesc, dataType, + format, nbDims, filterDimA) check_status(status) - +""" cpdef getFilterNdDescriptor(size_t wDesc, int nbDimsRequested): cdef DataType dataType cdef TensorFormat format cdef int nbDims cdef vector.vector[int] filterDimA filterDimA.resize(nbDimsRequested) - - status = cudnnGetFilterNdDescriptor_v4( - wDesc, nbDimsRequested, &dataType, - &format, &nbDims, filterDimA.data()) + IF CUPY_HIP_VERSION != 0: + status = miopenGetTensorDescriptor( + wDesc, &dataType, + &nbDims, filterDimA.data(), NULL) + ELSE: + status = cudnnGetFilterNdDescriptor_v4( + wDesc, nbDimsRequested, &dataType, + &format, &nbDims, filterDimA.data()) check_status(status) return dataType, format, nbDims, tuple(filterDimA) - +""" cpdef destroyFilterDescriptor(size_t filterDesc): - status = cudnnDestroyFilterDescriptor(filterDesc) + IF CUPY_HIP_VERSION != 0: + status = miopenDestroyTensorDescriptor(filterDesc) + ELSE: + status = cudnnDestroyFilterDescriptor(filterDesc) check_status(status) - ############################################################################### # Convolution ############################################################################### cpdef size_t createConvolutionDescriptor() except? 0: cdef ConvolutionDescriptor desc - status = cudnnCreateConvolutionDescriptor(&desc) + IF CUPY_HIP_VERSION != 0: + status = miopenCreateConvolutionDescriptor(&desc) + ELSE: + status = cudnnCreateConvolutionDescriptor(&desc) check_status(status) return desc - -cpdef setConvolutionMathType(size_t convDesc, size_t mathType): - status = cudnnSetConvolutionMathType( - convDesc, mathType) - check_status(status) - - -cpdef size_t getConvolutionMathType(size_t convDesc) except? 0: - cdef MathType mathType - status = cudnnGetConvolutionMathType( - convDesc, &mathType) - check_status(status) - return mathType - - cpdef setConvolutionGroupCount(size_t convDesc, int groupCount): - status = cudnnSetConvolutionGroupCount( - convDesc, groupCount) + IF CUPY_HIP_VERSION != 0: + status = miopenSetConvolutionGroupCount( + convDesc, groupCount) + ELSE: + status = cudnnSetConvolutionGroupCount( + convDesc, groupCount) check_status(status) - +""" cpdef int getConvolutionGroupCount(size_t convDesc) except? -1: cdef int groupCount - status = cudnnGetConvolutionGroupCount( - convDesc, &groupCount) + IF CUPY_HIP_VERSION != 0: + status = miopenGetConvolutionGroupCount( + convDesc, &groupCount) + ELSE: + status = cudnnGetConvolutionGroupCount( + convDesc, &groupCount) check_status(status) return groupCount - +""" cpdef setConvolution2dDescriptor_v4( size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, int dilation_w, int mode): - status = cudnnSetConvolution2dDescriptor_v4( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode) + IF CUPY_HIP_VERSION != 0: + status = miopenInitConvolutionDescriptor( + convDesc, mode, pad_h, pad_w, + u, v, dilation_h,dilation_w) + ELSE: + status = cudnnSetConvolution2dDescriptor_v4( + convDesc, pad_h, pad_w, u, v, dilation_h, + dilation_w, mode) check_status(status) cpdef setConvolution2dDescriptor_v5( size_t convDesc, int pad_h, int pad_w, int u, int v, int dilation_h, int dilation_w, int mode, size_t computeType): - status = cudnnSetConvolution2dDescriptor_v5( - convDesc, pad_h, pad_w, u, v, dilation_h, - dilation_w, mode, computeType) + IF CUPY_HIP_VERSION != 0: + status = miopenInitConvolutionDescriptor( + convDesc, mode, pad_h, pad_w, + u, v, dilation_h,dilation_w) + ELSE: + status = cudnnSetConvolution2dDescriptor_v5( + convDesc, pad_h, pad_w, u, v, dilation_h, + dilation_w, mode, computeType) check_status(status) cpdef setConvolutionNdDescriptor_v3( size_t convDesc, int arrayLength, size_t padA, size_t filterStrideA, size_t dilationA, int mode, int dataType): - status = cudnnSetConvolutionNdDescriptor_v3( - convDesc, arrayLength, padA, - filterStrideA, dilationA, mode, - dataType) + IF CUPY_HIP_VERSION != 0: + status = miopenInitConvolutionNdDescriptor( + convDesc, arrayLength, padA, + filterStrideA, dilationA, mode) + ELSE: + status = cudnnSetConvolutionNdDescriptor_v3( + convDesc, arrayLength, padA, + filterStrideA, dilationA, mode, + dataType) check_status(status) cpdef destroyConvolutionDescriptor(size_t convDesc): - status = cudnnDestroyConvolutionDescriptor( - convDesc) - check_status(status) - - -cpdef findConvolutionForwardAlgorithm( - intptr_t handle, size_t xDesc, size_t wDesc, size_t convDesc, - size_t yDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithm( - handle, xDesc, wDesc, - convDesc, yDesc, - requestedAlgoCount, &returnedAlgoCount, perfResults.data()) + IF CUPY_HIP_VERSION != 0: + status = miopenDestroyConvolutionDescriptor(convDesc) + ELSE: + status = cudnnDestroyConvolutionDescriptor( + convDesc) check_status(status) - perfResults.resize(returnedAlgoCount) - return perfResults - cpdef list findConvolutionForwardAlgorithmEx( intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, @@ -1204,18 +3073,26 @@ cpdef list findConvolutionForwardAlgorithmEx( cdef vector.vector[ConvolutionFwdAlgoPerf] perfResults cdef int returnedAlgoCount perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) + IF CUPY_HIP_VERSION != 0: + status = miopenFindConvolutionForwardAlgorithm( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes, True) + ELSE: + status = cudnnFindConvolutionForwardAlgorithmEx( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes) check_status(status) perfResults.resize(returnedAlgoCount) return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, -1, -1) for p in perfResults] - +""" cpdef list findConvolutionForwardAlgorithmEx_v7( intptr_t handle, size_t xDesc, size_t x, size_t wDesc, size_t w, size_t convDesc, size_t yDesc, size_t y, int requestedAlgoCount, @@ -1223,58 +3100,41 @@ cpdef list findConvolutionForwardAlgorithmEx_v7( cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults cdef int returnedAlgoCount perfResults.resize(requestedAlgoCount) - status = cudnnFindConvolutionForwardAlgorithmEx_v7( - handle, xDesc, x, - wDesc, w, convDesc, - yDesc, y, requestedAlgoCount, - &returnedAlgoCount, perfResults.data(), workSpace, - workSpaceSizeInBytes) - check_status(status) - perfResults.resize(returnedAlgoCount) - return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, - p.determinism, p.mathType) - for p in perfResults] - - -cpdef int getConvolutionForwardAlgorithm_v6( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int preference, size_t memoryLimitInbytes) except? -1: - cdef ConvolutionFwdAlgo algo - status = cudnnGetConvolutionForwardAlgorithm_v6( - handle, srcDesc, - filterDesc, convDesc, - destDesc, preference, - memoryLimitInbytes, &algo) - check_status(status) - return algo - - -cpdef list getConvolutionForwardAlgorithm_v7( - intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, - size_t destDesc, int requestedAlgoCount): - cdef vector.vector[ConvolutionFwdAlgoPerf_v7] perfResults - cdef int returnedAlgoCount - perfResults.resize(requestedAlgoCount) - status = cudnnGetConvolutionForwardAlgorithm_v7( - handle, srcDesc, - filterDesc, convDesc, - destDesc, requestedAlgoCount, - &returnedAlgoCount, perfResults.data()) + IF CUPY_HIP_VERSION != 0: + status = miopenFindConvolutionForwardAlgorithm( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes, true) + ELSE: + status = cudnnFindConvolutionForwardAlgorithmEx_v7( + handle, xDesc, x, + wDesc, w, convDesc, + yDesc, y, requestedAlgoCount, + &returnedAlgoCount, perfResults.data(), workSpace, + workSpaceSizeInBytes) check_status(status) perfResults.resize(returnedAlgoCount) return [CuDNNAlgoPerf(p.algo, p.status, p.time, p.memory, p.determinism, p.mathType) for p in perfResults] - +""" cpdef Py_ssize_t getConvolutionForwardWorkspaceSize( intptr_t handle, size_t srcDesc, size_t filterDesc, size_t convDesc, size_t destDesc, int algo) except? -1: cdef size_t sizeInBytes - status = cudnnGetConvolutionForwardWorkspaceSize( - handle, srcDesc, - filterDesc, convDesc, - destDesc, algo, &sizeInBytes) + IF CUPY_HIP_VERSION != 0: + status = miopenConvolutionForwardGetWorkSpaceSize( + handle, srcDesc, + filterDesc, convDesc, + destDesc, &sizeInBytes) + ELSE: + status = cudnnGetConvolutionForwardWorkspaceSize( + handle, srcDesc, + filterDesc, convDesc, + destDesc, algo, &sizeInBytes) check_status(status) return sizeInBytes @@ -1286,16 +3146,25 @@ cpdef convolutionForward( size_t destDesc, size_t destData): _setStream(handle) with nogil: - status = cudnnConvolutionForward( - handle, alpha, - srcDesc, srcData, - filterDesc, filterData, - convDesc, algo, - workSpace, workSpaceSizeInBytes, beta, - destDesc, destData) - check_status(status) - - + IF CUPY_HIP_VERSION != 0: + status = miopenConvolutionForward( + handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + beta, destDesc, destData, + workSpace, workSpaceSizeInBytes) + ELSE: + status = cudnnConvolutionForward( + handle, alpha, + srcDesc, srcData, + filterDesc, filterData, + convDesc, algo, + workSpace, workSpaceSizeInBytes, beta, + destDesc, destData) + check_status(status) + +""" cpdef convolutionBackwardBias( intptr_t handle, size_t alpha, size_t srcDesc, size_t srcData, size_t beta, size_t destDesc, size_t destData): @@ -1538,1004 +3407,4 @@ cpdef convolutionBackwardData_v3( workSpace, workSpaceSizeInBytes, beta, gradDesc, gradData) check_status(status) - -############################################################################### -# Pooling -############################################################################### - -cpdef size_t createPoolingDescriptor() except? 0: - cdef PoolingDescriptor desc - status = cudnnCreatePoolingDescriptor(&desc) - check_status(status) - return desc - - -cpdef setPooling2dDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int windowHeight, - int windowWidth, int verticalPadding, int horizontalPadding, - int verticalStride, int horizontalStride): - status = cudnnSetPooling2dDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, windowHeight, windowWidth, - verticalPadding, horizontalPadding, verticalStride, horizontalStride) - check_status(status) - - -cpdef setPoolingNdDescriptor_v4( - size_t poolingDesc, int mode, int maxpoolingNanOpt, int nbDims, - size_t windowDimA, size_t paddingA, size_t strideA): - status = cudnnSetPoolingNdDescriptor_v4( - poolingDesc, mode, - maxpoolingNanOpt, nbDims, - windowDimA, paddingA, strideA) - check_status(status) - - -cpdef destroyPoolingDescriptor(size_t poolingDesc): - status = cudnnDestroyPoolingDescriptor(poolingDesc) - check_status(status) - - -cpdef poolingForward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnPoolingForward( - handle, poolingDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef poolingBackward( - intptr_t handle, size_t poolingDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnPoolingBackward( - handle, poolingDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - -############################################################################### -# Batch Normalization -############################################################################### - -CUDNN_BN_MIN_EPSILON = _CUDNN_BN_MIN_EPSILON - -cpdef deriveBNTensorDescriptor( - size_t derivedBnDesc, size_t xDesc, int mode): - status = cudnnDeriveBNTensorDescriptor( - derivedBnDesc, xDesc, - mode) - check_status(status) - - -cpdef batchNormalizationForwardTraining( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardTraining( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardInference( - intptr_t handle, int mode, - size_t alpha, size_t beta, size_t xDesc, - size_t x, size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, size_t bnScale, - size_t bnBias, size_t estimatedMean, size_t estimatedVariance, - double epsilon): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardInference( - handle, mode, - alpha, beta, xDesc, - x, yDesc, y, - bnScaleBiasMeanVarDesc, bnScale, - bnBias, estimatedMean, estimatedVariance, - epsilon) - check_status(status) - - -cpdef batchNormalizationBackward( - intptr_t handle, int mode, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, size_t dyDesc, - size_t dy, size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, size_t bnScale, - size_t dBnScaleResult, size_t dBnBiasResult, - double epsilon, size_t savedMean, size_t savedInvVariance): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationBackward( - handle, mode, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - dyDesc, dy, - dxDesc, dx, - dBnScaleBiasDesc, bnScale, - dBnScaleResult, dBnBiasResult, - epsilon, savedMean, savedInvVariance) - check_status(status) - - -cpdef batchNormalizationForwardTrainingEx( - intptr_t handle, int mode, int bnOps, - size_t alpha, size_t beta, - size_t xDesc, size_t x, - size_t zDesc, size_t z, - size_t yDesc, size_t y, - size_t bnScaleBiasMeanVarDesc, - size_t bnScale, size_t bnBias, - double exponentialAverageFactor, - size_t resultRunningMean, size_t resultRunningVariance, - double epsilon, size_t resultSaveMean, size_t resultSaveInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationForwardTrainingEx( - handle, mode, bnOps, - alpha, beta, - xDesc, x, - zDesc, z, - yDesc, y, - bnScaleBiasMeanVarDesc, - bnScale, bnBias, - exponentialAverageFactor, - resultRunningMean, resultRunningVariance, - epsilon, resultSaveMean, resultSaveInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationForwardTrainingExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t zDesc, - size_t yDesc, - size_t bnScaleBiasMeanVarDesc, - size_t activationDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - handle, - mode, bnOps, - xDesc, - zDesc, - yDesc, - bnScaleBiasMeanVarDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef batchNormalizationBackwardEx( - intptr_t handle, int mode, int bnops, - size_t alphaDataDiff, size_t betaDataDiff, - size_t alphaParamDiff, size_t betaParamDiff, - size_t xDesc, size_t x, - size_t yDesc, size_t y, - size_t dyDesc, size_t dy, - size_t dzDesc, size_t dz, - size_t dxDesc, size_t dx, - size_t dBnScaleBiasDesc, - size_t bnScaleData, size_t bnBiasData, - size_t dBnScaleData, size_t dBnBiasData, - double epsilon, - size_t savedMean, size_t savedInvVariance, - size_t activationDesc, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnBatchNormalizationBackwardEx( - handle, - mode, bnops, - alphaDataDiff, betaDataDiff, - alphaParamDiff, betaParamDiff, - xDesc, x, - yDesc, y, - dyDesc, dy, - dzDesc, dz, - dxDesc, dx, - dBnScaleBiasDesc, - bnScaleData, bnBiasData, - dBnScaleData, dBnBiasData, - epsilon, - savedMean, savedInvVariance, - activationDesc, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef size_t getBatchNormalizationBackwardExWorkspaceSize( - intptr_t handle, int mode, int bnOps, - size_t xDesc, - size_t yDesc, - size_t dyDesc, - size_t dzDesc, - size_t dxDesc, - size_t dBnScaleBiasDesc, - size_t activationDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationBackwardExWorkspaceSize( - handle, - mode, - bnOps, - xDesc, - yDesc, - dyDesc, - dzDesc, - dxDesc, - dBnScaleBiasDesc, - activationDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef size_t getBatchNormalizationTrainingExReserveSpaceSize( - intptr_t handle, int mode, int bnOps, - size_t activationDesc, - size_t xDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - handle, - mode, - bnOps, - activationDesc, - xDesc, - &sizeInBytes) - check_status(status) - return sizeInBytes - - -############################################################################### -# Activation -############################################################################### - -cpdef size_t createActivationDescriptor() except? 0: - cdef ActivationDescriptor activationDesc - status = cudnnCreateActivationDescriptor(&activationDesc) - check_status(status) - return activationDesc - - -cpdef setActivationDescriptor( - size_t activationDesc, int mode, int reluNanOpt, double reluCeiling): - status = cudnnSetActivationDescriptor( - activationDesc, mode, - reluNanOpt, reluCeiling) - check_status(status) - - -cpdef destroyActivationDescriptor(size_t activationDesc): - status = cudnnDestroyActivationDescriptor( - activationDesc) - check_status(status) - - -cpdef softmaxForward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnSoftmaxForward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - beta, dstDesc, dstData) - check_status(status) - - -cpdef softmaxBackward( - intptr_t handle, int algorithm, int mode, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, size_t beta, - size_t destDiffDesc, size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnSoftmaxBackward( - handle, algorithm, mode, - alpha, srcDesc, srcData, - srcDiffDesc, srcDiffData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -cpdef activationForward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t beta, size_t dstDesc, size_t dstData): - _setStream(handle) - with nogil: - status = cudnnActivationForward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, beta, - dstDesc, dstData) - check_status(status) - - -cpdef activationBackward_v4( - intptr_t handle, size_t activationDesc, size_t alpha, size_t srcDesc, - size_t srcData, size_t srcDiffDesc, size_t srcDiffData, - size_t destDesc, size_t destData, size_t beta, size_t destDiffDesc, - size_t destDiffData): - _setStream(handle) - with nogil: - status = cudnnActivationBackward_v4( - handle, activationDesc, alpha, - srcDesc, srcData, - srcDiffDesc, srcDiffData, - destDesc, destData, beta, - destDiffDesc, destDiffData) - check_status(status) - - -############################################################################### -# Dropout -############################################################################### - -cpdef size_t createDropoutDescriptor() except? 0: - cdef DropoutDescriptor desc - status = cudnnCreateDropoutDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyDropoutDescriptor(size_t dropoutDesc): - status = cudnnDestroyDropoutDescriptor(dropoutDesc) - check_status(status) - - -cpdef Py_ssize_t dropoutGetStatesSize(intptr_t handle) except? -1: - cdef size_t sizeInBytes - status = cudnnDropoutGetStatesSize( - handle, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef setDropoutDescriptor( - size_t dropoutDesc, intptr_t handle, float dropout, - size_t states, size_t stateSizeInBytes, unsigned long long seed): - status = cudnnSetDropoutDescriptor( - dropoutDesc, handle, dropout, - states, stateSizeInBytes, seed) - check_status(status) - - -cpdef size_t getDropoutReserveSpaceSize(size_t xDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnDropoutGetReserveSpaceSize( - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef dropoutForward( - intptr_t handle, size_t dropoutDesc, - size_t srcDesc, size_t srcData, - size_t dstDesc, size_t dstData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutForward( - handle, dropoutDesc, - srcDesc, srcData, - dstDesc, dstData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef dropoutBackward( - intptr_t handle, size_t dropoutDesc, - size_t dyDesc, size_t dyData, - size_t dxDesc, size_t dxData, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnDropoutBackward( - handle, dropoutDesc, - dyDesc, dyData, - dxDesc, dxData, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# CTC -############################################################################### -cpdef size_t createCTCLossDescriptor() except? 0: - cdef CTCLossDescriptor desc - status = cudnnCreateCTCLossDescriptor(&desc) - check_status(status) - return desc - -cpdef destroyCTCLossDescriptor(size_t ctcLossDesc): - status = cudnnDestroyCTCLossDescriptor(ctcLossDesc) - check_status(status) - -cpdef setCTCLossDescriptor(size_t ctcLossDesc, int dataType): - status = cudnnSetCTCLossDescriptor( - ctcLossDesc, dataType) - check_status(status) - -cpdef getCTCLossDescriptor(size_t ctcLossDesc): - cdef DataType compType - status = cudnnGetCTCLossDescriptor( - ctcLossDesc, &compType) - check_status(status) - return compType - -cpdef size_t getCTCLossWorkspaceSize( - intptr_t handle, size_t probsDesc, size_t gradientsDesc, - size_t labels, size_t labelLengths, size_t inputLengths, - int algo, size_t ctcLossDesc) except? 0: - cdef size_t sizeInBytes - status = cudnnGetCTCLossWorkspaceSize( - handle, probsDesc, - gradientsDesc, - labels, labelLengths, inputLengths, - algo, ctcLossDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - -cpdef CTCLoss( - intptr_t handle, size_t probsDesc, - size_t probs, size_t labels, size_t labelLengths, size_t inputLengths, - size_t costs, size_t gradientsDesc, size_t gradients, - int algo, size_t ctcLossDesc, - size_t workspace, size_t workSpaceSizeInBytes): - status = cudnnCTCLoss( - handle, probsDesc, probs, - labels, labelLengths, inputLengths, - costs, gradientsDesc, gradients, - algo, ctcLossDesc, - workspace, workSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# RNN -############################################################################### - -cpdef size_t createRNNDescriptor() except? 0: - cdef RNNDescriptor desc - status = cudnnCreateRNNDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDescriptor(size_t rnnDesc): - status = cudnnDestroyRNNDescriptor(rnnDesc) - check_status(status) - - -cpdef size_t createPersistentRNNPlan(size_t rnnDesc, int minibatch, - int dataType) except? 0: - cdef PersistentRNNPlan plan - status = cudnnCreatePersistentRNNPlan( - rnnDesc, - minibatch, dataType, &plan) - check_status(status) - return plan - - -cpdef setPersistentRNNPlan(size_t rnnDesc, size_t plan): - status = cudnnSetPersistentRNNPlan( - rnnDesc, plan) - check_status(status) - - -cpdef destroyPersistentRNNPlan(size_t plan): - status = cudnnDestroyPersistentRNNPlan(plan) - check_status(status) - - -cpdef setRNNDescriptor_v5( - size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int dataType): - status = cudnnSetRNNDescriptor_v5( - rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, dataType) - check_status(status) - - -cpdef setRNNDescriptor_v6( - intptr_t handle, size_t rnnDesc, int hiddenSize, int numLayers, - size_t dropoutDesc, int inputMode, int direction, int mode, - int algo, int dataType): - status = cudnnSetRNNDescriptor_v6( - handle, rnnDesc, hiddenSize, numLayers, - dropoutDesc, inputMode, - direction, mode, algo, - dataType) - check_status(status) - - -cpdef setRNNPaddingMode( - size_t rnnDesc, int paddingMode): - status = cudnnSetRNNPaddingMode( - rnnDesc, paddingMode) - check_status(status) - - -cpdef getRNNPaddingMode(size_t rnnDesc): - cdef RNNPaddingMode paddingMode - status = cudnnGetRNNPaddingMode( - rnnDesc, &paddingMode) - check_status(status) - return paddingMode - - -cpdef size_t createRNNDataDescriptor() except? 0: - cdef RNNDataDescriptor desc - status = cudnnCreateRNNDataDescriptor(&desc) - check_status(status) - return desc - - -cpdef destroyRNNDataDescriptor(size_t RNNDataDesc): - status = cudnnDestroyRNNDataDescriptor(RNNDataDesc) - check_status(status) - - -cpdef setRNNDataDescriptor( - size_t RNNDataDesc, int dataType, size_t layout, - int maxSeqLength, int batchSize, int vectorSize, - size_t seqLengthArray, size_t paddingFill): - status = cudnnSetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, vectorSize, - seqLengthArray, paddingFill) - check_status(status) - - -cpdef getRNNDataDescriptor( - size_t RNNDataDesc, size_t dataType, - size_t layout, size_t maxSeqLength, size_t batchSize, - size_t vectorSize, int arrayLengthRequested, size_t seqLengthArray, - size_t paddingFill): - status = cudnnGetRNNDataDescriptor( - RNNDataDesc, dataType, - layout, maxSeqLength, batchSize, - vectorSize, arrayLengthRequested, seqLengthArray, - paddingFill) - check_status(status) - - -cpdef getRNNWorkspaceSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - status = cudnnGetRNNWorkspaceSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNTrainingReserveSize( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc): - cdef size_t sizeInBytes - status = cudnnGetRNNTrainingReserveSize( - handle, rnnDesc, seqLength, - xDesc, &sizeInBytes) - check_status(status) - return sizeInBytes - - -cpdef getRNNParamsSize( - intptr_t handle, size_t rnnDesc, size_t xDesc, int dataType): - cdef size_t sizeInBytes - status = cudnnGetRNNParamsSize( - handle, rnnDesc, xDesc, - &sizeInBytes, dataType) - check_status(status) - return sizeInBytes - - -cpdef getRNNLinLayerMatrixParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerMatDesc, size_t linLayerMat): - status = cudnnGetRNNLinLayerMatrixParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerMatDesc, linLayerMat) - check_status(status) - - -cpdef getRNNLinLayerBiasParams( - intptr_t handle, size_t rnnDesc, int layer, size_t xDesc, size_t wDesc, - size_t w, int linLayerID, size_t linLayerBiasDesc, - size_t linLayerBias): - status = cudnnGetRNNLinLayerBiasParams( - handle, rnnDesc, layer, - xDesc, wDesc, w, - linLayerID, linLayerBiasDesc, linLayerBias) - check_status(status) - - -cpdef RNNForwardInference( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, - size_t x, size_t hxDesc, size_t hx, size_t cxDesc, - size_t cx, size_t wDesc, size_t w, size_t yDesc, - size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t workspace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardInference( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTraining( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t wDesc, size_t w, size_t yDesc, size_t y, - size_t hyDesc, size_t hy, size_t cyDesc, size_t cy, - size_t workspace, size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardTraining( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardData( - intptr_t handle, size_t rnnDesc, int seqLength, size_t yDesc, size_t y, - size_t dyDesc, size_t dy, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, - size_t hxDesc, size_t hx, size_t cxDesc, size_t cx, - size_t dxDesc, size_t dx, size_t dhxDesc, size_t dhx, - size_t dcxDesc, size_t dcx, size_t workspace, - size_t workSpaceSizeInBytes, size_t reserveSpace, - size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardData( - handle, rnnDesc, seqLength, - yDesc, y, - dyDesc, dy, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - workspace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeights( - intptr_t handle, size_t rnnDesc, int seqLength, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workspace, size_t workSpaceSizeInBytes, size_t dwDesc, - size_t dw, size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeights( - handle, rnnDesc, seqLength, - xDesc, x, - hxDesc, hx, - yDesc, y, - workspace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardInferenceEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardInferenceEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes) - check_status(status) - - -cpdef RNNForwardTrainingEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t wDesc, size_t w, - size_t yDesc, size_t y, size_t hyDesc, size_t hy, size_t cyDesc, - size_t cy, size_t kDesc, size_t keys, size_t cDesc, size_t cAttn, - size_t iDesc, size_t iAttn, size_t qDesc, size_t queries, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNForwardTrainingEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - cxDesc, cx, - wDesc, w, - yDesc, y, - hyDesc, hy, - cyDesc, cy, - kDesc, keys, - cDesc, cAttn, - iDesc, iAttn, - qDesc, queries, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardDataEx( - intptr_t handle, size_t rnnDesc, size_t yDesc, size_t y, size_t dyDesc, - size_t dy, size_t dcDesc, size_t dcAttn, size_t dhyDesc, size_t dhy, - size_t dcyDesc, size_t dcy, size_t wDesc, size_t w, size_t hxDesc, - size_t hx, size_t cxDesc, size_t cx, size_t dxDesc, size_t dx, - size_t dhxDesc, size_t dhx, size_t dcxDesc, size_t dcx, - size_t dkDesc, size_t dkeys, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardDataEx( - handle, rnnDesc, - yDesc, y, - dyDesc, dy, - dcDesc, dcAttn, - dhyDesc, dhy, - dcyDesc, dcy, - wDesc, w, - hxDesc, hx, - cxDesc, cx, - dxDesc, dx, - dhxDesc, dhx, - dcxDesc, dcx, - dkDesc, dkeys, - workSpace, workSpaceSizeInBytes, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -cpdef RNNBackwardWeightsEx( - intptr_t handle, size_t rnnDesc, size_t xDesc, size_t x, - size_t hxDesc, size_t hx, size_t yDesc, size_t y, - size_t workSpace, size_t workSpaceSizeInBytes, - size_t dwDesc, size_t dw, - size_t reserveSpace, size_t reserveSpaceSizeInBytes): - _setStream(handle) - with nogil: - status = cudnnRNNBackwardWeightsEx( - handle, rnnDesc, - xDesc, x, - hxDesc, hx, - yDesc, y, - workSpace, workSpaceSizeInBytes, - dwDesc, dw, - reserveSpace, reserveSpaceSizeInBytes) - check_status(status) - - -############################################################################### -# Spatial Transformer -############################################################################### - -cpdef size_t createSpatialTransformerDescriptor() except? 0: - cdef SpatialTransformerDescriptor stDesc - status = cudnnCreateSpatialTransformerDescriptor(&stDesc) - check_status(status) - return stDesc - - -cpdef destroySpatialTransformerDescriptor(size_t stDesc): - status = cudnnDestroySpatialTransformerDescriptor( - stDesc) - check_status(status) - - -cpdef setSpatialTransformerDescriptor( - size_t stDesc, size_t samplerType, int dataType, - int nbDims, size_t dimA): - status = cudnnSetSpatialTransformerNdDescriptor( - stDesc, samplerType, - dataType, nbDims, dimA) - check_status(status) - - -cpdef spatialTfGridGeneratorForward( - intptr_t handle, size_t stDesc, size_t theta, size_t grid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorForward( - handle, stDesc, - theta, grid) - check_status(status) - - -cpdef spatialTfGridGeneratorBackward( - intptr_t handle, size_t stDesc, size_t dgrid, size_t dtheta): - _setStream(handle) - with nogil: - status = cudnnSpatialTfGridGeneratorBackward( - handle, stDesc, - dgrid, dtheta) - check_status(status) - - -cpdef spatialTfSamplerForward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t grid, size_t beta, size_t yDesc, size_t y): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerForward( - handle, stDesc, - alpha, xDesc, x, grid, - beta, yDesc, y) - check_status(status) - - -cpdef spatialTfSamplerBackward( - intptr_t handle, size_t stDesc, size_t alpha, size_t xDesc, - size_t x, size_t beta, size_t dxDesc, size_t dx, size_t alphaDgrid, - size_t dyDesc, size_t dy, size_t grid, size_t betaDgrid, size_t dgrid): - _setStream(handle) - with nogil: - status = cudnnSpatialTfSamplerBackward( - handle, stDesc, - alpha, xDesc, x, beta, - dxDesc, dx, alphaDgrid, - dyDesc, dy, grid, - betaDgrid, dgrid) - check_status(status) - -############################################################################### -# Fused Ops -############################################################################### - -cpdef createFusedOpsConstParamPack(int ops): - cdef FusedOpsConstParamPack constPack - with nogil: - status = cudnnCreateFusedOpsConstParamPack(&constPack, ops) - check_status(status) - return constPack - -cpdef destroyFusedOpsConstParamPack(size_t constPack): - with nogil: - status = cudnnDestroyFusedOpsConstParamPack( - constPack) - check_status(status) - -cpdef setFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - with nogil: - status = cudnnSetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param) - check_status(status) - -cpdef getFusedOpsConstParamPackAttribute(size_t constPack, int paramLabel, - size_t param): - cdef int isNULL = 0 - with nogil: - status = cudnnGetFusedOpsConstParamPackAttribute( - constPack, - paramLabel, param, &isNULL) - check_status(status) - return isNULL - -cpdef createFusedOpsVariantParamPack(int ops): - cdef FusedOpsVariantParamPack varPack - with nogil: - status = cudnnCreateFusedOpsVariantParamPack(&varPack, ops) - check_status(status) - return varPack - -cpdef destroyFusedOpsVariantParamPack(size_t varPack): - with nogil: - status = cudnnDestroyFusedOpsVariantParamPack( - varPack) - check_status(status) - -cpdef setFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnSetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef getFusedOpsVariantParamPackAttribute(size_t varPack, int paramLabel, - size_t ptr): - with nogil: - status = cudnnGetFusedOpsVariantParamPackAttribute( - varPack, - paramLabel, ptr) - check_status(status) - -cpdef createFusedOpsPlan(int ops): - cdef FusedOpsPlan plan - with nogil: - status = cudnnCreateFusedOpsPlan(&plan, ops) - check_status(status) - return plan - -cpdef destroyFusedOpsPlan(size_t plan): - with nogil: - status = cudnnDestroyFusedOpsPlan(plan) - check_status(status) - -cpdef makeFusedOpsPlan(intptr_t handle, size_t plan, size_t constPack): - cdef size_t workspaceSizeInBytes - _setStream(handle) - with nogil: - status = cudnnMakeFusedOpsPlan(handle, plan, - constPack, - &workspaceSizeInBytes) - check_status(status) - return workspaceSizeInBytes - -cpdef fusedOpsExecute(intptr_t handle, size_t plan, size_t varPack): - _setStream(handle) - with nogil: - status = cudnnFusedOpsExecute(handle, plan, - varPack) - check_status(status) +""" diff --git a/cupy_backends/cupy_cudnn.h b/cupy_backends/cupy_cudnn.h index a514f63d200..25f7e829f71 100644 --- a/cupy_backends/cupy_cudnn.h +++ b/cupy_backends/cupy_cudnn.h @@ -2,8 +2,11 @@ #ifndef INCLUDE_GUARD_CUPY_CUDNN_H #define INCLUDE_GUARD_CUPY_CUDNN_H +#if CUPY_USE_HIP -#ifndef CUPY_NO_CUDA +#include "miopen/miopen.h" + +#elif !defined(CUPY_NO_CUDA) #include @@ -12,13 +15,8 @@ #include "stub/cupy_cuda_common.h" #include "stub/cupy_cudnn.h" -#else - -#include "hip/cupy_hip_common.h" -#include "stub/cupy_cudnn.h" - -#endif // #ifdef CUPY_NO_CUDA +#endif // #ifdef CUPY_USE_HIP /////////////////////////////////////////////////////////////////////////////// // Definitions are for compatibility with cuDNN v5 and v6. @@ -26,7 +24,7 @@ extern "C" { -#if defined(CUPY_NO_CUDA) || (CUDNN_VERSION < 6000) +#if (defined(CUPY_NO_CUDA) || (CUDNN_VERSION < 6000)) && !CUPY_USE_HIP typedef enum {} cudnnRNNAlgo_t; typedef enum {} cudnnReduceTensorOp_t; @@ -85,7 +83,7 @@ cudnnStatus_t cudnnReduceTensor(...) { #endif // #if defined(CUPY_NO_CUDA) || (CUDNN_VERSION < 6000) -#if !defined(CUPY_NO_CUDA) +#if !defined(CUPY_NO_CUDA) && !CUPY_USE_HIP // Some functions are renamed in cuDNN v5. // Following definitions are for compatibility with cuDNN v5 and higher. @@ -96,14 +94,14 @@ cudnnStatus_t cudnnReduceTensor(...) { #endif // #if !defined(CUPY_NO_CUDA) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 6000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 6000) && !CUPY_USE_HIP #define cudnnSetConvolution2dDescriptor_v4 cudnnSetConvolution2dDescriptor #endif // #if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 6000) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION >= 6000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION >= 6000) && !CUPY_USE_HIP // Some functions are renamed in cuDNN v6. // Following definitions are for compatibility with cuDNN v6 and higher. @@ -118,7 +116,7 @@ cudnnStatus_t cudnnReduceTensor(...) { #endif // #if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION >= 6000) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7000) && !CUPY_USE_HIP #define cudnnSetRNNDescriptor_v5 cudnnSetRNNDescriptor @@ -189,14 +187,14 @@ cudnnStatus_t cudnnSetConvolution2dDescriptor_v4(...) { #endif // #if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION >= 7000) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 6000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 6000) && !CUPY_USE_HIP typedef enum {} cudnnDeterminism_t; #endif // #if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 6000) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7000) && !CUPY_USE_HIP cudnnStatus_t cudnnFindConvolutionForwardAlgorithmEx_v7(...) { return CUDNN_STATUS_NOT_SUPPORTED; @@ -254,7 +252,7 @@ typedef struct { #endif // #if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION >= 7000) -#if defined(CUPY_NO_CUDA) || (CUDNN_VERSION < 7200) +#if defined(CUPY_NO_CUDA) || (CUDNN_VERSION < 7200) && !CUPY_USE_HIP typedef void* cudnnRNNDataDescriptor_t; @@ -303,7 +301,7 @@ cudnnStatus_t cudnnRNNBackwardWeightsEx(...) { #endif // defined(CUPY_NO_CUDA) || (CUDNN_VERSION < 7200) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 8000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 8000) && !CUPY_USE_HIP // TODO: check function names when cuDNN 8 is released. #define cudnnGetConvolutionForwardAlgorithm_v6 cudnnGetConvolutionForwardAlgorithm @@ -313,7 +311,7 @@ cudnnStatus_t cudnnRNNBackwardWeightsEx(...) { #endif // #if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 8000) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7000) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7000) && !CUPY_USE_HIP typedef enum {} cudnnErrQueryMode_t; typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t; @@ -324,7 +322,7 @@ cudnnStatus_t cudnnQueryRuntimeError(...) { #endif // !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7000) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7400) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7400) && !CUPY_USE_HIP typedef enum {} cudnnBatchNormOps_t; @@ -350,7 +348,7 @@ cudnnStatus_t cudnnGetBatchNormalizationTrainingExReserveSpaceSize(...) { #endif // !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7400) -#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7600) +#if !defined(CUPY_NO_CUDA) && (CUDNN_VERSION < 7600) && !CUPY_USE_HIP // fused ops typedef void* cudnnFusedOpsConstParamPack_t; diff --git a/cupyx/cudnn.pyx b/cupyx/cudnn.pyx index fcfb9e98c10..5894c18cdd7 100644 --- a/cupyx/cudnn.pyx +++ b/cupyx/cudnn.pyx @@ -128,7 +128,6 @@ cpdef int _get_byte_size(int data_type) except -1: else: raise TypeError('Invalid cuDNN data type: {}'.format(data_type)) - cpdef _create_tensor_nd_descriptor( size_t desc, _ndarray_base arr, int data_type=-1): cdef vector.vector[int] c_shape, c_strides @@ -183,7 +182,6 @@ cpdef _create_tensor_descriptor_as4darray(size_t desc, cudnn.setTensor4dDescriptor(desc, cudnn.CUDNN_TENSOR_NCHW, data_type, dim1, dim2, 1, 1) - cpdef _create_filter_descriptor( size_t desc, _ndarray_base arr, int format=cudnn.CUDNN_TENSOR_NCHW): cdef vector.vector[int] c_shape @@ -255,19 +253,6 @@ cpdef _create_convolution_descriptor( elif groups > 1: raise ValueError('groups must be one when cuDNN < 7.0') - -cpdef _ndarray_base _ascontiguousarray_normalized_strides(_ndarray_base a): - cdef _ndarray_base newarray - - if a._c_contiguous: - newarray = a.view() - newarray._set_contiguous_strides(newarray.itemsize, True) - else: - newarray = _core.ndarray(a.shape, a.dtype) - _elementwise_copy(a, newarray) - return newarray - - def create_tensor_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): desc = Descriptor(cudnn.createTensorDescriptor(), _py_cudnn.destroyTensorDescriptor) @@ -325,297 +310,6 @@ def create_convolution_descriptor(pad, stride, dtype, dtype, mode, use_tensor_core) return desc - -cdef _create_pooling_descriptor( - size_t desc, tuple ksize, tuple stride, tuple pad, int mode): - cdef vector.vector[int] c_ksize, c_pad, c_stride - cdef int ndim = len(ksize) - if ndim != len(stride) or ndim != len(pad): - raise ValueError('ksize, stride, and pad must be of same length') - if ndim == 2: - cudnn.setPooling2dDescriptor_v4( - desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ksize[0], - ksize[1], pad[0], pad[1], stride[0], stride[1]) - else: - c_ksize = ksize - c_pad = pad - c_stride = stride - cudnn.setPoolingNdDescriptor_v4( - desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ndim, - c_ksize.data(), c_pad.data(), - c_stride.data()) - - return desc - - -def create_pooling_descriptor(ksize, stride, pad, int mode): - desc = Descriptor(cudnn.createPoolingDescriptor(), - _py_cudnn.destroyPoolingDescriptor) - _create_pooling_descriptor(desc.value, ksize, stride, pad, mode) - return desc - - -cdef Descriptor _create_rnn_data_descriptor(): - return Descriptor(cudnn.createRNNDataDescriptor(), - _py_cudnn.destroyRNNDataDescriptor) - - -cdef Descriptor _make_unpacked_rnn_data_descriptor(_ndarray_base xs, lengths): - cdef Descriptor descriptor = _create_rnn_data_descriptor() - cdef int data_type = get_data_type(xs.dtype) - cdef Py_ssize_t max_length, batch, n_dim - max_length, batch, n_dim = xs.shape - cudnn.setRNNDataDescriptor( - descriptor.value, data_type, - cudnn.CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, - max_length, batch, n_dim, - lengths.ctypes.data, 0) - return descriptor - - -def rnn_forward_inference_ex( - DropoutStates states, int direction_mode, int rnn_mode, - _ndarray_base hx, _ndarray_base cx, _ndarray_base w, - _ndarray_base xs, lengths): - hx = core._internal_ascontiguousarray(hx) - if cx is not None: - cx = core._internal_ascontiguousarray(cx) - w = core._internal_ascontiguousarray(w) - xs = core._internal_ascontiguousarray(xs) - - cdef int length = xs._shape[0] - cdef int n_layers = _get_n_layers(direction_mode, hx) - cdef int n_units = hx._shape[2] - - cdef _ndarray_base ys = _make_rnn_result_array(direction_mode, n_units, xs) - cdef _ndarray_base hy = _core.ndarray(hx.shape, hx.dtype) - if cx is None: - cx = _core.ndarray(0, dtype=xs.dtype) - cdef _ndarray_base cy = _core.ndarray(cx.shape, cx.dtype) - - cdef size_t handle = get_handle() - - cdef Descriptor rnn_desc = create_rnn_descriptor( - n_units, n_layers, states._desc, - cudnn.CUDNN_LINEAR_INPUT, direction_mode, - rnn_mode, get_data_type(xs.dtype)) - cudnn.setRNNPaddingMode( - rnn_desc.value, cudnn.CUDNN_RNN_PADDED_IO_ENABLED) - - cdef Descriptor x_data_desc = _make_unpacked_rnn_data_descriptor( - xs, lengths) - cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) - cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) - cdef Descriptor w_desc = create_filter_descriptor(w) - cdef Descriptor y_data_desc = _make_unpacked_rnn_data_descriptor( - ys, lengths) - cdef Descriptor hy_desc = create_tensor_nd_descriptor(hy) - cdef Descriptor cy_desc = create_tensor_nd_descriptor(cy) - - cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array_for_padded( - xs) - cdef _memory.MemoryPointer workspace = _make_rnn_workspace( - rnn_desc, length, xs_descs) - - cudnn.RNNForwardInferenceEx( - handle, rnn_desc.value, - x_data_desc.value, xs.data.ptr, - hx_desc.value, hx.data.ptr, - cx_desc.value, cx.data.ptr, - w_desc.value, w.data.ptr, - y_data_desc.value, ys.data.ptr, - hy_desc.value, hy.data.ptr, - cy_desc.value, cy.data.ptr, - 0, 0, 0, 0, 0, 0, 0, 0, - workspace.ptr, workspace.mem.size) - - return hy, cy, ys - - -def rnn_forward_training_ex( - DropoutStates states, int direction_mode, int rnn_mode, - _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, - lengths): - hx = core._internal_ascontiguousarray(hx) - if cx is not None: - cx = core._internal_ascontiguousarray(cx) - w = core._internal_ascontiguousarray(w) - xs = core._internal_ascontiguousarray(xs) - - cdef int length = xs._shape[0] - cdef int n_layers = _get_n_layers(direction_mode, hx) - cdef int n_units = hx._shape[2] - - cdef size_t handle = get_handle() - - cdef Descriptor rnn_desc = create_rnn_descriptor( - n_units, n_layers, states._desc, - cudnn.CUDNN_LINEAR_INPUT, direction_mode, - rnn_mode, get_data_type(xs.dtype)) - cudnn.setRNNPaddingMode( - rnn_desc.value, cudnn.CUDNN_RNN_PADDED_IO_ENABLED) - - cdef _ndarray_base ys = _make_rnn_result_array(direction_mode, n_units, xs) - cdef _ndarray_base hy = _core.ndarray(hx.shape, hx.dtype) - if cx is None: - cx = _core.ndarray(0, dtype=xs.dtype) - cdef _ndarray_base cy = _core.ndarray(cx.shape, cx.dtype) - - cdef Descriptor x_data_desc = _make_unpacked_rnn_data_descriptor( - xs, lengths) - cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) - cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) - cdef Descriptor w_desc = create_filter_descriptor(w) - cdef Descriptor y_data_desc = _make_unpacked_rnn_data_descriptor( - ys, lengths) - cdef Descriptor hy_desc = create_tensor_nd_descriptor(hy) - cdef Descriptor cy_desc = create_tensor_nd_descriptor(cy) - - cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array_for_padded( - xs) - cdef _memory.MemoryPointer workspace = _make_rnn_workspace( - rnn_desc, length, xs_descs) - cdef _memory.MemoryPointer reserve_space = _make_rnn_reserve_space( - rnn_desc, length, xs_descs) - - cudnn.RNNForwardTrainingEx( - handle, rnn_desc.value, - x_data_desc.value, xs.data.ptr, - hx_desc.value, hx.data.ptr, - cx_desc.value, cx.data.ptr, - w_desc.value, w.data.ptr, - y_data_desc.value, ys.data.ptr, - hy_desc.value, hy.data.ptr, - cy_desc.value, cy.data.ptr, - 0, 0, 0, 0, 0, 0, 0, 0, - workspace.ptr, workspace.mem.size, - reserve_space.ptr, reserve_space.mem.size) - - return reserve_space, hy, cy, ys - - -def rnn_backward_data_ex( - DropoutStates states, int direction_mode, int rnn_mode, - _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, - _ndarray_base ys, _memory.MemoryPointer reserve_space, - _ndarray_base dhy, _ndarray_base dcy, _ndarray_base dys, - lengths): - hx = core._internal_ascontiguousarray(hx) - if cx is not None: - cx = core._internal_ascontiguousarray(cx) - w = core._internal_ascontiguousarray(w) - xs = core._internal_ascontiguousarray(xs) - ys = core._internal_ascontiguousarray(ys) - dhy = core._internal_ascontiguousarray(dhy) - if dcy is not None: - dcy = core._internal_ascontiguousarray(dcy) - dys = core._internal_ascontiguousarray(dys) - - cdef int length = xs._shape[0] - cdef int n_layers = _get_n_layers(direction_mode, hx) - cdef int n_units = hx._shape[2] - - cdef size_t handle = get_handle() - cdef Descriptor rnn_desc = create_rnn_descriptor( - n_units, n_layers, states._desc, - cudnn.CUDNN_LINEAR_INPUT, direction_mode, - rnn_mode, get_data_type(xs.dtype)) - cudnn.setRNNPaddingMode( - rnn_desc.value, cudnn.CUDNN_RNN_PADDED_IO_ENABLED) - - cdef _ndarray_base dxs = _core.ndarray(xs.shape, xs.dtype) - cdef _ndarray_base dhx = _core.ndarray(hx.shape, hx.dtype) - if cx is None: - cx = dcy = _core.ndarray(0, dtype=xs.dtype) - cdef _ndarray_base dcx = _core.ndarray(cx.shape, cx.dtype) - - cdef Descriptor y_data_desc = _make_unpacked_rnn_data_descriptor( - ys, lengths) - cdef Descriptor dy_data_desc = _make_unpacked_rnn_data_descriptor( - dys, lengths) - cdef Descriptor dhy_desc = create_tensor_nd_descriptor(dhy) - cdef Descriptor dcy_desc = create_tensor_nd_descriptor(dcy) - cdef Descriptor w_desc = create_filter_descriptor(w) - cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) - cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) - cdef Descriptor dx_data_desc = _make_unpacked_rnn_data_descriptor( - dxs, lengths) - cdef Descriptor dhx_desc = create_tensor_nd_descriptor(dhx) - cdef Descriptor dcx_desc = create_tensor_nd_descriptor(dcx) - - cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array_for_padded( - xs) - cdef _memory.MemoryPointer workspace = _make_rnn_workspace( - rnn_desc, length, xs_descs) - - cudnn.RNNBackwardDataEx( - handle, rnn_desc.value, - y_data_desc.value, ys.data.ptr, - dy_data_desc.value, dys.data.ptr, - 0, 0, - dhy_desc.value, dhy.data.ptr, - dcy_desc.value, dcy.data.ptr, - w_desc.value, w.data.ptr, - hx_desc.value, hx.data.ptr, - cx_desc.value, cx.data.ptr, - dx_data_desc.value, dxs.data.ptr, - dhx_desc.value, dhx.data.ptr, - dcx_desc.value, dcx.data.ptr, - 0, 0, - workspace.ptr, workspace.mem.size, - reserve_space.ptr, reserve_space.mem.size) - - return dhx, dcx, dxs - - -def rnn_backward_weights_ex( - DropoutStates states, int direction_mode, int rnn_mode, - _ndarray_base xs, _ndarray_base hx, _ndarray_base ys, - _ndarray_base w, - _memory.MemoryPointer reserve_space, lengths): - xs = core._internal_ascontiguousarray(xs) - hx = core._internal_ascontiguousarray(hx) - ys = core._internal_ascontiguousarray(ys) - w = core._internal_ascontiguousarray(w) - - cdef int length = xs._shape[0] - cdef int n_layers = _get_n_layers(direction_mode, hx) - cdef int n_units = hx._shape[2] - - cdef size_t handle = get_handle() - cdef Descriptor rnn_desc = create_rnn_descriptor( - n_units, n_layers, states._desc, - cudnn.CUDNN_LINEAR_INPUT, direction_mode, - rnn_mode, get_data_type(xs.dtype)) - cudnn.setRNNPaddingMode( - rnn_desc.value, cudnn.CUDNN_RNN_PADDED_IO_ENABLED) - - cdef Descriptor x_data_desc = _make_unpacked_rnn_data_descriptor( - xs, lengths) - cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) - cdef Descriptor y_data_desc = _make_unpacked_rnn_data_descriptor( - ys, lengths) - - cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array_for_padded( - xs) - cdef _memory.MemoryPointer workspace = _make_rnn_workspace( - rnn_desc, length, xs_descs) - - cdef _ndarray_base dw = _core.ndarray(w.shape, w.dtype) - dw.fill(0) - cdef Descriptor dw_desc = create_filter_descriptor(dw) - - cudnn.RNNBackwardWeightsEx( - handle, rnn_desc.value, - x_data_desc.value, xs.data.ptr, - hx_desc.value, hx.data.ptr, - y_data_desc.value, ys.data.ptr, - workspace.ptr, workspace.mem.size, - dw_desc.value, dw.data.ptr, - reserve_space.ptr, reserve_space.mem.size) - return dw - - def create_activation_descriptor(mode, nan_prop_mode=cudnn.CUDNN_PROPAGATE_NAN, coef=0.0): desc = Descriptor(cudnn.createActivationDescriptor(), @@ -689,1925 +383,2454 @@ def activation_backward(_ndarray_base x, _ndarray_base y, _ndarray_base gy, cudnn.destroyTensorDescriptor(desc) return gx - -cdef int _create_tensor_descriptor_for_softmax( - size_t desc, _ndarray_base arr, int axis) except?-1: - cdef Py_ssize_t left, center, right - assert arr._c_contiguous - data_type = get_data_type(arr.dtype) - if axis < 0: - axis += arr._shape.size() - left = 1 - for i in range(0, axis): - left *= arr._shape[i] - center = arr._shape[axis] - right = 1 - for i in range(axis + 1, arr._shape.size()): - right *= arr._shape[i] - cudnn.setTensor4dDescriptor(desc, cudnn.CUDNN_TENSOR_NCHW, data_type, - left, center, right, 1) - if center == 1 and right == 1: - return cudnn.CUDNN_SOFTMAX_MODE_INSTANCE - else: - return cudnn.CUDNN_SOFTMAX_MODE_CHANNEL - - -def softmax_forward(_ndarray_base x, int axis, int algorithm): - cdef float float_zero = 0, float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero, one - cdef _ndarray_base y - if x.dtype == 'd': - zero = &double_zero - one = &double_one - else: - zero = &float_zero - one = &float_one - - x = core._internal_ascontiguousarray(x) - y = _core.ndarray(x._shape, x.dtype) - - handle = get_handle() - desc = cudnn.createTensorDescriptor() - try: - cudnn_mode = _create_tensor_descriptor_for_softmax(desc, x, axis) - cudnn.softmaxForward( - handle, algorithm, cudnn_mode, - one, desc, x.data.ptr, zero, desc, y.data.ptr) - finally: - cudnn.destroyTensorDescriptor(desc) - return y - - -def softmax_backward( - _ndarray_base y, _ndarray_base gy, int axis, int algorithm): - cdef float float_zero = 0, float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero, one - cdef _ndarray_base gx - if y.dtype == 'd': - zero = &double_zero - one = &double_one - else: - zero = &float_zero - one = &float_one - - gx = _core.ndarray(y._shape, y.dtype) - y = core._internal_ascontiguousarray(y) - gy = core._internal_ascontiguousarray(gy) - - handle = get_handle() - desc = cudnn.createTensorDescriptor() - try: - cudnn_mode = _create_tensor_descriptor_for_softmax(desc, y, axis) - cudnn.softmaxBackward( - handle, algorithm, cudnn_mode, - one, desc, y.data.ptr, desc, gy.data.ptr, zero, desc, gx.data.ptr) - finally: - cudnn.destroyTensorDescriptor(desc) - return gx - - -def create_dropout_descriptor( - handle, dropout, states, state_size_in_bytes, seed): - desc = Descriptor(cudnn.createDropoutDescriptor(), - _py_cudnn.destroyDropoutDescriptor) - cudnn.setDropoutDescriptor(desc.value, handle, dropout, - states, state_size_in_bytes, seed) - return desc - - -def set_dropout_descriptor(desc, handle, dropout): - # When the fourth argument is NULL, random state is not updated. - cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0) - - -def _create_ctc_loss_descriptor(data_type): - desc = Descriptor(cudnn.createCTCLossDescriptor(), - _py_cudnn.destroyCTCLossDescriptor) - cudnn.setCTCLossDescriptor(desc.value, data_type) - return desc - - -def ctc_loss(_ndarray_base probs, labels, - label_length, input_length, int algo): - batch_size = probs.shape[1] - labels_ptr = labels.ctypes.data - label_length_ptr = label_length.ctypes.data - input_length_ptr = input_length.ctypes.data - handle = get_handle() - data_type = get_data_type(probs.dtype) - ctc_desc = Descriptor(cudnn.createCTCLossDescriptor(), - _py_cudnn.destroyCTCLossDescriptor) - cudnn.setCTCLossDescriptor(ctc_desc.value, data_type) - - gradients = _core.ndarray(probs._shape, probs.dtype) - loss = _core.ndarray((batch_size, ), 'f') - probs_desc = create_tensor_descriptor(probs) - gradients_desc = create_tensor_descriptor(gradients) - - work_size = cudnn.getCTCLossWorkspaceSize( - handle, probs_desc.value, gradients_desc.value, - labels_ptr, label_length_ptr, - input_length_ptr, algo, ctc_desc.value) - workspace = _core.ndarray((work_size,), 'b') - - cudnn.CTCLoss(handle, probs_desc.value, probs.data.ptr, - labels_ptr, label_length_ptr, - input_length_ptr, loss.data.ptr, gradients_desc.value, - gradients.data.ptr, algo, ctc_desc.value, - workspace.data.ptr, work_size) - return loss, gradients - - -def create_rnn_descriptor(hidden_size, num_layers, dropout_desc, - input_mode, direction, mode, data_type, algo=None): - desc = Descriptor(cudnn.createRNNDescriptor(), - _py_cudnn.destroyRNNDescriptor) - if cudnn_version() >= 6000: - _handle = get_handle() - if algo is None: - algo = cudnn.CUDNN_RNN_ALGO_STANDARD - cudnn.setRNNDescriptor_v6( - _handle, desc.value, hidden_size, num_layers, dropout_desc.value, - input_mode, direction, mode, algo, data_type) - else: - cudnn.setRNNDescriptor_v5( - desc.value, hidden_size, num_layers, dropout_desc.value, - input_mode, direction, mode, data_type) - return desc - - -def get_rnn_lin_layer_matrix_params( - handle, rnn_desc, layer, x_desc, w_desc, _ndarray_base w, - lin_layer_id): - cdef size_t ptr = 0 - mat_desc = cudnn.createFilterDescriptor() - try: - cudnn.getRNNLinLayerMatrixParams( - handle, rnn_desc.value, layer, x_desc.value, w_desc.value, - w.data.ptr, lin_layer_id, mat_desc, &ptr) - data_type, _, _, dim = cudnn.getFilterNdDescriptor(mat_desc, 3) - finally: - cudnn.destroyFilterDescriptor(mat_desc) - byte_size = _get_byte_size(data_type) - offset = (ptr - w.data.ptr) // byte_size - size = internal.prod_sequence(dim) - mat = w[offset:offset + size] - return mat - - -def get_rnn_lin_layer_bias_params( - handle, rnn_desc, layer, x_desc, w_desc, _ndarray_base w, - lin_layer_id): - cdef size_t ptr = 0 - bias_desc = cudnn.createFilterDescriptor() - try: - cudnn.getRNNLinLayerBiasParams( - handle, rnn_desc.value, layer, x_desc.value, w_desc.value, - w.data.ptr, lin_layer_id, bias_desc, &ptr) - data_type, _, _, dim = cudnn.getFilterNdDescriptor(bias_desc, 3) - finally: - cudnn.destroyFilterDescriptor(bias_desc) - byte_size = _get_byte_size(data_type) - offset = (ptr - w.data.ptr) // byte_size - size = internal.prod_sequence(dim) - bias = w[offset:offset + size] - return bias - - -cdef class _DescriptorArray: - - cdef: - vector.vector[size_t] _value - object _destroy - - def __init__(self, destroyer): - self._destroy = destroyer - - def __dealloc__(self): - for desc in self._value: - self._destroy(desc) - - def append(self, desc): - self._value.push_back(desc) - - @property - def data(self): - return self._value.data() - - -cdef _DescriptorArray _make_tensor_descriptor_array(xs, lengths): - """Make an array of pointers denoting pointers of tensor descriptors. - - """ - cdef _DescriptorArray descs = _DescriptorArray( - _py_cudnn.destroyTensorDescriptor) - cdef size_t desc - cdef int data_type = get_data_type(xs.dtype) - cdef vector.vector[int] c_shape, c_strides - cdef Py_ssize_t itemsize = xs.itemsize - cdef Py_ssize_t s - cdef int length - - # RNN APIs assumes ndim == 3. - for s in xs._strides: - c_strides.push_back(s // itemsize) - for _ in range(3 - len(xs._strides)): - c_strides.push_back(1) - for s in xs._shape: - c_shape.push_back(s) - for _ in range(3 - len(xs._strides)): - c_shape.push_back(1) - - for length in lengths: - c_shape[0] = length - desc = cudnn.createTensorDescriptor() - descs.append(desc) - cudnn.setTensorNdDescriptor( - desc, data_type, 3, - c_shape.data(), c_strides.data()) - - return descs - - -cdef _DescriptorArray _make_tensor_descriptor_array_for_padded(xs): - assert xs.ndim == 3 - - cdef _DescriptorArray descs = _DescriptorArray( - _py_cudnn.destroyTensorDescriptor) - cdef size_t desc - cdef int data_type = get_data_type(xs.dtype) - cdef Py_ssize_t itemsize = xs.itemsize - - # RNN APIs assumes ndim == 3. - cdef vector.vector[int] c_shape = [xs._shape[1], xs._shape[2], 1] - cdef vector.vector[int] c_strides = [ - xs._strides[1] // itemsize, xs._strides[2] // itemsize, 1] - - for _ in range(xs._shape[0]): - desc = cudnn.createTensorDescriptor() - descs.append(desc) +IF CUPY_HIP_VERSION == 0: + cpdef _create_tensor_nd_descriptor( + size_t desc, _ndarray_base arr, int data_type=-1): + cdef vector.vector[int] c_shape, c_strides + cdef Py_ssize_t itemsize, s + cdef int next_stride, i + if data_type == -1: # `-1` is used instead of `None` + data_type = get_data_type(arr.dtype) + itemsize = arr.itemsize + for s in arr._strides: + c_strides.push_back(s // itemsize) + for s in arr._shape: + c_shape.push_back(s) + # Use "c-contiguous stride" with the next axis, if ambiguous + next_stride = 1 + for i in reversed(range(c_shape.size())): + if c_shape[i] <= 1: + c_strides[i] = next_stride + else: + next_stride = c_shape[i] * c_strides[i] + cudnn.setTensorNdDescriptor( - desc, data_type, 3, - c_shape.data(), c_strides.data()) - - return descs - - -cdef _memory.MemoryPointer _make_rnn_workspace( - Descriptor rnn_desc, int length, _DescriptorArray descs): - cdef size_t handle = get_handle() - cdef size_t work_size = cudnn.getRNNWorkspaceSize( - handle, rnn_desc.value, length, descs.data) - return _memory.alloc(work_size) - - -cdef _memory.MemoryPointer _make_rnn_reserve_space( - Descriptor rnn_desc, int length, _DescriptorArray descs): - cdef size_t handle = get_handle() - cdef size_t reserve_size = cudnn.getRNNTrainingReserveSize( - handle, rnn_desc.value, length, descs.data) - return _memory.alloc(reserve_size) - - -cdef Py_ssize_t _get_n_layers(int direction_mode, _ndarray_base hx): - if direction_mode == cudnn.CUDNN_BIDIRECTIONAL: - return hx._shape[0] // 2 - else: # cudnn.CUDNN_UNIDIRECTIONAL - return hx._shape[0] - - -cdef _ndarray_base _make_rnn_result_array( - int direction_mode, Py_ssize_t n_units, _ndarray_base xs): - cdef int output_units - if direction_mode == cudnn.CUDNN_BIDIRECTIONAL: - output_units = n_units * 2 - else: # cudnn.CUDNN_UNIDIRECTIONAL - output_units = n_units - - shape = xs.shape[:-1] + (output_units,) - return _core.ndarray(shape, dtype=xs.dtype) - - -def rnn_forward_inference( - DropoutStates states, int direction_mode, int rnn_mode, - _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, - lengths): - hx = core._internal_ascontiguousarray(hx) - if cx is not None: - cx = core._internal_ascontiguousarray(cx) - w = core._internal_ascontiguousarray(w) - xs = core._internal_ascontiguousarray(xs) - - cdef int length = len(lengths) - cdef int n_layers = _get_n_layers(direction_mode, hx) - cdef int n_units = hx.shape[2] - - cdef size_t handle = get_handle() - - cdef Descriptor rnn_desc = create_rnn_descriptor( - n_units, n_layers, states._desc, - cudnn.CUDNN_LINEAR_INPUT, direction_mode, - rnn_mode, get_data_type(xs.dtype)) - - cdef _ndarray_base ys = _make_rnn_result_array(direction_mode, n_units, xs) - cdef _ndarray_base hy = _core.ndarray(hx.shape, hx.dtype) - if cx is None: - cx = _core.ndarray(0, dtype=xs.dtype) - cdef _ndarray_base cy = _core.ndarray(cx.shape, cx.dtype) - - cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array(xs, lengths) - cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) - cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) - cdef Descriptor w_desc = create_filter_descriptor(w) - cdef _DescriptorArray ys_descs = _make_tensor_descriptor_array(ys, lengths) - cdef Descriptor hy_desc = create_tensor_nd_descriptor(hy) - cdef Descriptor cy_desc = create_tensor_nd_descriptor(cy) - - cdef _memory.MemoryPointer workspace = _make_rnn_workspace( - rnn_desc, length, xs_descs) - - cudnn.RNNForwardInference( - handle, rnn_desc.value, length, - xs_descs.data, xs.data.ptr, hx_desc.value, hx.data.ptr, - cx_desc.value, cx.data.ptr, w_desc.value, w.data.ptr, - ys_descs.data, ys.data.ptr, hy_desc.value, hy.data.ptr, - cy_desc.value, cy.data.ptr, workspace.ptr, workspace.mem.size) - - return hy, cy, ys - - -def rnn_forward_training( - DropoutStates states, int direction_mode, int rnn_mode, - _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, - lengths): - hx = core._internal_ascontiguousarray(hx) - if cx is not None: - cx = core._internal_ascontiguousarray(cx) - w = core._internal_ascontiguousarray(w) - xs = core._internal_ascontiguousarray(xs) - - cdef int length = len(lengths) - cdef int n_layers = _get_n_layers(direction_mode, hx) - cdef int n_units = hx.shape[2] - - cdef size_t handle = get_handle() - - cdef Descriptor rnn_desc = create_rnn_descriptor( - n_units, n_layers, states._desc, - cudnn.CUDNN_LINEAR_INPUT, direction_mode, - rnn_mode, get_data_type(xs.dtype)) - - cdef _ndarray_base ys = _make_rnn_result_array(direction_mode, n_units, xs) - cdef _ndarray_base hy = _core.ndarray(hx.shape, hx.dtype) - if cx is None: - cx = _core.ndarray(0, dtype=xs.dtype) - cdef _ndarray_base cy = _core.ndarray(cx.shape, cx.dtype) - - cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array(xs, lengths) - cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) - cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) - cdef Descriptor w_desc = create_filter_descriptor(w) - cdef _DescriptorArray ys_descs = _make_tensor_descriptor_array(ys, lengths) - cdef Descriptor hy_desc = create_tensor_nd_descriptor(hy) - cdef Descriptor cy_desc = create_tensor_nd_descriptor(cy) - - cdef _memory.MemoryPointer workspace = _make_rnn_workspace( - rnn_desc, length, xs_descs) - cdef _memory.MemoryPointer reserve_space = _make_rnn_reserve_space( - rnn_desc, length, xs_descs) - - cudnn.RNNForwardTraining( - handle, rnn_desc.value, length, - xs_descs.data, xs.data.ptr, hx_desc.value, hx.data.ptr, - cx_desc.value, cx.data.ptr, w_desc.value, w.data.ptr, - ys_descs.data, ys.data.ptr, hy_desc.value, hy.data.ptr, - cy_desc.value, cy.data.ptr, workspace.ptr, workspace.mem.size, - reserve_space.ptr, reserve_space.mem.size) - - return reserve_space, hy, cy, ys - - -def rnn_backward_data( - DropoutStates states, int direction_mode, int rnn_mode, - _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, - _ndarray_base ys, _memory.MemoryPointer reserve_space, - _ndarray_base dhy, _ndarray_base dcy, _ndarray_base dys, - lengths): - hx = core._internal_ascontiguousarray(hx) - if cx is not None: - cx = core._internal_ascontiguousarray(cx) - w = core._internal_ascontiguousarray(w) - xs = core._internal_ascontiguousarray(xs) - ys = _ascontiguousarray_normalized_strides(ys) - dhy = core._internal_ascontiguousarray(dhy) - if dcy is not None: - dcy = core._internal_ascontiguousarray(dcy) - dys = _ascontiguousarray_normalized_strides(dys) - - cdef int length = len(lengths) - cdef int n_layers = _get_n_layers(direction_mode, hx) - cdef int n_units = hx.shape[2] - - cdef size_t handle = get_handle() - cdef Descriptor rnn_desc = create_rnn_descriptor( - n_units, n_layers, states._desc, - cudnn.CUDNN_LINEAR_INPUT, direction_mode, - rnn_mode, get_data_type(xs.dtype)) - - cdef _ndarray_base dxs = _core.ndarray(xs.shape, xs.dtype) - cdef _ndarray_base dhx = _core.ndarray(hx.shape, hx.dtype) - if cx is None: - cx = dcy = _core.ndarray(0, dtype=xs.dtype) - cdef _ndarray_base dcx = _core.ndarray(cx.shape, cx.dtype) - - cdef _DescriptorArray ys_descs = _make_tensor_descriptor_array(ys, lengths) - cdef _DescriptorArray dys_descs = _make_tensor_descriptor_array( - dys, lengths) - cdef Descriptor dhy_desc = create_tensor_nd_descriptor(dhy) - cdef Descriptor dcy_desc = create_tensor_nd_descriptor(dcy) - cdef Descriptor w_desc = create_filter_descriptor(w) - cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) - cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) - cdef _DescriptorArray dxs_descs = _make_tensor_descriptor_array( - dxs, lengths) - cdef Descriptor dhx_desc = create_tensor_nd_descriptor(dhx) - cdef Descriptor dcx_desc = create_tensor_nd_descriptor(dcx) - - cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array(xs, lengths) - cdef _memory.MemoryPointer workspace = _make_rnn_workspace( - rnn_desc, length, xs_descs) - - cudnn.RNNBackwardData( - handle, rnn_desc.value, length, - ys_descs.data, ys.data.ptr, - dys_descs.data, dys.data.ptr, dhy_desc.value, dhy.data.ptr, - dcy_desc.value, dcy.data.ptr, w_desc.value, w.data.ptr, - hx_desc.value, hx.data.ptr, cx_desc.value, cx.data.ptr, - dxs_descs.data, dxs.data.ptr, dhx_desc.value, dhx.data.ptr, - dcx_desc.value, dcx.data.ptr, workspace.ptr, workspace.mem.size, - reserve_space.ptr, reserve_space.mem.size) - - return dhx, dcx, dxs - - -def rnn_backward_weights( - DropoutStates states, int direction_mode, int rnn_mode, - _ndarray_base xs, _ndarray_base hx, _ndarray_base ys, - _ndarray_base w, - _memory.MemoryPointer reserve_space, lengths): - xs = core._internal_ascontiguousarray(xs) - hx = core._internal_ascontiguousarray(hx) - ys = core._internal_ascontiguousarray(ys) - w = core._internal_ascontiguousarray(w) - - cdef int length = len(lengths) - cdef int n_layers = _get_n_layers(direction_mode, hx) - cdef int n_units = hx.shape[2] - - cdef size_t handle = get_handle() - cdef Descriptor rnn_desc = create_rnn_descriptor( - n_units, n_layers, states._desc, - cudnn.CUDNN_LINEAR_INPUT, direction_mode, - rnn_mode, get_data_type(xs.dtype)) - - cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array(xs, lengths) - cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) - cdef _DescriptorArray ys_descs = _make_tensor_descriptor_array(ys, lengths) - - cdef _memory.MemoryPointer workspace = _make_rnn_workspace( - rnn_desc, length, xs_descs) - - cdef _ndarray_base dw = _core.ndarray(w.shape, w.dtype) - dw.fill(0) - cdef Descriptor dw_desc = create_filter_descriptor(dw) - - cudnn.RNNBackwardWeights( - handle, rnn_desc.value, length, - xs_descs.data, xs.data.ptr, - hx_desc.value, hx.data.ptr, ys_descs.data, ys.data.ptr, - workspace.ptr, workspace.mem.size, dw_desc.value, dw.data.ptr, - reserve_space.ptr, reserve_space.mem.size) - return dw - - -def create_dropout_states(handle): - _warnings.warn('create_dropout_states is deprecated.' - 'Please use DropoutStates class instead.', - DeprecationWarning) - state_size = cudnn.dropoutGetStatesSize(handle) - return _core.ndarray((state_size,), 'b') - - -def create_spatial_transformer_descriptor(sampler_type, dtype, nb_dims, dim_A): - desc = Descriptor(cudnn.createSpatialTransformerDescriptor(), - _py_cudnn.destroySpatialTransformerDescriptor) - data_type = get_data_type(dtype) - - cudnn.setSpatialTransformerDescriptor( - desc.value, sampler_type, data_type, nb_dims, dim_A) - return desc - - -def add_tensor(handle, alpha, biasDesc, biasData, beta, srcDestDesc, - srcDestData): - cudnn.addTensor_v3(handle, alpha, biasDesc, - biasData, beta, srcDestDesc, srcDestData) - - -def create_op_tensor_descriptor(op_type, dtype): - desc = Descriptor(cudnn.createOpTensorDescriptor(), - _py_cudnn.destroyOpTensorDescriptor) - data_type = get_data_type(dtype) - - cudnn.setOpTensorDescriptor(desc.value, op_type, data_type, - cudnn.CUDNN_NOT_PROPAGATE_NAN) - return desc - - -def create_reduce_tensor_descriptor(reduce_type, dtype): - desc = Descriptor(cudnn.createReduceTensorDescriptor(), - _py_cudnn.destroyReduceTensorDescriptor) - data_type = get_data_type(dtype) - if reduce_type in (cudnn.CUDNN_REDUCE_TENSOR_MIN, - cudnn.CUDNN_REDUCE_TENSOR_MAX): - indices = cudnn.CUDNN_REDUCE_TENSOR_FLATTENED_INDICES - else: - indices = cudnn.CUDNN_REDUCE_TENSOR_NO_INDICES - - cudnn.setReduceTensorDescriptor(desc.value, reduce_type, data_type, - cudnn.CUDNN_NOT_PROPAGATE_NAN, - indices, - cudnn.CUDNN_32BIT_INDICES) - return desc - - -cpdef bint is_tensor_core_available(dtype) except *: - return (cudnn_version() >= 7000 and - (dtype.char) == 'e' and - int(device.get_compute_capability()) == 70) - - -cdef class DropoutStates: - - cdef public: - # TODO(unno): Make these attributes private. This is for backward - # compatibility. - _memory.MemoryPointer _states - Descriptor _desc - - def __init__(self, handle, seed): - cdef size_t cudnn_handle - if handle is None: - cudnn_handle = get_handle() + desc, data_type, arr._shape.size(), c_shape.data(), + c_strides.data()) + + + cpdef _create_tensor_descriptor(size_t desc, _ndarray_base arr, + int format=cudnn.CUDNN_TENSOR_NCHW): + if not arr._c_contiguous: + raise ValueError('cupyx.cudnn supports c-contiguous arrays only') + if arr._shape.size() == 4: + data_type = get_data_type(arr.dtype) + if format == cudnn.CUDNN_TENSOR_NCHW: + n, c, h, w = arr._shape + elif format == cudnn.CUDNN_TENSOR_NHWC: + n, h, w, c = arr._shape + else: + raise ValueError('unknown cudnnTensorFormat: {}'.format(format)) + cudnn.setTensor4dDescriptor(desc, format, data_type, n, c, h, w) else: - cudnn_handle = handle - state_size = cudnn.dropoutGetStatesSize(cudnn_handle) - self._states = _memory.alloc(state_size) - self._desc = create_dropout_descriptor( - cudnn_handle, 0., self._states.ptr, - state_size, seed) - - def set_dropout_ratio(self, dropout_ratio): - cudnn_handle = get_handle() - set_dropout_descriptor(self._desc, cudnn_handle, dropout_ratio) - - def forward(self, handle, _ndarray_base x, dropout_ratio): - cdef _ndarray_base y, reserve_space - cdef size_t cudnn_handle - # This is for backward compatibility. - if handle is None: - cudnn_handle = get_handle() + _create_tensor_nd_descriptor(desc, arr) + + + cpdef _create_tensor_descriptor_as4darray(size_t desc, + _ndarray_base arr): + cdef Py_ssize_t dim1, dim2 + assert arr._c_contiguous + data_type = get_data_type(arr.dtype) + dim1 = 1 + if arr._shape.size() > 0: + dim1 = arr._shape[0] + dim2 = arr.size // dim1 + cudnn.setTensor4dDescriptor(desc, cudnn.CUDNN_TENSOR_NCHW, data_type, + dim1, dim2, 1, 1) + + + cpdef _create_filter_descriptor( + size_t desc, _ndarray_base arr, int format=cudnn.CUDNN_TENSOR_NCHW): + cdef vector.vector[int] c_shape + cdef Py_ssize_t s, ndim = arr._shape.size() + data_type = get_data_type(arr.dtype) + if ndim == 4: + if format == cudnn.CUDNN_TENSOR_NCHW: + k, c, h, w = arr._shape + elif format == cudnn.CUDNN_TENSOR_NHWC: + k, h, w, c = arr._shape + else: + raise ValueError('unknown cudnnTensorFormat: {}'.format(format)) + cudnn.setFilter4dDescriptor_v4( + desc, data_type, format, k, c, h, w) else: - cudnn_handle = handle - set_dropout_descriptor(self._desc, cudnn_handle, dropout_ratio) - + for s in arr._shape: + c_shape.push_back(s) + cudnn.setFilterNdDescriptor_v4( + desc, data_type, format, ndim, c_shape.data()) + + + cpdef _create_convolution_descriptor( + size_t desc, tuple pad, tuple stride, tuple dilation, int groups, + object dtype, int mode, bint use_tensor_core): + cdef int d0, d1, p0, p1, s0, s1 + cdef vector.vector[int] c_pad, c_stride, c_dilation + ndim = len(pad) + if ndim != len(stride): + raise ValueError('pad and stride must be of same length') + + compute_type = get_data_type(dtype) + # TODO(takagi) Temporarily use computing precision of FP32 for + # storing precision of FP16. + if compute_type == cudnn.CUDNN_DATA_HALF: + compute_type = cudnn.CUDNN_DATA_FLOAT + + if ndim != 2: + c_pad = pad + c_stride = stride + if dilation is None: + c_dilation.assign(ndim, 1) + else: + c_dilation = dilation + if cudnn_version() < 6000: + for i in c_dilation: + if i != 1: + raise ValueError( + 'dilation must be one when cuDNN < 6.0') + cudnn.setConvolutionNdDescriptor_v3( + desc, ndim, c_pad.data(), c_stride.data(), + c_dilation.data(), mode, compute_type) + else: + if dilation is None: + d0 = d1 = 1 + else: + d0, d1 = dilation + if cudnn_version() < 6000 and (d0 != 1 or d1 != 1): + raise ValueError('dilation must be one when cuDNN < 6.0') + p0, p1 = pad + s0, s1 = stride + cudnn.setConvolution2dDescriptor_v5( + desc, p0, p1, s0, s1, d0, d1, mode, compute_type) + IF CUPY_HIP_VERSION == 0: + if cudnn_version() >= 7000: + if use_tensor_core: + math_type = cudnn.CUDNN_TENSOR_OP_MATH + cudnn.setConvolutionMathType(desc, math_type) + if groups > 1: + cudnn.setConvolutionGroupCount(desc, groups) + elif groups > 1: + raise ValueError('groups must be one when cuDNN < 7.0') + + + cpdef _ndarray_base _ascontiguousarray_normalized_strides(_ndarray_base a): + cdef _ndarray_base newarray + + if a._c_contiguous: + newarray = a.view() + newarray._set_contiguous_strides(newarray.itemsize, True) + else: + newarray = _core.ndarray(a.shape, a.dtype) + _elementwise_copy(a, newarray) + return newarray + + + def create_tensor_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): + desc = Descriptor(cudnn.createTensorDescriptor(), + _py_cudnn.destroyTensorDescriptor) + _create_tensor_descriptor(desc.value, arr, format) + return desc + + + def create_uninitialized_tensor_descriptor(): + """Create uninitialized tensor descriptor. + + Create a cudnnCreateTensorDescriptor_t that is not yet initialized. + This is used by the batch normalization functions. + """ + return Descriptor(cudnn.createTensorDescriptor(), + _py_cudnn.destroyTensorDescriptor) + + + def create_tensor_nd_descriptor(_ndarray_base arr): + cdef dict cache + if arr.size == 0: + return Descriptor(0, None) + if not arr.flags.c_contiguous: + raise ValueError('cupyx.cudnn supports c-contiguous arrays only') + data_type = get_data_type(arr.dtype) + key = (data_type, tuple(arr._shape)) + cache = _get_nd_tensor_cache() + if key in cache: + return cache[key] + + # numpy's stride is defined in bytes, but cudnn's stride is defined in + # size of element + desc = Descriptor(cudnn.createTensorDescriptor(), + _py_cudnn.destroyTensorDescriptor) + _create_tensor_nd_descriptor(desc.value, arr, data_type) + cache[key] = desc + return desc + + + def create_filter_descriptor(arr, format=cudnn.CUDNN_TENSOR_NCHW): + desc = Descriptor(cudnn.createFilterDescriptor(), + _py_cudnn.destroyFilterDescriptor) + _create_filter_descriptor(desc.value, arr, format) + return desc + + + def create_convolution_descriptor(pad, stride, dtype, + mode=cudnn.CUDNN_CROSS_CORRELATION, + dilation=None, + use_tensor_core=False, + groups=1): + desc = Descriptor(cudnn.createConvolutionDescriptor(), + _py_cudnn.destroyConvolutionDescriptor) + _create_convolution_descriptor( + desc.value, pad, stride, dilation, groups, + dtype, mode, use_tensor_core) + return desc + + + cdef _create_pooling_descriptor( + size_t desc, tuple ksize, tuple stride, tuple pad, int mode): + cdef vector.vector[int] c_ksize, c_pad, c_stride + cdef int ndim = len(ksize) + if ndim != len(stride) or ndim != len(pad): + raise ValueError('ksize, stride, and pad must be of same length') + if ndim == 2: + cudnn.setPooling2dDescriptor_v4( + desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ksize[0], + ksize[1], pad[0], pad[1], stride[0], stride[1]) + else: + c_ksize = ksize + c_pad = pad + c_stride = stride + cudnn.setPoolingNdDescriptor_v4( + desc, mode, cudnn.CUDNN_NOT_PROPAGATE_NAN, ndim, + c_ksize.data(), c_pad.data(), + c_stride.data()) + + return desc + + + def create_pooling_descriptor(ksize, stride, pad, int mode): + desc = Descriptor(cudnn.createPoolingDescriptor(), + _py_cudnn.destroyPoolingDescriptor) + _create_pooling_descriptor(desc.value, ksize, stride, pad, mode) + return desc + + + cdef Descriptor _create_rnn_data_descriptor(): + return Descriptor(cudnn.createRNNDataDescriptor(), + _py_cudnn.destroyRNNDataDescriptor) + + + cdef Descriptor _make_unpacked_rnn_data_descriptor(_ndarray_base xs, lengths): + cdef Descriptor descriptor = _create_rnn_data_descriptor() + cdef int data_type = get_data_type(xs.dtype) + cdef Py_ssize_t max_length, batch, n_dim + max_length, batch, n_dim = xs.shape + cudnn.setRNNDataDescriptor( + descriptor.value, data_type, + cudnn.CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED, + max_length, batch, n_dim, + lengths.ctypes.data, 0) + return descriptor + + + def rnn_forward_inference_ex( + DropoutStates states, int direction_mode, int rnn_mode, + _ndarray_base hx, _ndarray_base cx, _ndarray_base w, + _ndarray_base xs, lengths): + hx = core._internal_ascontiguousarray(hx) + if cx is not None: + cx = core._internal_ascontiguousarray(cx) + w = core._internal_ascontiguousarray(w) + xs = core._internal_ascontiguousarray(xs) + + cdef int length = xs._shape[0] + cdef int n_layers = _get_n_layers(direction_mode, hx) + cdef int n_units = hx._shape[2] + + cdef _ndarray_base ys = _make_rnn_result_array(direction_mode, n_units, xs) + cdef _ndarray_base hy = _core.ndarray(hx.shape, hx.dtype) + if cx is None: + cx = _core.ndarray(0, dtype=xs.dtype) + cdef _ndarray_base cy = _core.ndarray(cx.shape, cx.dtype) + + cdef size_t handle = get_handle() + + cdef Descriptor rnn_desc = create_rnn_descriptor( + n_units, n_layers, states._desc, + cudnn.CUDNN_LINEAR_INPUT, direction_mode, + rnn_mode, get_data_type(xs.dtype)) + cudnn.setRNNPaddingMode( + rnn_desc.value, cudnn.CUDNN_RNN_PADDED_IO_ENABLED) + + cdef Descriptor x_data_desc = _make_unpacked_rnn_data_descriptor( + xs, lengths) + cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) + cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) + cdef Descriptor w_desc = create_filter_descriptor(w) + cdef Descriptor y_data_desc = _make_unpacked_rnn_data_descriptor( + ys, lengths) + cdef Descriptor hy_desc = create_tensor_nd_descriptor(hy) + cdef Descriptor cy_desc = create_tensor_nd_descriptor(cy) + + cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array_for_padded( + xs) + cdef _memory.MemoryPointer workspace = _make_rnn_workspace( + rnn_desc, length, xs_descs) + + cudnn.RNNForwardInferenceEx( + handle, rnn_desc.value, + x_data_desc.value, xs.data.ptr, + hx_desc.value, hx.data.ptr, + cx_desc.value, cx.data.ptr, + w_desc.value, w.data.ptr, + y_data_desc.value, ys.data.ptr, + hy_desc.value, hy.data.ptr, + cy_desc.value, cy.data.ptr, + 0, 0, 0, 0, 0, 0, 0, 0, + workspace.ptr, workspace.mem.size) + + return hy, cy, ys + + + def rnn_forward_training_ex( + DropoutStates states, int direction_mode, int rnn_mode, + _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, + lengths): + hx = core._internal_ascontiguousarray(hx) + if cx is not None: + cx = core._internal_ascontiguousarray(cx) + w = core._internal_ascontiguousarray(w) + xs = core._internal_ascontiguousarray(xs) + + cdef int length = xs._shape[0] + cdef int n_layers = _get_n_layers(direction_mode, hx) + cdef int n_units = hx._shape[2] + + cdef size_t handle = get_handle() + + cdef Descriptor rnn_desc = create_rnn_descriptor( + n_units, n_layers, states._desc, + cudnn.CUDNN_LINEAR_INPUT, direction_mode, + rnn_mode, get_data_type(xs.dtype)) + cudnn.setRNNPaddingMode( + rnn_desc.value, cudnn.CUDNN_RNN_PADDED_IO_ENABLED) + + cdef _ndarray_base ys = _make_rnn_result_array(direction_mode, n_units, xs) + cdef _ndarray_base hy = _core.ndarray(hx.shape, hx.dtype) + if cx is None: + cx = _core.ndarray(0, dtype=xs.dtype) + cdef _ndarray_base cy = _core.ndarray(cx.shape, cx.dtype) + + cdef Descriptor x_data_desc = _make_unpacked_rnn_data_descriptor( + xs, lengths) + cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) + cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) + cdef Descriptor w_desc = create_filter_descriptor(w) + cdef Descriptor y_data_desc = _make_unpacked_rnn_data_descriptor( + ys, lengths) + cdef Descriptor hy_desc = create_tensor_nd_descriptor(hy) + cdef Descriptor cy_desc = create_tensor_nd_descriptor(cy) + + cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array_for_padded( + xs) + cdef _memory.MemoryPointer workspace = _make_rnn_workspace( + rnn_desc, length, xs_descs) + cdef _memory.MemoryPointer reserve_space = _make_rnn_reserve_space( + rnn_desc, length, xs_descs) + + cudnn.RNNForwardTrainingEx( + handle, rnn_desc.value, + x_data_desc.value, xs.data.ptr, + hx_desc.value, hx.data.ptr, + cx_desc.value, cx.data.ptr, + w_desc.value, w.data.ptr, + y_data_desc.value, ys.data.ptr, + hy_desc.value, hy.data.ptr, + cy_desc.value, cy.data.ptr, + 0, 0, 0, 0, 0, 0, 0, 0, + workspace.ptr, workspace.mem.size, + reserve_space.ptr, reserve_space.mem.size) + + return reserve_space, hy, cy, ys + + + def rnn_backward_data_ex( + DropoutStates states, int direction_mode, int rnn_mode, + _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, + _ndarray_base ys, _memory.MemoryPointer reserve_space, + _ndarray_base dhy, _ndarray_base dcy, _ndarray_base dys, + lengths): + hx = core._internal_ascontiguousarray(hx) + if cx is not None: + cx = core._internal_ascontiguousarray(cx) + w = core._internal_ascontiguousarray(w) + xs = core._internal_ascontiguousarray(xs) + ys = core._internal_ascontiguousarray(ys) + dhy = core._internal_ascontiguousarray(dhy) + if dcy is not None: + dcy = core._internal_ascontiguousarray(dcy) + dys = core._internal_ascontiguousarray(dys) + + cdef int length = xs._shape[0] + cdef int n_layers = _get_n_layers(direction_mode, hx) + cdef int n_units = hx._shape[2] + + cdef size_t handle = get_handle() + cdef Descriptor rnn_desc = create_rnn_descriptor( + n_units, n_layers, states._desc, + cudnn.CUDNN_LINEAR_INPUT, direction_mode, + rnn_mode, get_data_type(xs.dtype)) + cudnn.setRNNPaddingMode( + rnn_desc.value, cudnn.CUDNN_RNN_PADDED_IO_ENABLED) + + cdef _ndarray_base dxs = _core.ndarray(xs.shape, xs.dtype) + cdef _ndarray_base dhx = _core.ndarray(hx.shape, hx.dtype) + if cx is None: + cx = dcy = _core.ndarray(0, dtype=xs.dtype) + cdef _ndarray_base dcx = _core.ndarray(cx.shape, cx.dtype) + + cdef Descriptor y_data_desc = _make_unpacked_rnn_data_descriptor( + ys, lengths) + cdef Descriptor dy_data_desc = _make_unpacked_rnn_data_descriptor( + dys, lengths) + cdef Descriptor dhy_desc = create_tensor_nd_descriptor(dhy) + cdef Descriptor dcy_desc = create_tensor_nd_descriptor(dcy) + cdef Descriptor w_desc = create_filter_descriptor(w) + cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) + cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) + cdef Descriptor dx_data_desc = _make_unpacked_rnn_data_descriptor( + dxs, lengths) + cdef Descriptor dhx_desc = create_tensor_nd_descriptor(dhx) + cdef Descriptor dcx_desc = create_tensor_nd_descriptor(dcx) + + cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array_for_padded( + xs) + cdef _memory.MemoryPointer workspace = _make_rnn_workspace( + rnn_desc, length, xs_descs) + + cudnn.RNNBackwardDataEx( + handle, rnn_desc.value, + y_data_desc.value, ys.data.ptr, + dy_data_desc.value, dys.data.ptr, + 0, 0, + dhy_desc.value, dhy.data.ptr, + dcy_desc.value, dcy.data.ptr, + w_desc.value, w.data.ptr, + hx_desc.value, hx.data.ptr, + cx_desc.value, cx.data.ptr, + dx_data_desc.value, dxs.data.ptr, + dhx_desc.value, dhx.data.ptr, + dcx_desc.value, dcx.data.ptr, + 0, 0, + workspace.ptr, workspace.mem.size, + reserve_space.ptr, reserve_space.mem.size) + + return dhx, dcx, dxs + + + def rnn_backward_weights_ex( + DropoutStates states, int direction_mode, int rnn_mode, + _ndarray_base xs, _ndarray_base hx, _ndarray_base ys, + _ndarray_base w, + _memory.MemoryPointer reserve_space, lengths): + xs = core._internal_ascontiguousarray(xs) + hx = core._internal_ascontiguousarray(hx) + ys = core._internal_ascontiguousarray(ys) + w = core._internal_ascontiguousarray(w) + + cdef int length = xs._shape[0] + cdef int n_layers = _get_n_layers(direction_mode, hx) + cdef int n_units = hx._shape[2] + + cdef size_t handle = get_handle() + cdef Descriptor rnn_desc = create_rnn_descriptor( + n_units, n_layers, states._desc, + cudnn.CUDNN_LINEAR_INPUT, direction_mode, + rnn_mode, get_data_type(xs.dtype)) + cudnn.setRNNPaddingMode( + rnn_desc.value, cudnn.CUDNN_RNN_PADDED_IO_ENABLED) + + cdef Descriptor x_data_desc = _make_unpacked_rnn_data_descriptor( + xs, lengths) + cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) + cdef Descriptor y_data_desc = _make_unpacked_rnn_data_descriptor( + ys, lengths) + + cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array_for_padded( + xs) + cdef _memory.MemoryPointer workspace = _make_rnn_workspace( + rnn_desc, length, xs_descs) + + cdef _ndarray_base dw = _core.ndarray(w.shape, w.dtype) + dw.fill(0) + cdef Descriptor dw_desc = create_filter_descriptor(dw) + + cudnn.RNNBackwardWeightsEx( + handle, rnn_desc.value, + x_data_desc.value, xs.data.ptr, + hx_desc.value, hx.data.ptr, + y_data_desc.value, ys.data.ptr, + workspace.ptr, workspace.mem.size, + dw_desc.value, dw.data.ptr, + reserve_space.ptr, reserve_space.mem.size) + return dw + + cdef int _create_tensor_descriptor_for_softmax( + size_t desc, _ndarray_base arr, int axis) except?-1: + cdef Py_ssize_t left, center, right + assert arr._c_contiguous + data_type = get_data_type(arr.dtype) + if axis < 0: + axis += arr._shape.size() + left = 1 + for i in range(0, axis): + left *= arr._shape[i] + center = arr._shape[axis] + right = 1 + for i in range(axis + 1, arr._shape.size()): + right *= arr._shape[i] + cudnn.setTensor4dDescriptor(desc, cudnn.CUDNN_TENSOR_NCHW, data_type, + left, center, right, 1) + if center == 1 and right == 1: + return cudnn.CUDNN_SOFTMAX_MODE_INSTANCE + else: + return cudnn.CUDNN_SOFTMAX_MODE_CHANNEL + + + def softmax_forward(_ndarray_base x, int axis, int algorithm): + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + cdef _ndarray_base y + if x.dtype == 'd': + zero = &double_zero + one = &double_one + else: + zero = &float_zero + one = &float_one + x = core._internal_ascontiguousarray(x) y = _core.ndarray(x._shape, x.dtype) - - x_desc = cudnn.createTensorDescriptor() + + handle = get_handle() + desc = cudnn.createTensorDescriptor() try: - _create_tensor_descriptor_as4darray(x_desc, x) - reserve_size = cudnn.getDropoutReserveSpaceSize(x_desc) - reserve_space = _core.ndarray((reserve_size,), 'b') - - cudnn.dropoutForward(cudnn_handle, self._desc.value, - x_desc, x.data.ptr, x_desc, y.data.ptr, - reserve_space.data.ptr, reserve_size) + cudnn_mode = _create_tensor_descriptor_for_softmax(desc, x, axis) + cudnn.softmaxForward( + handle, algorithm, cudnn_mode, + one, desc, x.data.ptr, zero, desc, y.data.ptr) finally: - cudnn.destroyTensorDescriptor(x_desc) - return reserve_space, y - - def backward(self, handle, _ndarray_base dy, dropout_ratio, - _ndarray_base reserve_space): - cdef _ndarray_base dx - cdef size_t cudnn_handle - # This is for backward compatibility. - if handle is None: - cudnn_handle = get_handle() + cudnn.destroyTensorDescriptor(desc) + return y + + + def softmax_backward( + _ndarray_base y, _ndarray_base gy, int axis, int algorithm): + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + cdef _ndarray_base gx + if y.dtype == 'd': + zero = &double_zero + one = &double_one else: - cudnn_handle = handle - set_dropout_descriptor(self._desc, cudnn_handle, dropout_ratio) - - dy = core._internal_ascontiguousarray(dy) - dx = _core.ndarray(dy._shape, dy.dtype) - - dy_desc = cudnn.createTensorDescriptor() + zero = &float_zero + one = &float_one + + gx = _core.ndarray(y._shape, y.dtype) + y = core._internal_ascontiguousarray(y) + gy = core._internal_ascontiguousarray(gy) + + handle = get_handle() + desc = cudnn.createTensorDescriptor() try: - _create_tensor_descriptor_as4darray(dy_desc, dy) - cudnn.dropoutBackward(cudnn_handle, self._desc.value, - dy_desc, dy.data.ptr, - dy_desc, dx.data.ptr, - reserve_space.data.ptr, - reserve_space.size) + cudnn_mode = _create_tensor_descriptor_for_softmax(desc, y, axis) + cudnn.softmaxBackward( + handle, algorithm, cudnn_mode, + one, desc, y.data.ptr, desc, gy.data.ptr, zero, desc, gx.data.ptr) finally: - cudnn.destroyTensorDescriptor(dy_desc) - return dx - - -cdef class _Algorithm: - cdef: - int algo - int mathType - size_t memory - - def __cinit__(self, int algo, size_t memory, int mathType=0): - self.algo = algo - self.memory = memory - self.mathType = mathType - - -cdef dict _get_algorithm_fwd_cache = {} -cdef dict _get_algorithm_bwd_filter_cache = {} -cdef dict _get_algorithm_bwd_data_cache = {} -cdef dict _algorithm_fwd_cache = {} -cdef dict _algorithm_bwd_filter_cache = {} -cdef dict _algorithm_bwd_data_cache = {} - - -cpdef _warn_algorithm_fwd( - _ndarray_base x, _ndarray_base W, _ndarray_base y, tuple conv_param): - _warnings.warn( - 'Tensor Core mode is set but the selected convolution forward ' - 'algorithm is not a Tensor Core enabled algorithm. ' - 'This might be due to lack of workspace memory. ' - 'x.shape:{}, W.shape:{}, y.shape:{}, pad:{}, stride:{}' - .format(x.shape, W.shape, y.shape, conv_param[0], conv_param[1]), - _util.PerformanceWarning) - - -cpdef _Algorithm _find_algorithm_fwd( - _ndarray_base x, _ndarray_base W, _ndarray_base y, tuple conv_param, - size_t handle, size_t x_desc, size_t filter_desc, size_t conv_desc, - size_t y_desc, size_t max_workspace_size, bint use_tensor_core): - cdef cudnn.CuDNNAlgoPerf perf - key = (x.data.device.id, x.shape, W.shape, y.shape, conv_param, - max_workspace_size) - algo = _algorithm_fwd_cache.get(key, None) - if algo is not None: - return algo - workspace = _memory.alloc(max_workspace_size) - if cudnn_version() >= 7000: - perf = cudnn.findConvolutionForwardAlgorithmEx_v7( - handle, x_desc, x.data.ptr, filter_desc, W.data.ptr, conv_desc, - y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] - if use_tensor_core and perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH: - _warn_algorithm_fwd(x, W, y, conv_param) - else: - perf = cudnn.findConvolutionForwardAlgorithmEx( - handle, x_desc, x.data.ptr, filter_desc, W.data.ptr, conv_desc, - y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] - if perf.status != cudnn.CUDNN_STATUS_SUCCESS: - raise RuntimeError('No available algorithm found.') - algo = _Algorithm(perf.algo, perf.memory, perf.mathType) - _algorithm_fwd_cache[key] = algo - return algo - - -cpdef _Algorithm _get_algorithm_fwd( - _ndarray_base x, _ndarray_base W, _ndarray_base y, tuple conv_param, - size_t handle, size_t x_desc, size_t filter_desc, size_t conv_desc, - size_t y_desc, size_t max_workspace_size, bint use_tensor_core): - cdef cudnn.CuDNNAlgoPerf perf - key = (x.data.device.id, x.shape, W.shape, y.shape, conv_param, - max_workspace_size) - algo = _get_algorithm_fwd_cache.get(key, None) - if algo is not None: - return algo - cdef list ret - cdef bint skip - cdef int cudnn_ver = cudnn_version() - if (use_tensor_core and cudnn_ver >= 7000) or cudnn_ver >= 8000: - ret = cudnn.getConvolutionForwardAlgorithm_v7( - handle, x_desc, filter_desc, conv_desc, y_desc, 10) - skip = False - for perf in ret: - if perf.memory <= max_workspace_size: - break - skip = True + cudnn.destroyTensorDescriptor(desc) + return gx + + + def create_dropout_descriptor( + handle, dropout, states, state_size_in_bytes, seed): + desc = Descriptor(cudnn.createDropoutDescriptor(), + _py_cudnn.destroyDropoutDescriptor) + cudnn.setDropoutDescriptor(desc.value, handle, dropout, + states, state_size_in_bytes, seed) + return desc + + + def set_dropout_descriptor(desc, handle, dropout): + # When the fourth argument is NULL, random state is not updated. + cudnn.setDropoutDescriptor(desc.value, handle, dropout, 0, 0, 0) + + + def _create_ctc_loss_descriptor(data_type): + desc = Descriptor(cudnn.createCTCLossDescriptor(), + _py_cudnn.destroyCTCLossDescriptor) + cudnn.setCTCLossDescriptor(desc.value, data_type) + return desc + + + def ctc_loss(_ndarray_base probs, labels, + label_length, input_length, int algo): + batch_size = probs.shape[1] + labels_ptr = labels.ctypes.data + label_length_ptr = label_length.ctypes.data + input_length_ptr = input_length.ctypes.data + handle = get_handle() + data_type = get_data_type(probs.dtype) + ctc_desc = Descriptor(cudnn.createCTCLossDescriptor(), + _py_cudnn.destroyCTCLossDescriptor) + cudnn.setCTCLossDescriptor(ctc_desc.value, data_type) + + gradients = _core.ndarray(probs._shape, probs.dtype) + loss = _core.ndarray((batch_size, ), 'f') + probs_desc = create_tensor_descriptor(probs) + gradients_desc = create_tensor_descriptor(gradients) + + work_size = cudnn.getCTCLossWorkspaceSize( + handle, probs_desc.value, gradients_desc.value, + labels_ptr, label_length_ptr, + input_length_ptr, algo, ctc_desc.value) + workspace = _core.ndarray((work_size,), 'b') + + cudnn.CTCLoss(handle, probs_desc.value, probs.data.ptr, + labels_ptr, label_length_ptr, + input_length_ptr, loss.data.ptr, gradients_desc.value, + gradients.data.ptr, algo, ctc_desc.value, + workspace.data.ptr, work_size) + return loss, gradients + + + def create_rnn_descriptor(hidden_size, num_layers, dropout_desc, + input_mode, direction, mode, data_type, algo=None): + desc = Descriptor(cudnn.createRNNDescriptor(), + _py_cudnn.destroyRNNDescriptor) + if cudnn_version() >= 6000: + _handle = get_handle() + if algo is None: + algo = cudnn.CUDNN_RNN_ALGO_STANDARD + cudnn.setRNNDescriptor_v6( + _handle, desc.value, hidden_size, num_layers, dropout_desc.value, + input_mode, direction, mode, algo, data_type) else: - raise RuntimeError('No conv fwd algo available with workspace size' - ' less equal {}'.format(max_workspace_size)) - if skip: - _warnings.warn( - 'The best algo of conv fwd might not be selected due to ' - 'lack of workspace size ({})'.format(max_workspace_size), - _util.PerformanceWarning) - algo = perf.algo - workspace_size = perf.memory - math_type = perf.mathType - if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH: - _warn_algorithm_fwd(x, W, y, conv_param) + cudnn.setRNNDescriptor_v5( + desc.value, hidden_size, num_layers, dropout_desc.value, + input_mode, direction, mode, data_type) + return desc + + + def get_rnn_lin_layer_matrix_params( + handle, rnn_desc, layer, x_desc, w_desc, _ndarray_base w, + lin_layer_id): + cdef size_t ptr = 0 + mat_desc = cudnn.createFilterDescriptor() + try: + cudnn.getRNNLinLayerMatrixParams( + handle, rnn_desc.value, layer, x_desc.value, w_desc.value, + w.data.ptr, lin_layer_id, mat_desc, &ptr) + data_type, _, _, dim = cudnn.getFilterNdDescriptor(mat_desc, 3) + finally: + cudnn.destroyFilterDescriptor(mat_desc) + byte_size = _get_byte_size(data_type) + offset = (ptr - w.data.ptr) // byte_size + size = internal.prod_sequence(dim) + mat = w[offset:offset + size] + return mat + + + def get_rnn_lin_layer_bias_params( + handle, rnn_desc, layer, x_desc, w_desc, _ndarray_base w, + lin_layer_id): + cdef size_t ptr = 0 + bias_desc = cudnn.createFilterDescriptor() + try: + cudnn.getRNNLinLayerBiasParams( + handle, rnn_desc.value, layer, x_desc.value, w_desc.value, + w.data.ptr, lin_layer_id, bias_desc, &ptr) + data_type, _, _, dim = cudnn.getFilterNdDescriptor(bias_desc, 3) + finally: + cudnn.destroyFilterDescriptor(bias_desc) + byte_size = _get_byte_size(data_type) + offset = (ptr - w.data.ptr) // byte_size + size = internal.prod_sequence(dim) + bias = w[offset:offset + size] + return bias + + + cdef class _DescriptorArray: + + cdef: + vector.vector[size_t] _value + object _destroy + + def __init__(self, destroyer): + self._destroy = destroyer + + def __dealloc__(self): + for desc in self._value: + self._destroy(desc) + + def append(self, desc): + self._value.push_back(desc) + + @property + def data(self): + return self._value.data() + + + cdef _DescriptorArray _make_tensor_descriptor_array(xs, lengths): + """Make an array of pointers denoting pointers of tensor descriptors. + + """ + cdef _DescriptorArray descs = _DescriptorArray( + _py_cudnn.destroyTensorDescriptor) + cdef size_t desc + cdef int data_type = get_data_type(xs.dtype) + cdef vector.vector[int] c_shape, c_strides + cdef Py_ssize_t itemsize = xs.itemsize + cdef Py_ssize_t s + cdef int length + + # RNN APIs assumes ndim == 3. + for s in xs._strides: + c_strides.push_back(s // itemsize) + for _ in range(3 - len(xs._strides)): + c_strides.push_back(1) + for s in xs._shape: + c_shape.push_back(s) + for _ in range(3 - len(xs._strides)): + c_shape.push_back(1) + + for length in lengths: + c_shape[0] = length + desc = cudnn.createTensorDescriptor() + descs.append(desc) + cudnn.setTensorNdDescriptor( + desc, data_type, 3, + c_shape.data(), c_strides.data()) + + return descs + + + cdef _DescriptorArray _make_tensor_descriptor_array_for_padded(xs): + assert xs.ndim == 3 + + cdef _DescriptorArray descs = _DescriptorArray( + _py_cudnn.destroyTensorDescriptor) + cdef size_t desc + cdef int data_type = get_data_type(xs.dtype) + cdef Py_ssize_t itemsize = xs.itemsize + + # RNN APIs assumes ndim == 3. + cdef vector.vector[int] c_shape = [xs._shape[1], xs._shape[2], 1] + cdef vector.vector[int] c_strides = [ + xs._strides[1] // itemsize, xs._strides[2] // itemsize, 1] + + for _ in range(xs._shape[0]): + desc = cudnn.createTensorDescriptor() + descs.append(desc) + cudnn.setTensorNdDescriptor( + desc, data_type, 3, + c_shape.data(), c_strides.data()) + + return descs + + + cdef _memory.MemoryPointer _make_rnn_workspace( + Descriptor rnn_desc, int length, _DescriptorArray descs): + cdef size_t handle = get_handle() + cdef size_t work_size = cudnn.getRNNWorkspaceSize( + handle, rnn_desc.value, length, descs.data) + return _memory.alloc(work_size) + + + cdef _memory.MemoryPointer _make_rnn_reserve_space( + Descriptor rnn_desc, int length, _DescriptorArray descs): + cdef size_t handle = get_handle() + cdef size_t reserve_size = cudnn.getRNNTrainingReserveSize( + handle, rnn_desc.value, length, descs.data) + return _memory.alloc(reserve_size) + + + cdef Py_ssize_t _get_n_layers(int direction_mode, _ndarray_base hx): + if direction_mode == cudnn.CUDNN_BIDIRECTIONAL: + return hx._shape[0] // 2 + else: # cudnn.CUDNN_UNIDIRECTIONAL + return hx._shape[0] + + + cdef _ndarray_base _make_rnn_result_array( + int direction_mode, Py_ssize_t n_units, _ndarray_base xs): + cdef int output_units + if direction_mode == cudnn.CUDNN_BIDIRECTIONAL: + output_units = n_units * 2 + else: # cudnn.CUDNN_UNIDIRECTIONAL + output_units = n_units + + shape = xs.shape[:-1] + (output_units,) + return _core.ndarray(shape, dtype=xs.dtype) + + + def rnn_forward_inference( + DropoutStates states, int direction_mode, int rnn_mode, + _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, + lengths): + hx = core._internal_ascontiguousarray(hx) + if cx is not None: + cx = core._internal_ascontiguousarray(cx) + w = core._internal_ascontiguousarray(w) + xs = core._internal_ascontiguousarray(xs) + + cdef int length = len(lengths) + cdef int n_layers = _get_n_layers(direction_mode, hx) + cdef int n_units = hx.shape[2] + + cdef size_t handle = get_handle() + + cdef Descriptor rnn_desc = create_rnn_descriptor( + n_units, n_layers, states._desc, + cudnn.CUDNN_LINEAR_INPUT, direction_mode, + rnn_mode, get_data_type(xs.dtype)) + + cdef _ndarray_base ys = _make_rnn_result_array(direction_mode, n_units, xs) + cdef _ndarray_base hy = _core.ndarray(hx.shape, hx.dtype) + if cx is None: + cx = _core.ndarray(0, dtype=xs.dtype) + cdef _ndarray_base cy = _core.ndarray(cx.shape, cx.dtype) + + cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array(xs, lengths) + cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) + cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) + cdef Descriptor w_desc = create_filter_descriptor(w) + cdef _DescriptorArray ys_descs = _make_tensor_descriptor_array(ys, lengths) + cdef Descriptor hy_desc = create_tensor_nd_descriptor(hy) + cdef Descriptor cy_desc = create_tensor_nd_descriptor(cy) + + cdef _memory.MemoryPointer workspace = _make_rnn_workspace( + rnn_desc, length, xs_descs) + + cudnn.RNNForwardInference( + handle, rnn_desc.value, length, + xs_descs.data, xs.data.ptr, hx_desc.value, hx.data.ptr, + cx_desc.value, cx.data.ptr, w_desc.value, w.data.ptr, + ys_descs.data, ys.data.ptr, hy_desc.value, hy.data.ptr, + cy_desc.value, cy.data.ptr, workspace.ptr, workspace.mem.size) + + return hy, cy, ys + + + def rnn_forward_training( + DropoutStates states, int direction_mode, int rnn_mode, + _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, + lengths): + hx = core._internal_ascontiguousarray(hx) + if cx is not None: + cx = core._internal_ascontiguousarray(cx) + w = core._internal_ascontiguousarray(w) + xs = core._internal_ascontiguousarray(xs) + + cdef int length = len(lengths) + cdef int n_layers = _get_n_layers(direction_mode, hx) + cdef int n_units = hx.shape[2] + + cdef size_t handle = get_handle() + + cdef Descriptor rnn_desc = create_rnn_descriptor( + n_units, n_layers, states._desc, + cudnn.CUDNN_LINEAR_INPUT, direction_mode, + rnn_mode, get_data_type(xs.dtype)) + + cdef _ndarray_base ys = _make_rnn_result_array(direction_mode, n_units, xs) + cdef _ndarray_base hy = _core.ndarray(hx.shape, hx.dtype) + if cx is None: + cx = _core.ndarray(0, dtype=xs.dtype) + cdef _ndarray_base cy = _core.ndarray(cx.shape, cx.dtype) + + cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array(xs, lengths) + cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) + cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) + cdef Descriptor w_desc = create_filter_descriptor(w) + cdef _DescriptorArray ys_descs = _make_tensor_descriptor_array(ys, lengths) + cdef Descriptor hy_desc = create_tensor_nd_descriptor(hy) + cdef Descriptor cy_desc = create_tensor_nd_descriptor(cy) + + cdef _memory.MemoryPointer workspace = _make_rnn_workspace( + rnn_desc, length, xs_descs) + cdef _memory.MemoryPointer reserve_space = _make_rnn_reserve_space( + rnn_desc, length, xs_descs) + + cudnn.RNNForwardTraining( + handle, rnn_desc.value, length, + xs_descs.data, xs.data.ptr, hx_desc.value, hx.data.ptr, + cx_desc.value, cx.data.ptr, w_desc.value, w.data.ptr, + ys_descs.data, ys.data.ptr, hy_desc.value, hy.data.ptr, + cy_desc.value, cy.data.ptr, workspace.ptr, workspace.mem.size, + reserve_space.ptr, reserve_space.mem.size) + + return reserve_space, hy, cy, ys + + + def rnn_backward_data( + DropoutStates states, int direction_mode, int rnn_mode, + _ndarray_base hx, _ndarray_base cx, _ndarray_base w, _ndarray_base xs, + _ndarray_base ys, _memory.MemoryPointer reserve_space, + _ndarray_base dhy, _ndarray_base dcy, _ndarray_base dys, + lengths): + hx = core._internal_ascontiguousarray(hx) + if cx is not None: + cx = core._internal_ascontiguousarray(cx) + w = core._internal_ascontiguousarray(w) + xs = core._internal_ascontiguousarray(xs) + ys = _ascontiguousarray_normalized_strides(ys) + dhy = core._internal_ascontiguousarray(dhy) + if dcy is not None: + dcy = core._internal_ascontiguousarray(dcy) + dys = _ascontiguousarray_normalized_strides(dys) + + cdef int length = len(lengths) + cdef int n_layers = _get_n_layers(direction_mode, hx) + cdef int n_units = hx.shape[2] + + cdef size_t handle = get_handle() + cdef Descriptor rnn_desc = create_rnn_descriptor( + n_units, n_layers, states._desc, + cudnn.CUDNN_LINEAR_INPUT, direction_mode, + rnn_mode, get_data_type(xs.dtype)) + + cdef _ndarray_base dxs = _core.ndarray(xs.shape, xs.dtype) + cdef _ndarray_base dhx = _core.ndarray(hx.shape, hx.dtype) + if cx is None: + cx = dcy = _core.ndarray(0, dtype=xs.dtype) + cdef _ndarray_base dcx = _core.ndarray(cx.shape, cx.dtype) + + cdef _DescriptorArray ys_descs = _make_tensor_descriptor_array(ys, lengths) + cdef _DescriptorArray dys_descs = _make_tensor_descriptor_array( + dys, lengths) + cdef Descriptor dhy_desc = create_tensor_nd_descriptor(dhy) + cdef Descriptor dcy_desc = create_tensor_nd_descriptor(dcy) + cdef Descriptor w_desc = create_filter_descriptor(w) + cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) + cdef Descriptor cx_desc = create_tensor_nd_descriptor(cx) + cdef _DescriptorArray dxs_descs = _make_tensor_descriptor_array( + dxs, lengths) + cdef Descriptor dhx_desc = create_tensor_nd_descriptor(dhx) + cdef Descriptor dcx_desc = create_tensor_nd_descriptor(dcx) + + cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array(xs, lengths) + cdef _memory.MemoryPointer workspace = _make_rnn_workspace( + rnn_desc, length, xs_descs) + + cudnn.RNNBackwardData( + handle, rnn_desc.value, length, + ys_descs.data, ys.data.ptr, + dys_descs.data, dys.data.ptr, dhy_desc.value, dhy.data.ptr, + dcy_desc.value, dcy.data.ptr, w_desc.value, w.data.ptr, + hx_desc.value, hx.data.ptr, cx_desc.value, cx.data.ptr, + dxs_descs.data, dxs.data.ptr, dhx_desc.value, dhx.data.ptr, + dcx_desc.value, dcx.data.ptr, workspace.ptr, workspace.mem.size, + reserve_space.ptr, reserve_space.mem.size) + + return dhx, dcx, dxs + + + def rnn_backward_weights( + DropoutStates states, int direction_mode, int rnn_mode, + _ndarray_base xs, _ndarray_base hx, _ndarray_base ys, + _ndarray_base w, + _memory.MemoryPointer reserve_space, lengths): + xs = core._internal_ascontiguousarray(xs) + hx = core._internal_ascontiguousarray(hx) + ys = core._internal_ascontiguousarray(ys) + w = core._internal_ascontiguousarray(w) + + cdef int length = len(lengths) + cdef int n_layers = _get_n_layers(direction_mode, hx) + cdef int n_units = hx.shape[2] + + cdef size_t handle = get_handle() + cdef Descriptor rnn_desc = create_rnn_descriptor( + n_units, n_layers, states._desc, + cudnn.CUDNN_LINEAR_INPUT, direction_mode, + rnn_mode, get_data_type(xs.dtype)) + + cdef _DescriptorArray xs_descs = _make_tensor_descriptor_array(xs, lengths) + cdef Descriptor hx_desc = create_tensor_nd_descriptor(hx) + cdef _DescriptorArray ys_descs = _make_tensor_descriptor_array(ys, lengths) + + cdef _memory.MemoryPointer workspace = _make_rnn_workspace( + rnn_desc, length, xs_descs) + + cdef _ndarray_base dw = _core.ndarray(w.shape, w.dtype) + dw.fill(0) + cdef Descriptor dw_desc = create_filter_descriptor(dw) + + cudnn.RNNBackwardWeights( + handle, rnn_desc.value, length, + xs_descs.data, xs.data.ptr, + hx_desc.value, hx.data.ptr, ys_descs.data, ys.data.ptr, + workspace.ptr, workspace.mem.size, dw_desc.value, dw.data.ptr, + reserve_space.ptr, reserve_space.mem.size) + return dw + + + def create_dropout_states(handle): + _warnings.warn('create_dropout_states is deprecated.' + 'Please use DropoutStates class instead.', + DeprecationWarning) + state_size = cudnn.dropoutGetStatesSize(handle) + return _core.ndarray((state_size,), 'b') + + + def create_spatial_transformer_descriptor(sampler_type, dtype, nb_dims, dim_A): + desc = Descriptor(cudnn.createSpatialTransformerDescriptor(), + _py_cudnn.destroySpatialTransformerDescriptor) + data_type = get_data_type(dtype) + + cudnn.setSpatialTransformerDescriptor( + desc.value, sampler_type, data_type, nb_dims, dim_A) + return desc + + + def add_tensor(handle, alpha, biasDesc, biasData, beta, srcDestDesc, + srcDestData): + cudnn.addTensor_v3(handle, alpha, biasDesc, + biasData, beta, srcDestDesc, srcDestData) + + + def create_op_tensor_descriptor(op_type, dtype): + desc = Descriptor(cudnn.createOpTensorDescriptor(), + _py_cudnn.destroyOpTensorDescriptor) + data_type = get_data_type(dtype) + + cudnn.setOpTensorDescriptor(desc.value, op_type, data_type, + cudnn.CUDNN_NOT_PROPAGATE_NAN) + return desc + + + def create_reduce_tensor_descriptor(reduce_type, dtype): + desc = Descriptor(cudnn.createReduceTensorDescriptor(), + _py_cudnn.destroyReduceTensorDescriptor) + data_type = get_data_type(dtype) + if reduce_type in (cudnn.CUDNN_REDUCE_TENSOR_MIN, + cudnn.CUDNN_REDUCE_TENSOR_MAX): + indices = cudnn.CUDNN_REDUCE_TENSOR_FLATTENED_INDICES + else: + indices = cudnn.CUDNN_REDUCE_TENSOR_NO_INDICES + + cudnn.setReduceTensorDescriptor(desc.value, reduce_type, data_type, + cudnn.CUDNN_NOT_PROPAGATE_NAN, + indices, + cudnn.CUDNN_32BIT_INDICES) + return desc + + + cpdef bint is_tensor_core_available(dtype) except *: + return (cudnn_version() >= 7000 and + (dtype.char) == 'e' and + int(device.get_compute_capability()) == 70) + + + cdef class DropoutStates: + + cdef public: + # TODO(unno): Make these attributes private. This is for backward + # compatibility. + _memory.MemoryPointer _states + Descriptor _desc + + def __init__(self, handle, seed): + cdef size_t cudnn_handle + if handle is None: + cudnn_handle = get_handle() + else: + cudnn_handle = handle + state_size = cudnn.dropoutGetStatesSize(cudnn_handle) + self._states = _memory.alloc(state_size) + self._desc = create_dropout_descriptor( + cudnn_handle, 0., self._states.ptr, + state_size, seed) + + def set_dropout_ratio(self, dropout_ratio): + cudnn_handle = get_handle() + set_dropout_descriptor(self._desc, cudnn_handle, dropout_ratio) + + def forward(self, handle, _ndarray_base x, dropout_ratio): + cdef _ndarray_base y, reserve_space + cdef size_t cudnn_handle + # This is for backward compatibility. + if handle is None: + cudnn_handle = get_handle() + else: + cudnn_handle = handle + set_dropout_descriptor(self._desc, cudnn_handle, dropout_ratio) + + x = core._internal_ascontiguousarray(x) + y = _core.ndarray(x._shape, x.dtype) + + x_desc = cudnn.createTensorDescriptor() + try: + _create_tensor_descriptor_as4darray(x_desc, x) + reserve_size = cudnn.getDropoutReserveSpaceSize(x_desc) + reserve_space = _core.ndarray((reserve_size,), 'b') + + cudnn.dropoutForward(cudnn_handle, self._desc.value, + x_desc, x.data.ptr, x_desc, y.data.ptr, + reserve_space.data.ptr, reserve_size) + finally: + cudnn.destroyTensorDescriptor(x_desc) + return reserve_space, y + + def backward(self, handle, _ndarray_base dy, dropout_ratio, + _ndarray_base reserve_space): + cdef _ndarray_base dx + cdef size_t cudnn_handle + # This is for backward compatibility. + if handle is None: + cudnn_handle = get_handle() + else: + cudnn_handle = handle + set_dropout_descriptor(self._desc, cudnn_handle, dropout_ratio) + + dy = core._internal_ascontiguousarray(dy) + dx = _core.ndarray(dy._shape, dy.dtype) + + dy_desc = cudnn.createTensorDescriptor() + try: + _create_tensor_descriptor_as4darray(dy_desc, dy) + cudnn.dropoutBackward(cudnn_handle, self._desc.value, + dy_desc, dy.data.ptr, + dy_desc, dx.data.ptr, + reserve_space.data.ptr, + reserve_space.size) + finally: + cudnn.destroyTensorDescriptor(dy_desc) + return dx + + + cdef class _Algorithm: + cdef: + int algo + int mathType + size_t memory + + def __cinit__(self, int algo, size_t memory, int mathType=0): + self.algo = algo + self.memory = memory + self.mathType = mathType + + + cdef dict _get_algorithm_fwd_cache = {} + cdef dict _get_algorithm_bwd_filter_cache = {} + cdef dict _get_algorithm_bwd_data_cache = {} + cdef dict _algorithm_fwd_cache = {} + cdef dict _algorithm_bwd_filter_cache = {} + cdef dict _algorithm_bwd_data_cache = {} + + + cpdef _warn_algorithm_fwd( + _ndarray_base x, _ndarray_base W, _ndarray_base y, tuple conv_param): + _warnings.warn( + 'Tensor Core mode is set but the selected convolution forward ' + 'algorithm is not a Tensor Core enabled algorithm. ' + 'This might be due to lack of workspace memory. ' + 'x.shape:{}, W.shape:{}, y.shape:{}, pad:{}, stride:{}' + .format(x.shape, W.shape, y.shape, conv_param[0], conv_param[1]), + _util.PerformanceWarning) + + + cpdef _Algorithm _find_algorithm_fwd( + _ndarray_base x, _ndarray_base W, _ndarray_base y, tuple conv_param, + size_t handle, size_t x_desc, size_t filter_desc, size_t conv_desc, + size_t y_desc, size_t max_workspace_size, bint use_tensor_core): + cdef cudnn.CuDNNAlgoPerf perf + key = (x.data.device.id, x.shape, W.shape, y.shape, conv_param, + max_workspace_size) + algo = _algorithm_fwd_cache.get(key, None) + if algo is not None: + return algo + workspace = _memory.alloc(max_workspace_size) + IF CUPY_HIP_VERSION == 0: + if cudnn_version() >= 7000: + perf = cudnn.findConvolutionForwardAlgorithmEx_v7( + handle, x_desc, x.data.ptr, filter_desc, W.data.ptr, conv_desc, + y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] + if use_tensor_core and perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH: + _warn_algorithm_fwd(x, W, y, conv_param) + else: + perf = cudnn.findConvolutionForwardAlgorithmEx( + handle, x_desc, x.data.ptr, filter_desc, W.data.ptr, conv_desc, + y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] + ELSE: + perf = cudnn.findConvolutionForwardAlgorithmEx( + handle, x_desc, x.data.ptr, filter_desc, W.data.ptr, conv_desc, + y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] + if perf.status != cudnn.CUDNN_STATUS_SUCCESS: + raise RuntimeError('No available algorithm found.') algo = _Algorithm(perf.algo, perf.memory, perf.mathType) - else: - algo_no = cudnn.getConvolutionForwardAlgorithm_v6( - handle, x_desc, filter_desc, conv_desc, y_desc, - cudnn.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - max_workspace_size) - workspace_size = cudnn.getConvolutionForwardWorkspaceSize( - handle, x_desc, filter_desc, conv_desc, y_desc, algo_no) - algo = _Algorithm(algo_no, workspace_size) - _get_algorithm_fwd_cache[key] = algo - return algo - - -cpdef _warn_algorithm_bwd_filter( - _ndarray_base x, _ndarray_base dy, _ndarray_base dW, tuple conv_param): - _warnings.warn( - 'Tensor Core mode is set but the selected convolution backward ' - 'filter algorithm is not a Tensor Core enabled algorithm. ' - 'This might be due to lack of workspace memory. ' - 'x.shape:{}, dy.shape:{}, dW.shape:{}, pad:{}, stride:{}' - .format(x.shape, dy.shape, dW.shape, conv_param[0], conv_param[1]), - _util.PerformanceWarning) - - -cpdef _Algorithm _find_algorithm_bwd_filter( - _ndarray_base x, _ndarray_base dy, _ndarray_base dW, tuple conv_param, - size_t handle, size_t x_desc, size_t dy_desc, size_t conv_desc, - size_t filter_desc, size_t max_workspace_size, bint use_tensor_core, - bint deterministic): - cdef cudnn.CuDNNAlgoPerf perf - cdef _Algorithm algo - key = (x.data.device.id, x.shape, dW.shape, dy.shape, conv_param, - max_workspace_size) - algo = _algorithm_bwd_filter_cache.get(key, None) - if algo is not None: + _algorithm_fwd_cache[key] = algo return algo - workspace = _memory.alloc(max_workspace_size) - if cudnn_version() >= 7000: - if deterministic: - ret = cudnn.findConvolutionBackwardFilterAlgorithmEx_v7( - handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc, - filter_desc, dW.data.ptr, 10, workspace.ptr, + + + cpdef _Algorithm _get_algorithm_fwd( + _ndarray_base x, _ndarray_base W, _ndarray_base y, tuple conv_param, + size_t handle, size_t x_desc, size_t filter_desc, size_t conv_desc, + size_t y_desc, size_t max_workspace_size, bint use_tensor_core): + cdef cudnn.CuDNNAlgoPerf perf + key = (x.data.device.id, x.shape, W.shape, y.shape, conv_param, + max_workspace_size) + algo = _get_algorithm_fwd_cache.get(key, None) + if algo is not None: + return algo + cdef list ret + cdef bint skip + cdef int cudnn_ver = cudnn_version() + IF CUPY_HIP_VERSION == 0: + if (use_tensor_core and cudnn_ver >= 7000) or cudnn_ver >= 8000: + ret = cudnn.getConvolutionForwardAlgorithm_v7( + handle, x_desc, filter_desc, conv_desc, y_desc, 10) + skip = False + for perf in ret: + if perf.memory <= max_workspace_size: + break + skip = True + else: + raise RuntimeError('No conv fwd algo available with workspace size' + ' less equal {}'.format(max_workspace_size)) + if skip: + _warnings.warn( + 'The best algo of conv fwd might not be selected due to ' + 'lack of workspace size ({})'.format(max_workspace_size), + _util.PerformanceWarning) + algo = perf.algo + workspace_size = perf.memory + math_type = perf.mathType + if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH: + _warn_algorithm_fwd(x, W, y, conv_param) + algo = _Algorithm(perf.algo, perf.memory, perf.mathType) + else: + algo_no = cudnn.getConvolutionForwardAlgorithm_v6( + handle, x_desc, filter_desc, conv_desc, y_desc, + cudnn.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + max_workspace_size) + workspace_size = cudnn.getConvolutionForwardWorkspaceSize( + handle, x_desc, filter_desc, conv_desc, y_desc, algo_no) + algo = _Algorithm(algo_no, workspace_size) + ELSE: + algo_no = cudnn.getConvolutionForwardAlgorithm_v6( + handle, x_desc, filter_desc, conv_desc, y_desc, + cudnn.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, max_workspace_size) - for perf in ret: - if perf.determinism: - break + workspace_size = cudnn.getConvolutionForwardWorkspaceSize( + handle, x_desc, filter_desc, conv_desc, y_desc, algo_no) + algo = _Algorithm(algo_no, workspace_size) + _get_algorithm_fwd_cache[key] = algo + return algo + + + cpdef _warn_algorithm_bwd_filter( + _ndarray_base x, _ndarray_base dy, _ndarray_base dW, tuple conv_param): + _warnings.warn( + 'Tensor Core mode is set but the selected convolution backward ' + 'filter algorithm is not a Tensor Core enabled algorithm. ' + 'This might be due to lack of workspace memory. ' + 'x.shape:{}, dy.shape:{}, dW.shape:{}, pad:{}, stride:{}' + .format(x.shape, dy.shape, dW.shape, conv_param[0], conv_param[1]), + _util.PerformanceWarning) + + + cpdef _Algorithm _find_algorithm_bwd_filter( + _ndarray_base x, _ndarray_base dy, _ndarray_base dW, tuple conv_param, + size_t handle, size_t x_desc, size_t dy_desc, size_t conv_desc, + size_t filter_desc, size_t max_workspace_size, bint use_tensor_core, + bint deterministic): + cdef cudnn.CuDNNAlgoPerf perf + cdef _Algorithm algo + key = (x.data.device.id, x.shape, dW.shape, dy.shape, conv_param, + max_workspace_size) + algo = _algorithm_bwd_filter_cache.get(key, None) + if algo is not None: + return algo + workspace = _memory.alloc(max_workspace_size) + IF CUPY_HIP_VERSION == 0: + if cudnn_version() >= 7000: + if deterministic: + ret = cudnn.findConvolutionBackwardFilterAlgorithmEx_v7( + handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc, + filter_desc, dW.data.ptr, 10, workspace.ptr, + max_workspace_size) + for perf in ret: + if perf.determinism: + break + else: + raise RuntimeError( + 'No conv bwd filter algo available with workspace size ' + 'less equal {}'.format(max_workspace_size)) + else: + perf = cudnn.findConvolutionBackwardFilterAlgorithmEx_v7( + handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc, + filter_desc, dW.data.ptr, 1, workspace.ptr, + max_workspace_size)[0] + if use_tensor_core and perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH: + _warn_algorithm_bwd_filter(x, dy, dW, conv_param) else: - raise RuntimeError( - 'No conv bwd filter algo available with workspace size ' - 'less equal {}'.format(max_workspace_size)) - else: - perf = cudnn.findConvolutionBackwardFilterAlgorithmEx_v7( + perf = cudnn.findConvolutionBackwardFilterAlgorithmEx( + handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc, + filter_desc, dW.data.ptr, 1, workspace.ptr, max_workspace_size)[0] + ELSE: + perf = cudnn.findConvolutionBackwardFilterAlgorithmEx( handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc, - filter_desc, dW.data.ptr, 1, workspace.ptr, - max_workspace_size)[0] - if use_tensor_core and perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH: - _warn_algorithm_bwd_filter(x, dy, dW, conv_param) - else: - perf = cudnn.findConvolutionBackwardFilterAlgorithmEx( - handle, x_desc, x.data.ptr, dy_desc, dy.data.ptr, conv_desc, - filter_desc, dW.data.ptr, 1, workspace.ptr, max_workspace_size)[0] - if perf.status != cudnn.CUDNN_STATUS_SUCCESS: - raise RuntimeError('No available algorithm found.') - algo = _Algorithm(perf.algo, perf.memory, perf.mathType) - _algorithm_bwd_filter_cache[key] = algo - return algo - - -cpdef _Algorithm _get_algorithm_bwd_filter( - _ndarray_base x, _ndarray_base dy, _ndarray_base dW, tuple conv_param, - size_t handle, size_t x_desc, size_t gy_desc, size_t conv_desc, - size_t filter_desc, size_t max_workspace_size, bint use_tensor_core, - bint deterministic): - cdef cudnn.CuDNNAlgoPerf perf - key = (x.data.device.id, x.shape, dW.shape, dy.shape, conv_param, - max_workspace_size) - algo = _get_algorithm_bwd_filter_cache.get(key, None) - if algo is not None: - return algo - cdef list ret - cdef bint skip - if cudnn_version() >= 7000: - ret = cudnn.getConvolutionBackwardFilterAlgorithm_v7( - handle, x_desc, gy_desc, conv_desc, filter_desc, 10) - skip = False - for perf in ret: - if deterministic and not perf.determinism: - continue - if perf.memory <= max_workspace_size: - break - skip = True - else: - raise RuntimeError( - 'No conv bwd filter algo available with workspace size less ' - 'equal {}'.format(max_workspace_size)) - if use_tensor_core and skip: - _warnings.warn( - 'The best algo of conv bwd filter might not not selected due ' - 'to lack of workspace size ({})'.format(max_workspace_size), - _util.PerformanceWarning) - algo = perf.algo - workspace_size = perf.memory - math_type = perf.mathType - if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH: - _warn_algorithm_bwd_filter(x, dy, dW, conv_param) + filter_desc, dW.data.ptr, 1, workspace.ptr, max_workspace_size)[0] + if perf.status != cudnn.CUDNN_STATUS_SUCCESS: + raise RuntimeError('No available algorithm found.') algo = _Algorithm(perf.algo, perf.memory, perf.mathType) - else: - algo_no = cudnn.getConvolutionBackwardFilterAlgorithm_v6( - handle, x_desc, gy_desc, conv_desc, filter_desc, - cudnn.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, - max_workspace_size) - workspace_size = cudnn.getConvolutionBackwardFilterWorkspaceSize( - handle, x_desc, gy_desc, conv_desc, filter_desc, algo_no) - algo = _Algorithm(algo_no, workspace_size) - _get_algorithm_bwd_filter_cache[key] = algo - return algo - - -cpdef _warn_algorithm_bwd_data( - _ndarray_base W, _ndarray_base x, _ndarray_base y, tuple conv_param): - _warnings.warn( - 'Tensor Core mode is set but the selected convolution backward ' - 'data algorithm is not a Tensor Core enabled algorithm. ' - 'This might be due to lack of workspace memory. ' - 'W.shape:{}, x.shape:{}, y.shape:{}, pad:{}, stride:{}' - .format(W.shape, x.shape, y.shape, conv_param[0], conv_param[1]), - _util.PerformanceWarning) - - -cpdef _Algorithm _find_algorithm_bwd_data( - _ndarray_base W, _ndarray_base x, _ndarray_base y, tuple conv_param, - size_t handle, size_t filter_desc, size_t x_desc, size_t conv_desc, - size_t y_desc, size_t max_workspace_size, bint use_tensor_core, - bint deterministic): - cdef _Algorithm algo - cdef cudnn.CuDNNAlgoPerf perf - key = (x.data.device.id, W.shape, x.shape, y.shape, conv_param, - max_workspace_size) - algo = _algorithm_bwd_data_cache.get(key, None) - if algo is not None: + _algorithm_bwd_filter_cache[key] = algo return algo - workspace = _memory.alloc(max_workspace_size) - if cudnn_version() >= 7000: - if deterministic: - ret = cudnn.findConvolutionBackwardDataAlgorithmEx_v7( - handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc, - y_desc, y.data.ptr, 10, workspace.ptr, max_workspace_size) - for perf in ret: - if perf.determinism: - break + + + cpdef _Algorithm _get_algorithm_bwd_filter( + _ndarray_base x, _ndarray_base dy, _ndarray_base dW, tuple conv_param, + size_t handle, size_t x_desc, size_t gy_desc, size_t conv_desc, + size_t filter_desc, size_t max_workspace_size, bint use_tensor_core, + bint deterministic): + cdef cudnn.CuDNNAlgoPerf perf + key = (x.data.device.id, x.shape, dW.shape, dy.shape, conv_param, + max_workspace_size) + algo = _get_algorithm_bwd_filter_cache.get(key, None) + if algo is not None: + return algo + cdef list ret + cdef bint skip + IF CUPY_HIP_VERSION == 0: + if cudnn_version() >= 7000: + ret = cudnn.getConvolutionBackwardFilterAlgorithm_v7( + handle, x_desc, gy_desc, conv_desc, filter_desc, 10) + skip = False + for perf in ret: + if deterministic and not perf.determinism: + continue + if perf.memory <= max_workspace_size: + break + skip = True + else: + raise RuntimeError( + 'No conv bwd filter algo available with workspace size less ' + 'equal {}'.format(max_workspace_size)) + if use_tensor_core and skip: + _warnings.warn( + 'The best algo of conv bwd filter might not not selected due ' + 'to lack of workspace size ({})'.format(max_workspace_size), + _util.PerformanceWarning) + algo = perf.algo + workspace_size = perf.memory + math_type = perf.mathType + if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH: + _warn_algorithm_bwd_filter(x, dy, dW, conv_param) + algo = _Algorithm(perf.algo, perf.memory, perf.mathType) else: - raise RuntimeError( - 'No conv bwd filter algo available with workspace size ' - 'less equal {}'.format(max_workspace_size)) - else: - perf = cudnn.findConvolutionBackwardDataAlgorithmEx_v7( + algo_no = cudnn.getConvolutionBackwardFilterAlgorithm_v6( + handle, x_desc, gy_desc, conv_desc, filter_desc, + cudnn.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + max_workspace_size) + workspace_size = cudnn.getConvolutionBackwardFilterWorkspaceSize( + handle, x_desc, gy_desc, conv_desc, filter_desc, algo_no) + algo = _Algorithm(algo_no, workspace_size) + ELSE: + algo_no = cudnn.getConvolutionBackwardFilterAlgorithm_v6( + handle, x_desc, gy_desc, conv_desc, filter_desc, + cudnn.CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, + max_workspace_size) + workspace_size = cudnn.getConvolutionBackwardFilterWorkspaceSize( + handle, x_desc, gy_desc, conv_desc, filter_desc, algo_no) + algo = _Algorithm(algo_no, workspace_size) + _get_algorithm_bwd_filter_cache[key] = algo + return algo + + + cpdef _warn_algorithm_bwd_data( + _ndarray_base W, _ndarray_base x, _ndarray_base y, tuple conv_param): + _warnings.warn( + 'Tensor Core mode is set but the selected convolution backward ' + 'data algorithm is not a Tensor Core enabled algorithm. ' + 'This might be due to lack of workspace memory. ' + 'W.shape:{}, x.shape:{}, y.shape:{}, pad:{}, stride:{}' + .format(W.shape, x.shape, y.shape, conv_param[0], conv_param[1]), + _util.PerformanceWarning) + + + cpdef _Algorithm _find_algorithm_bwd_data( + _ndarray_base W, _ndarray_base x, _ndarray_base y, tuple conv_param, + size_t handle, size_t filter_desc, size_t x_desc, size_t conv_desc, + size_t y_desc, size_t max_workspace_size, bint use_tensor_core, + bint deterministic): + cdef _Algorithm algo + cdef cudnn.CuDNNAlgoPerf perf + key = (x.data.device.id, W.shape, x.shape, y.shape, conv_param, + max_workspace_size) + algo = _algorithm_bwd_data_cache.get(key, None) + if algo is not None: + return algo + workspace = _memory.alloc(max_workspace_size) + IF CUPY_HIP_VERSION == 0: + if cudnn_version() >= 7000: + if deterministic: + ret = cudnn.findConvolutionBackwardDataAlgorithmEx_v7( + handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc, + y_desc, y.data.ptr, 10, workspace.ptr, max_workspace_size) + for perf in ret: + if perf.determinism: + break + else: + raise RuntimeError( + 'No conv bwd filter algo available with workspace size ' + 'less equal {}'.format(max_workspace_size)) + else: + perf = cudnn.findConvolutionBackwardDataAlgorithmEx_v7( + handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc, + y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] + if use_tensor_core and perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH: + _warn_algorithm_bwd_data(W, x, y, conv_param) + else: + perf = cudnn.findConvolutionBackwardDataAlgorithmEx( + handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc, + y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] + ELSE: + perf = cudnn.findConvolutionBackwardDataAlgorithmEx( handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc, y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] - if use_tensor_core and perf.mathType != cudnn.CUDNN_TENSOR_OP_MATH: - _warn_algorithm_bwd_data(W, x, y, conv_param) - else: - perf = cudnn.findConvolutionBackwardDataAlgorithmEx( - handle, filter_desc, W.data.ptr, x_desc, x.data.ptr, conv_desc, - y_desc, y.data.ptr, 1, workspace.ptr, max_workspace_size)[0] - if perf.status != cudnn.CUDNN_STATUS_SUCCESS: - raise RuntimeError('No available algorithm found.') - algo = _Algorithm(perf.algo, perf.memory, perf.mathType) - _algorithm_bwd_data_cache[key] = algo - return algo - - -cpdef _Algorithm _get_algorithm_bwd_data( - _ndarray_base W, _ndarray_base x, _ndarray_base y, tuple conv_param, - size_t handle, size_t filter_desc, size_t x_desc, size_t conv_desc, - size_t y_desc, size_t max_workspace_size, bint use_tensor_core, - bint deterministic): - cdef cudnn.CuDNNAlgoPerf perf - key = (x.data.device.id, W.shape, x.shape, y.shape, conv_param, - max_workspace_size) - algo = _get_algorithm_bwd_data_cache.get(key, None) - if algo is not None: + if perf.status != cudnn.CUDNN_STATUS_SUCCESS: + raise RuntimeError('No available algorithm found.') + algo = _Algorithm(perf.algo, perf.memory, perf.mathType) + _algorithm_bwd_data_cache[key] = algo return algo - cdef list ret - cdef bint skip - if cudnn_version() >= 7000: - ret = cudnn.getConvolutionBackwardDataAlgorithm_v7( - handle, filter_desc, x_desc, conv_desc, y_desc, 10) - skip = False - for perf in ret: - if deterministic and not perf.determinism: - continue - if perf.memory <= max_workspace_size: - break - skip = True + + + cpdef _Algorithm _get_algorithm_bwd_data( + _ndarray_base W, _ndarray_base x, _ndarray_base y, tuple conv_param, + size_t handle, size_t filter_desc, size_t x_desc, size_t conv_desc, + size_t y_desc, size_t max_workspace_size, bint use_tensor_core, + bint deterministic): + cdef cudnn.CuDNNAlgoPerf perf + key = (x.data.device.id, W.shape, x.shape, y.shape, conv_param, + max_workspace_size) + algo = _get_algorithm_bwd_data_cache.get(key, None) + if algo is not None: + return algo + cdef list ret + cdef bint skip + IF CUPY_HIP_VERSION == 0: + if cudnn_version() >= 7000: + ret = cudnn.getConvolutionBackwardDataAlgorithm_v7( + handle, filter_desc, x_desc, conv_desc, y_desc, 10) + skip = False + for perf in ret: + if deterministic and not perf.determinism: + continue + if perf.memory <= max_workspace_size: + break + skip = True + else: + raise RuntimeError( + 'No conv bwd data algo available with workspace size less ' + 'equal {}'.format(max_workspace_size)) + if use_tensor_core and skip: + _warnings.warn( + 'The best algo of conv bwd data might not not selected due ' + 'to lack of workspace size ({})'.format(max_workspace_size), + _util.PerformanceWarning) + algo = perf.algo + workspace_size = perf.memory + math_type = perf.mathType + if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH: + _warn_algorithm_bwd_data(W, x, y, conv_param) + algo = _Algorithm(perf.algo, perf.memory, perf.mathType) + else: + algo_no = cudnn.getConvolutionBackwardDataAlgorithm_v6( + handle, filter_desc, x_desc, conv_desc, y_desc, + cudnn.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + max_workspace_size) + workspace_size = cudnn.getConvolutionBackwardDataWorkspaceSize( + handle, filter_desc, x_desc, conv_desc, y_desc, algo_no) + algo = _Algorithm(algo_no, workspace_size) + ELSE: + algo_no = cudnn.getConvolutionBackwardDataAlgorithm_v6( + handle, filter_desc, x_desc, conv_desc, y_desc, + cudnn.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, + max_workspace_size) + workspace_size = cudnn.getConvolutionBackwardDataWorkspaceSize( + handle, filter_desc, x_desc, conv_desc, y_desc, algo_no) + algo = _Algorithm(algo_no, workspace_size) + _get_algorithm_bwd_data_cache[key] = algo + return algo + + + cpdef bint _should_use_tensor_core( + tensor_core_mode, object dtype) except *: + if tensor_core_mode == 'auto': + return is_tensor_core_available(dtype) + elif tensor_core_mode == 'always': + # TODO(oktua): more strict condition + return is_tensor_core_available(dtype) + elif tensor_core_mode == 'never': + return False else: - raise RuntimeError( - 'No conv bwd data algo available with workspace size less ' - 'equal {}'.format(max_workspace_size)) - if use_tensor_core and skip: - _warnings.warn( - 'The best algo of conv bwd data might not not selected due ' - 'to lack of workspace size ({})'.format(max_workspace_size), - _util.PerformanceWarning) - algo = perf.algo - workspace_size = perf.memory - math_type = perf.mathType - if use_tensor_core and math_type != cudnn.CUDNN_TENSOR_OP_MATH: - _warn_algorithm_bwd_data(W, x, y, conv_param) - algo = _Algorithm(perf.algo, perf.memory, perf.mathType) - else: - algo_no = cudnn.getConvolutionBackwardDataAlgorithm_v6( - handle, filter_desc, x_desc, conv_desc, y_desc, - cudnn.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, - max_workspace_size) - workspace_size = cudnn.getConvolutionBackwardDataWorkspaceSize( - handle, filter_desc, x_desc, conv_desc, y_desc, algo_no) - algo = _Algorithm(algo_no, workspace_size) - _get_algorithm_bwd_data_cache[key] = algo - return algo - - -cpdef bint _should_use_tensor_core( - tensor_core_mode, object dtype) except *: - if tensor_core_mode == 'auto': - return is_tensor_core_available(dtype) - elif tensor_core_mode == 'always': - # TODO(oktua): more strict condition - return is_tensor_core_available(dtype) - elif tensor_core_mode == 'never': - return False - else: - raise ValueError( - 'tensor_code_mode must be either of "always", "auto", or "never".') - - -def _get_array_info(_ndarray_base arr): - if arr is None: - return 'None' - return 'shape={!r}, dtype={}, strides={!r}'.format( - arr.shape, arr.dtype.name, arr.strides) - - -def convolution_forward( - _ndarray_base x, _ndarray_base W, _ndarray_base b, _ndarray_base y, - tuple pad, tuple stride, tuple dilation, int groups, *, - bint auto_tune, tensor_core, - int d_layout=cudnn.CUDNN_TENSOR_NCHW, - int w_layout=cudnn.CUDNN_TENSOR_NCHW): - cdef int dev_id = x.data.device.id - assert dev_id == W.data.device.id - assert dev_id == y.data.device.id - - cdef float float_zero = 0, float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero, one - if x.dtype == 'd': - zero = &double_zero - one = &double_one - else: - zero = &float_zero - one = &float_one - - cdef bint use_tensor_core = _should_use_tensor_core(tensor_core, x.dtype) - cdef tuple conv_param = (pad, stride, x.dtype, use_tensor_core) - - # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but - # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. - if use_tensor_core: - for i in dilation: - if i > 1: - use_tensor_core = False - break - - handle = get_handle() - x = core._internal_ascontiguousarray(x) - W = core._internal_ascontiguousarray(W) - - # TODO(okuta) check performance - cdef size_t x_desc = cudnn.createTensorDescriptor() - cdef size_t y_desc = cudnn.createTensorDescriptor() - cdef size_t b_desc = cudnn.createTensorDescriptor() - cdef size_t filter_desc = cudnn.createFilterDescriptor() - cdef size_t conv_desc = cudnn.createConvolutionDescriptor() - - cdef size_t max_workspace_size = get_max_workspace_size() - cdef shape_t b_shape - cdef _Algorithm perf - try: - _create_tensor_descriptor(x_desc, x, format=d_layout) - _create_tensor_descriptor(y_desc, y, format=d_layout) - _create_filter_descriptor(filter_desc, W, w_layout) - _create_convolution_descriptor( - conv_desc, pad, stride, dilation, groups, x.dtype, - cudnn.CUDNN_CROSS_CORRELATION, use_tensor_core) - - if auto_tune: - perf = _find_algorithm_fwd( - x, W, y, conv_param, handle, x_desc, filter_desc, - conv_desc, y_desc, max_workspace_size, use_tensor_core) + raise ValueError( + 'tensor_code_mode must be either of "always", "auto", or "never".') + + + def _get_array_info(_ndarray_base arr): + if arr is None: + return 'None' + return 'shape={!r}, dtype={}, strides={!r}'.format( + arr.shape, arr.dtype.name, arr.strides) + + + def convolution_forward( + _ndarray_base x, _ndarray_base W, _ndarray_base b, _ndarray_base y, + tuple pad, tuple stride, tuple dilation, int groups, *, + bint auto_tune, tensor_core, + int d_layout=cudnn.CUDNN_TENSOR_NCHW, + int w_layout=cudnn.CUDNN_TENSOR_NCHW): + cdef int dev_id = x.data.device.id + assert dev_id == W.data.device.id + assert dev_id == y.data.device.id + + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + if x.dtype == 'd': + zero = &double_zero + one = &double_one else: - perf = _get_algorithm_fwd( - x, W, y, conv_param, handle, x_desc, filter_desc, - conv_desc, y_desc, max_workspace_size, use_tensor_core) - - if cudnn_version() >= 7000: - cudnn.setConvolutionMathType(conv_desc, perf.mathType) - - workspace = _memory.alloc(perf.memory) - + zero = &float_zero + one = &float_one + + cdef bint use_tensor_core = _should_use_tensor_core(tensor_core, x.dtype) + cdef tuple conv_param = (pad, stride, x.dtype, use_tensor_core) + + # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but + # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. + if use_tensor_core: + for i in dilation: + if i > 1: + use_tensor_core = False + break + + handle = get_handle() + x = core._internal_ascontiguousarray(x) + W = core._internal_ascontiguousarray(W) + + # TODO(okuta) check performance + cdef size_t x_desc = cudnn.createTensorDescriptor() + cdef size_t y_desc = cudnn.createTensorDescriptor() + cdef size_t b_desc = cudnn.createTensorDescriptor() + cdef size_t filter_desc = cudnn.createFilterDescriptor() + cdef size_t conv_desc = cudnn.createConvolutionDescriptor() + + cdef size_t max_workspace_size = get_max_workspace_size() + cdef shape_t b_shape + cdef _Algorithm perf try: - cudnn.convolutionForward( - handle, one, x_desc, x.data.ptr, filter_desc, W.data.ptr, - conv_desc, perf.algo, workspace.ptr, perf.memory, zero, y_desc, - y.data.ptr) - except _py_cudnn.CuDNNError as e: - infos = [ - 'func: cudnnConvolutionForward', - 'x: {}'.format(_get_array_info(x)), - 'W: {}'.format(_get_array_info(W)), - 'b: {}'.format(_get_array_info(b)), - 'y: {}'.format(_get_array_info(y)), - 'pad={!r}, stride={!r}, dilation={!r}, groups={!r}'.format( - pad, stride, dilation, groups), - 'auto_tune={!r}, tensor_core={!r}'.format( - auto_tune, tensor_core), - 'd_layout={!r}, w_layout={!r}'.format(d_layout, w_layout), - ] - e.add_infos(infos) - raise - - del workspace, x, W - - if b is not None: - assert dev_id == b.data.device.id - b_shape.assign(y._shape.size(), 1) - b_shape[1] = -1 - b = _manipulation._reshape( - core._internal_ascontiguousarray(b), b_shape) - _create_tensor_nd_descriptor(b_desc, b, -1) - cudnn.addTensor_v3(handle, one, b_desc, - b.data.ptr, one, y_desc, y.data.ptr) - finally: - cudnn.destroyTensorDescriptor(x_desc) - cudnn.destroyTensorDescriptor(y_desc) - cudnn.destroyTensorDescriptor(b_desc) - cudnn.destroyFilterDescriptor(filter_desc) - cudnn.destroyConvolutionDescriptor(conv_desc) - - -def convolution_backward_filter( - _ndarray_base x, _ndarray_base gy, _ndarray_base gW, - tuple pad, tuple stride, tuple dilation, int groups, *, - bint deterministic, bint auto_tune, tensor_core, - int d_layout=cudnn.CUDNN_TENSOR_NCHW, - int w_layout=cudnn.CUDNN_TENSOR_NCHW): - cdef int dev_id = x.data.device.id - assert dev_id == gy.data.device.id - assert dev_id == gW.data.device.id - - cdef float float_zero = 0, float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero, one - if x.dtype == 'd': - zero = &double_zero - one = &double_one - else: - zero = &float_zero - one = &float_one - - # Disable use_tensor_core in deterministic mode because - # CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 does not use Tensor Core. - cdef bint use_tensor_core = ( - not deterministic and _should_use_tensor_core(tensor_core, x.dtype)) - cdef tuple conv_param = ( - pad, stride, x.dtype, use_tensor_core, deterministic) - - handle = get_handle() - x = core._internal_ascontiguousarray(x) - gy = core._internal_ascontiguousarray(gy) - - # TODO(okuta) check performance - cdef size_t x_desc = cudnn.createTensorDescriptor() - cdef size_t gy_desc = cudnn.createTensorDescriptor() - cdef size_t filter_desc = cudnn.createFilterDescriptor() - cdef size_t conv_desc = cudnn.createConvolutionDescriptor() - - cdef _Algorithm perf - cdef int algo - cdef size_t max_workspace_size = get_max_workspace_size() - cdef size_t workspace_size = 0 - try: - _create_tensor_descriptor(x_desc, x, format=d_layout) - _create_tensor_descriptor(gy_desc, gy, format=d_layout) - _create_filter_descriptor(filter_desc, gW, w_layout) - _create_convolution_descriptor( - conv_desc, pad, stride, dilation, groups, x.dtype, - cudnn.CUDNN_CROSS_CORRELATION, use_tensor_core) - - if deterministic and cudnn_version() < 7000: - # TODO(imanishi): Support Tensor Core in deterministic mode. - algo = cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 - workspace_size = cudnn.getConvolutionBackwardFilterWorkspaceSize( - handle, x_desc, gy_desc, conv_desc, filter_desc, algo) - math_type = cudnn.CUDNN_DEFAULT_MATH - if workspace_size > max_workspace_size: - raise RuntimeError( - 'No conv bwd filter algo available with workspace size ' - 'less equal {}'.format(max_workspace_size)) + _create_tensor_descriptor(x_desc, x, format=d_layout) + _create_tensor_descriptor(y_desc, y, format=d_layout) + _create_filter_descriptor(filter_desc, W, w_layout) + _create_convolution_descriptor( + conv_desc, pad, stride, dilation, groups, x.dtype, + cudnn.CUDNN_CROSS_CORRELATION, use_tensor_core) + + if auto_tune: + perf = _find_algorithm_fwd( + x, W, y, conv_param, handle, x_desc, filter_desc, + conv_desc, y_desc, max_workspace_size, use_tensor_core) + else: + perf = _get_algorithm_fwd( + x, W, y, conv_param, handle, x_desc, filter_desc, + conv_desc, y_desc, max_workspace_size, use_tensor_core) + + if cudnn_version() >= 7000: + cudnn.setConvolutionMathType(conv_desc, perf.mathType) + + workspace = _memory.alloc(perf.memory) + + try: + cudnn.convolutionForward( + handle, one, x_desc, x.data.ptr, filter_desc, W.data.ptr, + conv_desc, perf.algo, workspace.ptr, perf.memory, zero, y_desc, + y.data.ptr) + except _py_cudnn.CuDNNError as e: + infos = [ + 'func: cudnnConvolutionForward', + 'x: {}'.format(_get_array_info(x)), + 'W: {}'.format(_get_array_info(W)), + 'b: {}'.format(_get_array_info(b)), + 'y: {}'.format(_get_array_info(y)), + 'pad={!r}, stride={!r}, dilation={!r}, groups={!r}'.format( + pad, stride, dilation, groups), + 'auto_tune={!r}, tensor_core={!r}'.format( + auto_tune, tensor_core), + 'd_layout={!r}, w_layout={!r}'.format(d_layout, w_layout), + ] + e.add_infos(infos) + raise + + del workspace, x, W + + if b is not None: + assert dev_id == b.data.device.id + b_shape.assign(y._shape.size(), 1) + b_shape[1] = -1 + b = _manipulation._reshape( + core._internal_ascontiguousarray(b), b_shape) + _create_tensor_nd_descriptor(b_desc, b, -1) + cudnn.addTensor_v3(handle, one, b_desc, + b.data.ptr, one, y_desc, y.data.ptr) + finally: + cudnn.destroyTensorDescriptor(x_desc) + cudnn.destroyTensorDescriptor(y_desc) + cudnn.destroyTensorDescriptor(b_desc) + cudnn.destroyFilterDescriptor(filter_desc) + cudnn.destroyConvolutionDescriptor(conv_desc) + + + def convolution_backward_filter( + _ndarray_base x, _ndarray_base gy, _ndarray_base gW, + tuple pad, tuple stride, tuple dilation, int groups, *, + bint deterministic, bint auto_tune, tensor_core, + int d_layout=cudnn.CUDNN_TENSOR_NCHW, + int w_layout=cudnn.CUDNN_TENSOR_NCHW): + cdef int dev_id = x.data.device.id + assert dev_id == gy.data.device.id + assert dev_id == gW.data.device.id + + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + if x.dtype == 'd': + zero = &double_zero + one = &double_one else: - if auto_tune and not deterministic: - perf = _find_algorithm_bwd_filter( - x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, - filter_desc, max_workspace_size, use_tensor_core, - deterministic) + zero = &float_zero + one = &float_one + + # Disable use_tensor_core in deterministic mode because + # CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 does not use Tensor Core. + cdef bint use_tensor_core = ( + not deterministic and _should_use_tensor_core(tensor_core, x.dtype)) + cdef tuple conv_param = ( + pad, stride, x.dtype, use_tensor_core, deterministic) + + handle = get_handle() + x = core._internal_ascontiguousarray(x) + gy = core._internal_ascontiguousarray(gy) + + # TODO(okuta) check performance + cdef size_t x_desc = cudnn.createTensorDescriptor() + cdef size_t gy_desc = cudnn.createTensorDescriptor() + cdef size_t filter_desc = cudnn.createFilterDescriptor() + cdef size_t conv_desc = cudnn.createConvolutionDescriptor() + + cdef _Algorithm perf + cdef int algo + cdef size_t max_workspace_size = get_max_workspace_size() + cdef size_t workspace_size = 0 + try: + _create_tensor_descriptor(x_desc, x, format=d_layout) + _create_tensor_descriptor(gy_desc, gy, format=d_layout) + _create_filter_descriptor(filter_desc, gW, w_layout) + _create_convolution_descriptor( + conv_desc, pad, stride, dilation, groups, x.dtype, + cudnn.CUDNN_CROSS_CORRELATION, use_tensor_core) + + if deterministic and cudnn_version() < 7000: + # TODO(imanishi): Support Tensor Core in deterministic mode. + algo = cudnn.CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 + workspace_size = cudnn.getConvolutionBackwardFilterWorkspaceSize( + handle, x_desc, gy_desc, conv_desc, filter_desc, algo) + math_type = cudnn.CUDNN_DEFAULT_MATH + if workspace_size > max_workspace_size: + raise RuntimeError( + 'No conv bwd filter algo available with workspace size ' + 'less equal {}'.format(max_workspace_size)) else: - perf = _get_algorithm_bwd_filter( - x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, - filter_desc, max_workspace_size, use_tensor_core, - deterministic) - algo = perf.algo - workspace_size = perf.memory - math_type = perf.mathType - - if cudnn_version() >= 7000: - cudnn.setConvolutionMathType(conv_desc, math_type) - - workspace = _memory.alloc(workspace_size) - - cudnn.convolutionBackwardFilter_v3( - handle, one, x_desc, x.data.ptr, gy_desc, - gy.data.ptr, conv_desc, algo, workspace.ptr, - workspace_size, zero, filter_desc, gW.data.ptr) - finally: - cudnn.destroyTensorDescriptor(x_desc) - cudnn.destroyTensorDescriptor(gy_desc) - cudnn.destroyFilterDescriptor(filter_desc) - cudnn.destroyConvolutionDescriptor(conv_desc) - - -def convolution_backward_data( - _ndarray_base W, _ndarray_base x, _ndarray_base b, _ndarray_base y, - tuple pad, tuple stride, tuple dilation, int groups, *, - bint deterministic, bint auto_tune, tensor_core, - int d_layout=cudnn.CUDNN_TENSOR_NCHW, - int w_layout=cudnn.CUDNN_TENSOR_NCHW): - cdef int dev_id = W.data.device.id - assert dev_id == x.data.device.id - assert dev_id == y.data.device.id - - cdef float float_zero = 0, float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero, one - if x.dtype == 'd': - zero = &double_zero - one = &double_one - else: - zero = &float_zero - one = &float_one - - # Disable use_tensor_core in deterministic mode because - # CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 does not use Tensor Core. - cdef bint use_tensor_core = ( - not deterministic and _should_use_tensor_core(tensor_core, x.dtype)) - cdef tuple conv_param = ( - pad, stride, x.dtype, use_tensor_core, deterministic) - - # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but - # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. - if use_tensor_core: - for i in dilation: - if i > 1: - use_tensor_core = False - break - - handle = get_handle() - x = core._internal_ascontiguousarray(x) - W = core._internal_ascontiguousarray(W) - - # TODO(okuta) check performance - cdef size_t x_desc = cudnn.createTensorDescriptor() - cdef size_t y_desc = cudnn.createTensorDescriptor() - cdef size_t b_desc = cudnn.createTensorDescriptor() - cdef size_t filter_desc = cudnn.createFilterDescriptor() - cdef size_t conv_desc = cudnn.createConvolutionDescriptor() - - cdef _Algorithm perf - cdef int algo - cdef size_t max_workspace_size = get_max_workspace_size() - cdef size_t workspace_size = 0 - cdef shape_t b_shape - try: - _create_tensor_descriptor(x_desc, x, format=d_layout) - _create_tensor_descriptor(y_desc, y, format=d_layout) - _create_filter_descriptor(filter_desc, W, w_layout) - _create_convolution_descriptor( - conv_desc, pad, stride, dilation, groups, x.dtype, - cudnn.CUDNN_CROSS_CORRELATION, use_tensor_core) - - if deterministic and cudnn_version() < 7000: - # TODO(imanishi): Support Tensor Core in deterministic mode. - algo = cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 - workspace_size = cudnn.getConvolutionBackwardDataWorkspaceSize( - handle, filter_desc, x_desc, conv_desc, y_desc, algo) - math_type = cudnn.CUDNN_DEFAULT_MATH - if workspace_size > max_workspace_size: - raise RuntimeError( - 'No conv bwd data algo available with workspace size less ' - 'equal {}'.format(max_workspace_size)) + if auto_tune and not deterministic: + perf = _find_algorithm_bwd_filter( + x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, + filter_desc, max_workspace_size, use_tensor_core, + deterministic) + else: + perf = _get_algorithm_bwd_filter( + x, gy, gW, conv_param, handle, x_desc, gy_desc, conv_desc, + filter_desc, max_workspace_size, use_tensor_core, + deterministic) + algo = perf.algo + workspace_size = perf.memory + math_type = perf.mathType + + if cudnn_version() >= 7000: + cudnn.setConvolutionMathType(conv_desc, math_type) + + workspace = _memory.alloc(workspace_size) + + cudnn.convolutionBackwardFilter_v3( + handle, one, x_desc, x.data.ptr, gy_desc, + gy.data.ptr, conv_desc, algo, workspace.ptr, + workspace_size, zero, filter_desc, gW.data.ptr) + finally: + cudnn.destroyTensorDescriptor(x_desc) + cudnn.destroyTensorDescriptor(gy_desc) + cudnn.destroyFilterDescriptor(filter_desc) + cudnn.destroyConvolutionDescriptor(conv_desc) + + + def convolution_backward_data( + _ndarray_base W, _ndarray_base x, _ndarray_base b, _ndarray_base y, + tuple pad, tuple stride, tuple dilation, int groups, *, + bint deterministic, bint auto_tune, tensor_core, + int d_layout=cudnn.CUDNN_TENSOR_NCHW, + int w_layout=cudnn.CUDNN_TENSOR_NCHW): + cdef int dev_id = W.data.device.id + assert dev_id == x.data.device.id + assert dev_id == y.data.device.id + + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + if x.dtype == 'd': + zero = &double_zero + one = &double_one else: - if auto_tune and not deterministic: - perf = _find_algorithm_bwd_data( - W, x, y, conv_param, handle, filter_desc, x_desc, - conv_desc, y_desc, max_workspace_size, use_tensor_core, - deterministic) + zero = &float_zero + one = &float_one + + # Disable use_tensor_core in deterministic mode because + # CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 does not use Tensor Core. + cdef bint use_tensor_core = ( + not deterministic and _should_use_tensor_core(tensor_core, x.dtype)) + cdef tuple conv_param = ( + pad, stride, x.dtype, use_tensor_core, deterministic) + + # cuDNN 7 supports dilation only in *_FWD_ALGO_IMPLICIT_GEMM, but + # it supports Tensor Cores only in *_FWD_ALGO_IMPLICIT_PRECOMP_GEMM. + if use_tensor_core: + for i in dilation: + if i > 1: + use_tensor_core = False + break + + handle = get_handle() + x = core._internal_ascontiguousarray(x) + W = core._internal_ascontiguousarray(W) + + # TODO(okuta) check performance + cdef size_t x_desc = cudnn.createTensorDescriptor() + cdef size_t y_desc = cudnn.createTensorDescriptor() + cdef size_t b_desc = cudnn.createTensorDescriptor() + cdef size_t filter_desc = cudnn.createFilterDescriptor() + cdef size_t conv_desc = cudnn.createConvolutionDescriptor() + + cdef _Algorithm perf + cdef int algo + cdef size_t max_workspace_size = get_max_workspace_size() + cdef size_t workspace_size = 0 + cdef shape_t b_shape + try: + _create_tensor_descriptor(x_desc, x, format=d_layout) + _create_tensor_descriptor(y_desc, y, format=d_layout) + _create_filter_descriptor(filter_desc, W, w_layout) + _create_convolution_descriptor( + conv_desc, pad, stride, dilation, groups, x.dtype, + cudnn.CUDNN_CROSS_CORRELATION, use_tensor_core) + + if deterministic and cudnn_version() < 7000: + # TODO(imanishi): Support Tensor Core in deterministic mode. + algo = cudnn.CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 + workspace_size = cudnn.getConvolutionBackwardDataWorkspaceSize( + handle, filter_desc, x_desc, conv_desc, y_desc, algo) + math_type = cudnn.CUDNN_DEFAULT_MATH + if workspace_size > max_workspace_size: + raise RuntimeError( + 'No conv bwd data algo available with workspace size less ' + 'equal {}'.format(max_workspace_size)) else: - perf = _get_algorithm_bwd_data( - W, x, y, conv_param, handle, filter_desc, x_desc, - conv_desc, y_desc, max_workspace_size, use_tensor_core, - deterministic) - algo = perf.algo - workspace_size = perf.memory - math_type = perf.mathType - - if cudnn_version() >= 7000: - cudnn.setConvolutionMathType(conv_desc, math_type) - - workspace = _memory.alloc(workspace_size) - - cudnn.convolutionBackwardData_v3( - handle, one, filter_desc, W.data.ptr, x_desc, x.data.ptr, - conv_desc, algo, workspace.ptr, workspace_size, zero, y_desc, - y.data.ptr) - - del workspace, x, W - - if b is not None: - assert dev_id == b.data.device.id - b_shape.assign(y._shape.size(), 1) - b_shape[1] = -1 - b = _manipulation._reshape( - core._internal_ascontiguousarray(b), b_shape) - _create_tensor_nd_descriptor(b_desc, b, -1) - cudnn.addTensor_v3(handle, one, b_desc, b.data.ptr, one, y_desc, - y.data.ptr) - finally: - cudnn.destroyTensorDescriptor(x_desc) - cudnn.destroyTensorDescriptor(y_desc) - cudnn.destroyTensorDescriptor(b_desc) - cudnn.destroyFilterDescriptor(filter_desc) - cudnn.destroyConvolutionDescriptor(conv_desc) - - -def pooling_forward( - _ndarray_base x, _ndarray_base y, - tuple ksize, tuple stride, tuple pad, int mode): - cdef float float_zero = 0, float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero, one - if x.dtype == 'd': - zero = &double_zero - one = &double_one - else: - zero = &float_zero - one = &float_one - x = core._internal_ascontiguousarray(x) - if not y._c_contiguous: - raise ValueError('pooling_forward supports c-contiguous y only') - handle = get_handle() - x_desc = cudnn.createTensorDescriptor() - y_desc = cudnn.createTensorDescriptor() - pool_desc = cudnn.createPoolingDescriptor() - try: - _create_tensor_nd_descriptor(x_desc, x) - _create_tensor_nd_descriptor(y_desc, y) - _create_pooling_descriptor(pool_desc, ksize, stride, pad, mode) - cudnn.poolingForward( - handle, pool_desc, one, x_desc, - x.data.ptr, zero, y_desc, y.data.ptr) - finally: - cudnn.destroyTensorDescriptor(x_desc) - cudnn.destroyTensorDescriptor(y_desc) - cudnn.destroyPoolingDescriptor(pool_desc) - return y - - -def pooling_backward( - _ndarray_base x, _ndarray_base y, _ndarray_base gy, - tuple ksize, tuple stride, tuple pad, int mode): - cdef float float_zero = 0, float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero, one - cdef _ndarray_base gx - if x.dtype == 'd': - zero = &double_zero - one = &double_one - else: - zero = &float_zero - one = &float_one - - gx = _core.ndarray(x._shape, x.dtype) - x = core._internal_ascontiguousarray(x) - y = core._internal_ascontiguousarray(y) - gy = core._internal_ascontiguousarray(gy) - - handle = get_handle() - x_desc = cudnn.createTensorDescriptor() - y_desc = cudnn.createTensorDescriptor() - pool_desc = cudnn.createPoolingDescriptor() - try: - _create_tensor_nd_descriptor(x_desc, x) - _create_tensor_nd_descriptor(y_desc, y) - _create_pooling_descriptor(pool_desc, ksize, stride, pad, mode) - cudnn.poolingBackward( - handle, pool_desc, - one, y_desc, y.data.ptr, y_desc, gy.data.ptr, - x_desc, x.data.ptr, zero, x_desc, gx.data.ptr) - finally: - cudnn.destroyTensorDescriptor(x_desc) - cudnn.destroyTensorDescriptor(y_desc) - cudnn.destroyPoolingDescriptor(pool_desc) - return gx - - -cdef _create_tensor_descriptor_for_bn( - size_t desc, _ndarray_base arr, bint is_for_conv2d, - int format=cudnn.CUDNN_TENSOR_NCHW): - assert arr._c_contiguous - if is_for_conv2d: - _create_tensor_descriptor(desc, arr, format) - return - data_type = get_data_type(arr.dtype) - cdef Py_ssize_t dim1, dim2 - cdef int ndim = arr._shape.size() - dim2 = 1 - if ndim > 0: - dim2 = arr._shape[ndim - 1] - dim1 = arr.size // dim2 - cudnn.setTensor4dDescriptor(desc, cudnn.CUDNN_TENSOR_NCHW, data_type, - dim1, dim2, 1, 1) - - -cdef _get_dtype_of_tensor_descriptor(size_t desc): - cudnn_dtype, _, _, _, _, _, _, _, _ = cudnn.getTensor4dDescriptor(desc) - if cudnn_dtype == cudnn.CUDNN_DATA_DOUBLE: - return _numpy.dtype(_numpy.float64) - elif cudnn_dtype == cudnn.CUDNN_DATA_FLOAT: - return _numpy.dtype(_numpy.float32) - elif cudnn_dtype == cudnn.CUDNN_DATA_HALF: - return _numpy.dtype(_numpy.float16) - else: - raise RuntimeError('Unknown cudnn data type {} '.format(cudnn_dtype)) - - -def batch_normalization_forward_training( - _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, - _ndarray_base running_mean, _ndarray_base running_var, - mean, inv_std, double eps, double decay, - bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): - - reserve_space, y, save_mean, save_inv_std = ( - _batch_normalization_forward_training( - x, gamma, beta, - running_mean, running_var, - mean, inv_std, - eps, decay, - is_for_conv2d, - cudnn_mode, - debug, - d_layout)) - if reserve_space is not None: - _warnings.warn( - 'Could be faster by calling ' - 'batch_normalization_forward_training_ex() instead of ' - 'batch_normalization_forward_training().', - _util.PerformanceWarning) - if mean is None: - return y, save_mean, save_inv_std - else: + if auto_tune and not deterministic: + perf = _find_algorithm_bwd_data( + W, x, y, conv_param, handle, filter_desc, x_desc, + conv_desc, y_desc, max_workspace_size, use_tensor_core, + deterministic) + else: + perf = _get_algorithm_bwd_data( + W, x, y, conv_param, handle, filter_desc, x_desc, + conv_desc, y_desc, max_workspace_size, use_tensor_core, + deterministic) + algo = perf.algo + workspace_size = perf.memory + math_type = perf.mathType + + if cudnn_version() >= 7000: + cudnn.setConvolutionMathType(conv_desc, math_type) + + workspace = _memory.alloc(workspace_size) + + cudnn.convolutionBackwardData_v3( + handle, one, filter_desc, W.data.ptr, x_desc, x.data.ptr, + conv_desc, algo, workspace.ptr, workspace_size, zero, y_desc, + y.data.ptr) + + del workspace, x, W + + if b is not None: + assert dev_id == b.data.device.id + b_shape.assign(y._shape.size(), 1) + b_shape[1] = -1 + b = _manipulation._reshape( + core._internal_ascontiguousarray(b), b_shape) + _create_tensor_nd_descriptor(b_desc, b, -1) + cudnn.addTensor_v3(handle, one, b_desc, b.data.ptr, one, y_desc, + y.data.ptr) + finally: + cudnn.destroyTensorDescriptor(x_desc) + cudnn.destroyTensorDescriptor(y_desc) + cudnn.destroyTensorDescriptor(b_desc) + cudnn.destroyFilterDescriptor(filter_desc) + cudnn.destroyConvolutionDescriptor(conv_desc) + + + def pooling_forward( + _ndarray_base x, _ndarray_base y, + tuple ksize, tuple stride, tuple pad, int mode): + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + if x.dtype == 'd': + zero = &double_zero + one = &double_one + else: + zero = &float_zero + one = &float_one + x = core._internal_ascontiguousarray(x) + if not y._c_contiguous: + raise ValueError('pooling_forward supports c-contiguous y only') + handle = get_handle() + x_desc = cudnn.createTensorDescriptor() + y_desc = cudnn.createTensorDescriptor() + pool_desc = cudnn.createPoolingDescriptor() + try: + _create_tensor_nd_descriptor(x_desc, x) + _create_tensor_nd_descriptor(y_desc, y) + _create_pooling_descriptor(pool_desc, ksize, stride, pad, mode) + cudnn.poolingForward( + handle, pool_desc, one, x_desc, + x.data.ptr, zero, y_desc, y.data.ptr) + finally: + cudnn.destroyTensorDescriptor(x_desc) + cudnn.destroyTensorDescriptor(y_desc) + cudnn.destroyPoolingDescriptor(pool_desc) return y - - -def batch_normalization_forward_training_ex( - _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, - _ndarray_base running_mean, _ndarray_base running_var, - mean, inv_std, double eps, double decay, - bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): - - reserve_space, y, save_mean, save_inv_std = ( - _batch_normalization_forward_training( - x, gamma, beta, - running_mean, running_var, - mean, inv_std, - eps, decay, - is_for_conv2d, - cudnn_mode, - debug, - d_layout)) - if mean is None: - return reserve_space, y, save_mean, save_inv_std - else: - return reserve_space, y - - -cdef _batch_normalization_forward_training( - _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, - _ndarray_base running_mean, _ndarray_base running_var, - mean, inv_std, double eps, double decay, - bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): - - cdef _memory.MemoryPointer workspace = None - cdef _memory.MemoryPointer reserve_space = None - - # Usually supply None to mean and inv_std, which are left for backward - # compatibility. See cupy#2060 and cupy#2070. - if (mean is None) != (inv_std is None): - raise ValueError('Both mean and inv_std must be None if one is.') - - x = core._internal_ascontiguousarray(x) - dtype = x.dtype - y = _core.ndarray(x._shape, dtype) - - cdef float float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero = &double_zero, one - if x.dtype == 'd': - one = &double_one - else: - one = &float_one - - handle = get_handle() - cdef size_t x_desc = cudnn.createTensorDescriptor() - cdef size_t derivedBnDesc = cudnn.createTensorDescriptor() - try: - _create_tensor_descriptor_for_bn(x_desc, x, is_for_conv2d, - format=d_layout) - cudnn.deriveBNTensorDescriptor(derivedBnDesc, x_desc, cudnn_mode) - dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) - if gamma.dtype != dtype_param: - gamma = gamma.astype(dtype_param) - beta = beta.astype(dtype_param) - running_mean_tmp = running_mean.astype(dtype_param) - running_var_tmp = running_var.astype(dtype_param) + + + def pooling_backward( + _ndarray_base x, _ndarray_base y, _ndarray_base gy, + tuple ksize, tuple stride, tuple pad, int mode): + cdef float float_zero = 0, float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero, one + cdef _ndarray_base gx + if x.dtype == 'd': + zero = &double_zero + one = &double_one + else: + zero = &float_zero + one = &float_one + + gx = _core.ndarray(x._shape, x.dtype) + x = core._internal_ascontiguousarray(x) + y = core._internal_ascontiguousarray(y) + gy = core._internal_ascontiguousarray(gy) + + handle = get_handle() + x_desc = cudnn.createTensorDescriptor() + y_desc = cudnn.createTensorDescriptor() + pool_desc = cudnn.createPoolingDescriptor() + try: + _create_tensor_nd_descriptor(x_desc, x) + _create_tensor_nd_descriptor(y_desc, y) + _create_pooling_descriptor(pool_desc, ksize, stride, pad, mode) + cudnn.poolingBackward( + handle, pool_desc, + one, y_desc, y.data.ptr, y_desc, gy.data.ptr, + x_desc, x.data.ptr, zero, x_desc, gx.data.ptr) + finally: + cudnn.destroyTensorDescriptor(x_desc) + cudnn.destroyTensorDescriptor(y_desc) + cudnn.destroyPoolingDescriptor(pool_desc) + return gx + + + cdef _create_tensor_descriptor_for_bn( + size_t desc, _ndarray_base arr, bint is_for_conv2d, + int format=cudnn.CUDNN_TENSOR_NCHW): + assert arr._c_contiguous + if is_for_conv2d: + _create_tensor_descriptor(desc, arr, format) + return + data_type = get_data_type(arr.dtype) + cdef Py_ssize_t dim1, dim2 + cdef int ndim = arr._shape.size() + dim2 = 1 + if ndim > 0: + dim2 = arr._shape[ndim - 1] + dim1 = arr.size // dim2 + cudnn.setTensor4dDescriptor(desc, cudnn.CUDNN_TENSOR_NCHW, data_type, + dim1, dim2, 1, 1) + + + cdef _get_dtype_of_tensor_descriptor(size_t desc): + cudnn_dtype, _, _, _, _, _, _, _, _ = cudnn.getTensor4dDescriptor(desc) + if cudnn_dtype == cudnn.CUDNN_DATA_DOUBLE: + return _numpy.dtype(_numpy.float64) + elif cudnn_dtype == cudnn.CUDNN_DATA_FLOAT: + return _numpy.dtype(_numpy.float32) + elif cudnn_dtype == cudnn.CUDNN_DATA_HALF: + return _numpy.dtype(_numpy.float16) + else: + raise RuntimeError('Unknown cudnn data type {} '.format(cudnn_dtype)) + + + def batch_normalization_forward_training( + _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, + _ndarray_base running_mean, _ndarray_base running_var, + mean, inv_std, double eps, double decay, + bint is_for_conv2d, int cudnn_mode, bint debug, + int d_layout=cudnn.CUDNN_TENSOR_NCHW): + + reserve_space, y, save_mean, save_inv_std = ( + _batch_normalization_forward_training( + x, gamma, beta, + running_mean, running_var, + mean, inv_std, + eps, decay, + is_for_conv2d, + cudnn_mode, + debug, + d_layout)) + if reserve_space is not None: + _warnings.warn( + 'Could be faster by calling ' + 'batch_normalization_forward_training_ex() instead of ' + 'batch_normalization_forward_training().', + _util.PerformanceWarning) + if mean is None: + return y, save_mean, save_inv_std else: - running_mean_tmp = running_mean - running_var_tmp = running_var - gamma = core._internal_ascontiguousarray(gamma) - beta = core._internal_ascontiguousarray(beta) + return y + + + def batch_normalization_forward_training_ex( + _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, + _ndarray_base running_mean, _ndarray_base running_var, + mean, inv_std, double eps, double decay, + bint is_for_conv2d, int cudnn_mode, bint debug, + int d_layout=cudnn.CUDNN_TENSOR_NCHW): + + reserve_space, y, save_mean, save_inv_std = ( + _batch_normalization_forward_training( + x, gamma, beta, + running_mean, running_var, + mean, inv_std, + eps, decay, + is_for_conv2d, + cudnn_mode, + debug, + d_layout)) if mean is None: - save_mean = _core.ndarray(gamma.shape, dtype_param) - save_inv_std = _core.ndarray(gamma.shape, dtype_param) + return reserve_space, y, save_mean, save_inv_std + else: + return reserve_space, y + + + cdef _batch_normalization_forward_training( + _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, + _ndarray_base running_mean, _ndarray_base running_var, + mean, inv_std, double eps, double decay, + bint is_for_conv2d, int cudnn_mode, bint debug, + int d_layout=cudnn.CUDNN_TENSOR_NCHW): + + cdef _memory.MemoryPointer workspace = None + cdef _memory.MemoryPointer reserve_space = None + + # Usually supply None to mean and inv_std, which are left for backward + # compatibility. See cupy#2060 and cupy#2070. + if (mean is None) != (inv_std is None): + raise ValueError('Both mean and inv_std must be None if one is.') + + x = core._internal_ascontiguousarray(x) + dtype = x.dtype + y = _core.ndarray(x._shape, dtype) + + cdef float float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero = &double_zero, one + if x.dtype == 'd': + one = &double_one else: - save_mean = mean - save_inv_std = inv_std - - # Factor used in the moving average - factor = 1.0 - decay - - # Note: cuDNN computes the mini-batch mean and variance - # internally. We can simply (optionally) pass - # it the running-average mean and variance arrays. - # Note: This API seems to set the inverse of the standard deviation - # (instead of variance) to resultSaveInvVariance argument. The - # current implementation of our BN depends on this behavior so that - # we can reduce the number of reduction kernels. - - if cudnn_version() >= 7401: - - bn_ops = cudnn.CUDNN_BATCHNORM_OPS_BN - - if ( - cudnn_mode == cudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT - and x.dtype == _numpy.float16 - and d_layout == cudnn.CUDNN_TENSOR_NHWC - and x.shape[3] % 4 == 0 # C mod 4 == 0 - ): - - # Faster NHWC kernel can be triggered by allocating extra - # spaces. - # https://docs.nvidia.com/deeplearning/sdk/cudnn-archived/cudnn_741/cudnn-developer-guide/index.html#cudnnBatchNormalizationForwardTrainingEx # NOQA + one = &float_one + + handle = get_handle() + cdef size_t x_desc = cudnn.createTensorDescriptor() + cdef size_t derivedBnDesc = cudnn.createTensorDescriptor() + try: + _create_tensor_descriptor_for_bn(x_desc, x, is_for_conv2d, + format=d_layout) + cudnn.deriveBNTensorDescriptor(derivedBnDesc, x_desc, cudnn_mode) + dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) + if gamma.dtype != dtype_param: + gamma = gamma.astype(dtype_param) + beta = beta.astype(dtype_param) + running_mean_tmp = running_mean.astype(dtype_param) + running_var_tmp = running_var.astype(dtype_param) + else: + running_mean_tmp = running_mean + running_var_tmp = running_var + gamma = core._internal_ascontiguousarray(gamma) + beta = core._internal_ascontiguousarray(beta) + if mean is None: + save_mean = _core.ndarray(gamma.shape, dtype_param) + save_inv_std = _core.ndarray(gamma.shape, dtype_param) + else: + save_mean = mean + save_inv_std = inv_std + + # Factor used in the moving average + factor = 1.0 - decay + + # Note: cuDNN computes the mini-batch mean and variance + # internally. We can simply (optionally) pass + # it the running-average mean and variance arrays. + # Note: This API seems to set the inverse of the standard deviation + # (instead of variance) to resultSaveInvVariance argument. The + # current implementation of our BN depends on this behavior so that + # we can reduce the number of reduction kernels. + + if cudnn_version() >= 7401: + + bn_ops = cudnn.CUDNN_BATCHNORM_OPS_BN + + if ( + cudnn_mode == cudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT + and x.dtype == _numpy.float16 + and d_layout == cudnn.CUDNN_TENSOR_NHWC + and x.shape[3] % 4 == 0 # C mod 4 == 0 + ): + + # Faster NHWC kernel can be triggered by allocating extra + # spaces. + # https://docs.nvidia.com/deeplearning/sdk/cudnn-archived/cudnn_741/cudnn-developer-guide/index.html#cudnnBatchNormalizationForwardTrainingEx # NOQA + workspace_size = ( + cudnn.getBatchNormalizationForwardTrainingExWorkspaceSize( + handle, + cudnn_mode, + bn_ops, + x_desc, # x + x_desc, # z + x_desc, # y + derivedBnDesc, + 0, # activation desc + )) + workspace = _memory.alloc(workspace_size) + + reserve_space_size = ( + cudnn.getBatchNormalizationTrainingExReserveSpaceSize( + handle, + cudnn_mode, + bn_ops, + 0, # activation desc + x_desc, + )) + reserve_space = _memory.alloc(reserve_space_size) + + cudnn.batchNormalizationForwardTrainingEx( + handle, + cudnn_mode, + bn_ops, + one, # alpha + zero, # beta + x_desc, x.data.ptr, # x + x_desc, 0, # z + x_desc, y.data.ptr, # y + derivedBnDesc, + gamma.data.ptr, + beta.data.ptr, + factor, + running_mean_tmp.data.ptr, + running_var_tmp.data.ptr, + eps, + save_mean.data.ptr, + save_inv_std.data.ptr, + 0, # activation + 0 if workspace is None else workspace.ptr, + 0 if workspace is None else workspace.mem.size, + 0 if reserve_space is None else reserve_space.ptr, + 0 if reserve_space is None else reserve_space.mem.size, + ) + + else: # cuDNN < 7401 + cudnn.batchNormalizationForwardTraining( + handle, cudnn_mode, one, zero, + x_desc, x.data.ptr, x_desc, y.data.ptr, + derivedBnDesc, gamma.data.ptr, + beta.data.ptr, factor, running_mean_tmp.data.ptr, + running_var_tmp.data.ptr, eps, + save_mean.data.ptr, save_inv_std.data.ptr) + + # Note: When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is used, + # there is a possibility of numerical overflow. You can use + # queryRuntimeError() to make sure whether the overflow actually + # occurred or not during the batch normalization. + if debug and cudnn_mode == cudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT: + query_mode = cudnn.CUDNN_ERRQUERY_BLOCKING + rstatus = cudnn.queryRuntimeError(handle, query_mode) + if rstatus != cudnn.CUDNN_STATUS_SUCCESS: + _warnings.warn( + 'A numerical overflow might have happened in cuDNN' + 'batch normalization (status:{})'.format(rstatus)) + finally: + cudnn.destroyTensorDescriptor(x_desc) + cudnn.destroyTensorDescriptor(derivedBnDesc) + if running_mean is not running_mean_tmp: + _core.elementwise_copy(running_mean_tmp, running_mean) + _core.elementwise_copy(running_var_tmp, running_var) + return reserve_space, y, save_mean, save_inv_std + + + def batch_normalization_forward_inference( + _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, + _ndarray_base mean, _ndarray_base var, + double eps, bint is_for_conv2d, int cudnn_mode, + int d_layout=cudnn.CUDNN_TENSOR_NCHW): + x = core._internal_ascontiguousarray(x) + dtype = x.dtype + y = _core.ndarray(x._shape, dtype) + + cdef float float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero = &double_zero, one + if x.dtype == 'd': + one = &double_one + else: + one = &float_one + + handle = get_handle() + cdef size_t x_desc = cudnn.createTensorDescriptor() + cdef size_t derivedBnDesc = cudnn.createTensorDescriptor() + try: + _create_tensor_descriptor_for_bn(x_desc, x, is_for_conv2d, + format=d_layout) + cudnn.deriveBNTensorDescriptor(derivedBnDesc, x_desc, cudnn_mode) + dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) + if gamma.dtype != dtype_param: + gamma = gamma.astype(dtype_param) + beta = beta.astype(dtype_param) + mean = mean.astype(dtype_param) + var = var.astype(dtype_param) + else: + gamma = core._internal_ascontiguousarray(gamma) + beta = core._internal_ascontiguousarray(beta) + + cudnn.batchNormalizationForwardInference( + handle, cudnn_mode, one, zero, + x_desc, x.data.ptr, x_desc, y.data.ptr, + derivedBnDesc, gamma.data.ptr, beta.data.ptr, + mean.data.ptr, var.data.ptr, eps) + finally: + cudnn.destroyTensorDescriptor(x_desc) + cudnn.destroyTensorDescriptor(derivedBnDesc) + return y + + + def batch_normalization_backward( + _ndarray_base x, _ndarray_base gamma, _ndarray_base gy, + _ndarray_base mean, _ndarray_base inv_std, + double eps, bint is_for_conv2d, int cudnn_mode, bint debug, + int d_layout=cudnn.CUDNN_TENSOR_NCHW, + *, + _memory.MemoryPointer reserve_space=None, + ): + cdef _ndarray_base ggamma, gbeta + cdef bint need_cast + cdef _memory.MemoryPointer workspace = None + + x = core._internal_ascontiguousarray(x) + gy = core._internal_ascontiguousarray(gy) + dtype = x.dtype + gx = _core.ndarray(x._shape, dtype) + + cdef float float_one = 1 + cdef double double_zero = 0, double_one = 1 + cdef size_t zero = &double_zero, one + if x.dtype == 'd': + one = &double_one + else: + one = &float_one + + handle = get_handle() + cdef size_t x_desc = cudnn.createTensorDescriptor() + cdef size_t derivedBnDesc = cudnn.createTensorDescriptor() + try: + _create_tensor_descriptor_for_bn(x_desc, x, is_for_conv2d, + format=d_layout) + cudnn.deriveBNTensorDescriptor(derivedBnDesc, x_desc, cudnn_mode) + dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) + need_cast = gamma.dtype != dtype_param + if need_cast: + gamma = gamma.astype(dtype_param) + else: + gamma = core._internal_ascontiguousarray(gamma) + ggamma = _core.ndarray(gamma._shape, dtype_param) + gbeta = _core.ndarray(gamma._shape, dtype_param) + + if cudnn_version() >= 7401: + bn_ops = cudnn.CUDNN_BATCHNORM_OPS_BN + workspace_size = ( - cudnn.getBatchNormalizationForwardTrainingExWorkspaceSize( + cudnn.getBatchNormalizationBackwardExWorkspaceSize( handle, cudnn_mode, bn_ops, - x_desc, # x - x_desc, # z + x_desc, x_desc, # y + x_desc, # dy + x_desc, # dz + x_desc, # dx derivedBnDesc, 0, # activation desc )) workspace = _memory.alloc(workspace_size) - - reserve_space_size = ( - cudnn.getBatchNormalizationTrainingExReserveSpaceSize( - handle, - cudnn_mode, - bn_ops, - 0, # activation desc - x_desc, - )) - reserve_space = _memory.alloc(reserve_space_size) - - cudnn.batchNormalizationForwardTrainingEx( - handle, - cudnn_mode, - bn_ops, - one, # alpha - zero, # beta - x_desc, x.data.ptr, # x - x_desc, 0, # z - x_desc, y.data.ptr, # y - derivedBnDesc, - gamma.data.ptr, - beta.data.ptr, - factor, - running_mean_tmp.data.ptr, - running_var_tmp.data.ptr, - eps, - save_mean.data.ptr, - save_inv_std.data.ptr, - 0, # activation - 0 if workspace is None else workspace.ptr, - 0 if workspace is None else workspace.mem.size, - 0 if reserve_space is None else reserve_space.ptr, - 0 if reserve_space is None else reserve_space.mem.size, - ) - - else: # cuDNN < 7401 - cudnn.batchNormalizationForwardTraining( - handle, cudnn_mode, one, zero, - x_desc, x.data.ptr, x_desc, y.data.ptr, - derivedBnDesc, gamma.data.ptr, - beta.data.ptr, factor, running_mean_tmp.data.ptr, - running_var_tmp.data.ptr, eps, - save_mean.data.ptr, save_inv_std.data.ptr) - - # Note: When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is used, - # there is a possibility of numerical overflow. You can use - # queryRuntimeError() to make sure whether the overflow actually - # occurred or not during the batch normalization. - if debug and cudnn_mode == cudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT: - query_mode = cudnn.CUDNN_ERRQUERY_BLOCKING - rstatus = cudnn.queryRuntimeError(handle, query_mode) - if rstatus != cudnn.CUDNN_STATUS_SUCCESS: - _warnings.warn( - 'A numerical overflow might have happened in cuDNN' - 'batch normalization (status:{})'.format(rstatus)) - finally: - cudnn.destroyTensorDescriptor(x_desc) - cudnn.destroyTensorDescriptor(derivedBnDesc) - if running_mean is not running_mean_tmp: - _core.elementwise_copy(running_mean_tmp, running_mean) - _core.elementwise_copy(running_var_tmp, running_var) - return reserve_space, y, save_mean, save_inv_std - - -def batch_normalization_forward_inference( - _ndarray_base x, _ndarray_base gamma, _ndarray_base beta, - _ndarray_base mean, _ndarray_base var, - double eps, bint is_for_conv2d, int cudnn_mode, - int d_layout=cudnn.CUDNN_TENSOR_NCHW): - x = core._internal_ascontiguousarray(x) - dtype = x.dtype - y = _core.ndarray(x._shape, dtype) - - cdef float float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero = &double_zero, one - if x.dtype == 'd': - one = &double_one - else: - one = &float_one - - handle = get_handle() - cdef size_t x_desc = cudnn.createTensorDescriptor() - cdef size_t derivedBnDesc = cudnn.createTensorDescriptor() - try: - _create_tensor_descriptor_for_bn(x_desc, x, is_for_conv2d, - format=d_layout) - cudnn.deriveBNTensorDescriptor(derivedBnDesc, x_desc, cudnn_mode) - dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) - if gamma.dtype != dtype_param: - gamma = gamma.astype(dtype_param) - beta = beta.astype(dtype_param) - mean = mean.astype(dtype_param) - var = var.astype(dtype_param) - else: - gamma = core._internal_ascontiguousarray(gamma) - beta = core._internal_ascontiguousarray(beta) - - cudnn.batchNormalizationForwardInference( - handle, cudnn_mode, one, zero, - x_desc, x.data.ptr, x_desc, y.data.ptr, - derivedBnDesc, gamma.data.ptr, beta.data.ptr, - mean.data.ptr, var.data.ptr, eps) - finally: - cudnn.destroyTensorDescriptor(x_desc) - cudnn.destroyTensorDescriptor(derivedBnDesc) - return y - - -def batch_normalization_backward( - _ndarray_base x, _ndarray_base gamma, _ndarray_base gy, - _ndarray_base mean, _ndarray_base inv_std, - double eps, bint is_for_conv2d, int cudnn_mode, bint debug, - int d_layout=cudnn.CUDNN_TENSOR_NCHW, - *, - _memory.MemoryPointer reserve_space=None, -): - cdef _ndarray_base ggamma, gbeta - cdef bint need_cast - cdef _memory.MemoryPointer workspace = None - - x = core._internal_ascontiguousarray(x) - gy = core._internal_ascontiguousarray(gy) - dtype = x.dtype - gx = _core.ndarray(x._shape, dtype) - - cdef float float_one = 1 - cdef double double_zero = 0, double_one = 1 - cdef size_t zero = &double_zero, one - if x.dtype == 'd': - one = &double_one - else: - one = &float_one - - handle = get_handle() - cdef size_t x_desc = cudnn.createTensorDescriptor() - cdef size_t derivedBnDesc = cudnn.createTensorDescriptor() - try: - _create_tensor_descriptor_for_bn(x_desc, x, is_for_conv2d, - format=d_layout) - cudnn.deriveBNTensorDescriptor(derivedBnDesc, x_desc, cudnn_mode) - dtype_param = _get_dtype_of_tensor_descriptor(derivedBnDesc) - need_cast = gamma.dtype != dtype_param - if need_cast: - gamma = gamma.astype(dtype_param) - else: - gamma = core._internal_ascontiguousarray(gamma) - ggamma = _core.ndarray(gamma._shape, dtype_param) - gbeta = _core.ndarray(gamma._shape, dtype_param) - - if cudnn_version() >= 7401: - bn_ops = cudnn.CUDNN_BATCHNORM_OPS_BN - - workspace_size = ( - cudnn.getBatchNormalizationBackwardExWorkspaceSize( + + cudnn.batchNormalizationBackwardEx( handle, cudnn_mode, bn_ops, - x_desc, - x_desc, # y - x_desc, # dy - x_desc, # dz - x_desc, # dx + one, zero, one, zero, + x_desc, x.data.ptr, + x_desc, 0, # y + x_desc, gy.data.ptr, + x_desc, 0, # dz + x_desc, gx.data.ptr, derivedBnDesc, + gamma.data.ptr, + 0, # beta + ggamma.data.ptr, + gbeta.data.ptr, + eps, + mean.data.ptr, + inv_std.data.ptr, 0, # activation desc - )) - workspace = _memory.alloc(workspace_size) - - cudnn.batchNormalizationBackwardEx( - handle, - cudnn_mode, - bn_ops, - one, zero, one, zero, - x_desc, x.data.ptr, - x_desc, 0, # y - x_desc, gy.data.ptr, - x_desc, 0, # dz - x_desc, gx.data.ptr, - derivedBnDesc, - gamma.data.ptr, - 0, # beta - ggamma.data.ptr, - gbeta.data.ptr, - eps, - mean.data.ptr, - inv_std.data.ptr, - 0, # activation desc - workspace, - workspace_size, - 0 if reserve_space is None else reserve_space.ptr, - 0 if reserve_space is None else reserve_space.mem.size, - ) - + workspace, + workspace_size, + 0 if reserve_space is None else reserve_space.ptr, + 0 if reserve_space is None else reserve_space.mem.size, + ) + + else: + # cuDNN < 7401 + if reserve_space is not None: + raise ValueError( + 'reserve_space can only be passed in cuDNN >= 7401') + cudnn.batchNormalizationBackward( + handle, cudnn_mode, one, zero, one, zero, + x_desc, x.data.ptr, + x_desc, gy.data.ptr, x_desc, gx.data.ptr, + derivedBnDesc, gamma.data.ptr, ggamma.data.ptr, gbeta.data.ptr, + eps, mean.data.ptr, inv_std.data.ptr) + + # Note: When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is used, + # there is a possibility of numerical overflow. You can use + # queryRuntimeError() to make sure whether the overflow actually + # occurred or not during the batch normalization. + if debug and cudnn_mode == cudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT: + query_mode = cudnn.CUDNN_ERRQUERY_BLOCKING + rstatus = cudnn.queryRuntimeError(handle, query_mode) + if rstatus != cudnn.CUDNN_STATUS_SUCCESS: + _warnings.warn( + 'A numerical overflow might have happened in cuDNN' + 'batch normalization (status:{})'.format(rstatus)) + finally: + cudnn.destroyTensorDescriptor(x_desc) + cudnn.destroyTensorDescriptor(derivedBnDesc) + + if need_cast: + ggamma = ggamma.astype(dtype) + gbeta = gbeta.astype(dtype) + return gx, ggamma, gbeta + + + def create_activation_descriptor(mode, relu_nan_opt=cudnn.CUDNN_PROPAGATE_NAN, + coef=0.0): + desc = Descriptor(cudnn.createActivationDescriptor(), + _py_cudnn.destroyActivationDescriptor) + cudnn.setActivationDescriptor(desc.value, mode, relu_nan_opt, coef) + return desc + + + def create_fused_ops_plan(ops): + plan = Descriptor(cudnn.createFusedOpsPlan(ops), + _py_cudnn.destroyFusedOpsPlan) + return plan + + + def create_fused_ops_const_param_pack(ops, list_attr_param): + const_pack = Descriptor(cudnn.createFusedOpsConstParamPack(ops), + _py_cudnn.destroyFusedOpsConstParamPack) + for attr, param in list_attr_param: + set_fused_ops_const_param_pack_attribute(const_pack, attr, param) + return const_pack + + + def make_fused_ops_plan(plan, const_pack): + handle = get_handle() + return cudnn.makeFusedOpsPlan(handle, plan.value, const_pack.value) + + + def create_fused_ops_variant_param_pack(ops, list_attr_param): + var_pack = Descriptor(cudnn.createFusedOpsVariantParamPack(ops), + _py_cudnn.destroyFusedOpsVariantParamPack) + for attr, param in list_attr_param: + set_fused_ops_variant_param_pack_attribute(var_pack, attr, param) + return var_pack + + + def fused_ops_execute(plan, var_pack): + handle = get_handle() + cudnn.fusedOpsExecute(handle, plan.value, var_pack.value) + + + cpdef set_fused_ops_const_param_pack_attribute( + Descriptor const_pack, int param_label, desc_or_scalar): + cdef int scalar + cdef Descriptor desc + if param_label in (cudnn.CUDNN_PARAM_XDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_MODE, + cudnn.CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, + cudnn.CUDNN_PARAM_WDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_DWDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_YDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_DYDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_YSUM_PLACEHOLDER, + cudnn.CUDNN_PARAM_YSQSUM_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_SCALE_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + cudnn.CUDNN_PARAM_ZDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER, + cudnn.CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER, + cudnn.CUDNN_PARAM_DXDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_DZDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_DBIAS_PLACEHOLDER): + scalar = desc_or_scalar + cudnn.setFusedOpsConstParamPackAttribute(const_pack.value, param_label, + &scalar) else: - # cuDNN < 7401 - if reserve_space is not None: - raise ValueError( - 'reserve_space can only be passed in cuDNN >= 7401') - cudnn.batchNormalizationBackward( - handle, cudnn_mode, one, zero, one, zero, - x_desc, x.data.ptr, - x_desc, gy.data.ptr, x_desc, gx.data.ptr, - derivedBnDesc, gamma.data.ptr, ggamma.data.ptr, gbeta.data.ptr, - eps, mean.data.ptr, inv_std.data.ptr) - - # Note: When the CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode is used, - # there is a possibility of numerical overflow. You can use - # queryRuntimeError() to make sure whether the overflow actually - # occurred or not during the batch normalization. - if debug and cudnn_mode == cudnn.CUDNN_BATCHNORM_SPATIAL_PERSISTENT: - query_mode = cudnn.CUDNN_ERRQUERY_BLOCKING - rstatus = cudnn.queryRuntimeError(handle, query_mode) - if rstatus != cudnn.CUDNN_STATUS_SUCCESS: - _warnings.warn( - 'A numerical overflow might have happened in cuDNN' - 'batch normalization (status:{})'.format(rstatus)) - finally: - cudnn.destroyTensorDescriptor(x_desc) - cudnn.destroyTensorDescriptor(derivedBnDesc) - - if need_cast: - ggamma = ggamma.astype(dtype) - gbeta = gbeta.astype(dtype) - return gx, ggamma, gbeta - - -def create_activation_descriptor(mode, relu_nan_opt=cudnn.CUDNN_PROPAGATE_NAN, - coef=0.0): - desc = Descriptor(cudnn.createActivationDescriptor(), - _py_cudnn.destroyActivationDescriptor) - cudnn.setActivationDescriptor(desc.value, mode, relu_nan_opt, coef) - return desc - - -def create_fused_ops_plan(ops): - plan = Descriptor(cudnn.createFusedOpsPlan(ops), - _py_cudnn.destroyFusedOpsPlan) - return plan - - -def create_fused_ops_const_param_pack(ops, list_attr_param): - const_pack = Descriptor(cudnn.createFusedOpsConstParamPack(ops), - _py_cudnn.destroyFusedOpsConstParamPack) - for attr, param in list_attr_param: - set_fused_ops_const_param_pack_attribute(const_pack, attr, param) - return const_pack - - -def make_fused_ops_plan(plan, const_pack): - handle = get_handle() - return cudnn.makeFusedOpsPlan(handle, plan.value, const_pack.value) - - -def create_fused_ops_variant_param_pack(ops, list_attr_param): - var_pack = Descriptor(cudnn.createFusedOpsVariantParamPack(ops), - _py_cudnn.destroyFusedOpsVariantParamPack) - for attr, param in list_attr_param: - set_fused_ops_variant_param_pack_attribute(var_pack, attr, param) - return var_pack - - -def fused_ops_execute(plan, var_pack): - handle = get_handle() - cudnn.fusedOpsExecute(handle, plan.value, var_pack.value) - - -cpdef set_fused_ops_const_param_pack_attribute( - Descriptor const_pack, int param_label, desc_or_scalar): - cdef int scalar - cdef Descriptor desc - if param_label in (cudnn.CUDNN_PARAM_XDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_MODE, - cudnn.CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, - cudnn.CUDNN_PARAM_WDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_DWDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_YDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_DYDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_YSUM_PLACEHOLDER, - cudnn.CUDNN_PARAM_YSQSUM_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_SCALE_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_BIAS_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, - cudnn.CUDNN_PARAM_ZDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER, - cudnn.CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER, - cudnn.CUDNN_PARAM_DXDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_DZDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_DBIAS_PLACEHOLDER): - scalar = desc_or_scalar - cudnn.setFusedOpsConstParamPackAttribute(const_pack.value, param_label, - &scalar) - else: - desc = desc_or_scalar - cudnn.setFusedOpsConstParamPackAttribute(const_pack.value, param_label, - desc.value) - - -cpdef get_fused_ops_const_param_pack_attribute(Descriptor const_pack, - int param_label): - cdef int param_int - cdef size_t param_desc - if param_label in (cudnn.CUDNN_PARAM_XDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_MODE, - cudnn.CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, - cudnn.CUDNN_PARAM_WDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_DWDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_YDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_DYDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_YSUM_PLACEHOLDER, - cudnn.CUDNN_PARAM_YSQSUM_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_SCALE_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_BIAS_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, - cudnn.CUDNN_PARAM_ZDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER, - cudnn.CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER, - cudnn.CUDNN_PARAM_DXDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_DZDATA_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, - cudnn.CUDNN_PARAM_BN_DBIAS_PLACEHOLDER): - is_null = cudnn.getFusedOpsConstParamPackAttribute( - const_pack.value, param_label, ¶m_int) - return param_int, is_null - else: - if param_label == cudnn.CUDNN_PARAM_ACTIVATION_DESC: - param_desc = cudnn.createActivationDescriptor() - elif param_label == cudnn.CUDNN_PARAM_CONV_DESC: - param_desc = cudnn.createConvolutionDescriptor() - elif param_label in (cudnn.CUDNN_PARAM_WDESC, - cudnn.CUDNN_PARAM_DWDESC,): - param_desc = cudnn.createFilterDescriptor() + desc = desc_or_scalar + cudnn.setFusedOpsConstParamPackAttribute(const_pack.value, param_label, + desc.value) + + + cpdef get_fused_ops_const_param_pack_attribute(Descriptor const_pack, + int param_label): + cdef int param_int + cdef size_t param_desc + if param_label in (cudnn.CUDNN_PARAM_XDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_MODE, + cudnn.CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, + cudnn.CUDNN_PARAM_WDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_DWDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_YDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_DYDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_YSUM_PLACEHOLDER, + cudnn.CUDNN_PARAM_YSQSUM_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_SCALE_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + cudnn.CUDNN_PARAM_ZDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER, + cudnn.CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER, + cudnn.CUDNN_PARAM_DXDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_DZDATA_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, + cudnn.CUDNN_PARAM_BN_DBIAS_PLACEHOLDER): + is_null = cudnn.getFusedOpsConstParamPackAttribute( + const_pack.value, param_label, ¶m_int) + return param_int, is_null else: - param_desc = cudnn.createTensorDescriptor() - is_null = cudnn.getFusedOpsConstParamPackAttribute( - const_pack.value, param_label, param_desc) - return param_desc, is_null - - -cpdef set_fused_ops_variant_param_pack_attribute( - Descriptor var_pack, int param_label, arr_or_scaler): - cdef size_t scalar_size_t - cdef int64_t scalar_int64_t - cdef double scalar_double - cdef size_t ptr - if param_label == cudnn.CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES: - scalar_size_t = arr_or_scaler - cudnn.setFusedOpsVariantParamPackAttribute(var_pack.value, param_label, - &scalar_size_t) - elif param_label == cudnn.CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT: - scalar_int64_t = arr_or_scaler - cudnn.setFusedOpsVariantParamPackAttribute(var_pack.value, param_label, - &scalar_int64_t) - elif param_label in (cudnn.CUDNN_SCALAR_DOUBLE_BN_EPSILON, - cudnn.CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR): - scalar_double = arr_or_scaler - cudnn.setFusedOpsVariantParamPackAttribute(var_pack.value, param_label, - &scalar_double) - else: - ptr = arr_or_scaler.data.ptr - cudnn.setFusedOpsVariantParamPackAttribute(var_pack.value, param_label, - ptr) - - -cpdef get_fused_ops_variant_param_pack_attribute(size_t var_pack, - int param_label): - cdef size_t scalar_size_t - cdef int64_t scalar_int64_t - cdef double scalar_double - cdef size_t ptr - if param_label == cudnn.CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES: - cudnn.getFusedOpsVariantParamPackAttribute(var_pack, param_label, - &scalar_size_t) - return scalar_size_t - elif param_label == cudnn.CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT: - cudnn.getFusedOpsVariantParamPackAttribute(var_pack, param_label, - &scalar_int64_t) - return scalar_int64_t - elif param_label in (cudnn.CUDNN_SCALAR_DOUBLE_BN_EPSILON, - cudnn.CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR): - cudnn.getFusedOpsVariantParamPackAttribute(var_pack, param_label, - &scalar_double) - return scalar_double - else: - cudnn.getFusedOpsVariantParamPackAttribute(var_pack, param_label, - &ptr) - return ptr + if param_label == cudnn.CUDNN_PARAM_ACTIVATION_DESC: + param_desc = cudnn.createActivationDescriptor() + elif param_label == cudnn.CUDNN_PARAM_CONV_DESC: + param_desc = cudnn.createConvolutionDescriptor() + elif param_label in (cudnn.CUDNN_PARAM_WDESC, + cudnn.CUDNN_PARAM_DWDESC,): + param_desc = cudnn.createFilterDescriptor() + else: + param_desc = cudnn.createTensorDescriptor() + is_null = cudnn.getFusedOpsConstParamPackAttribute( + const_pack.value, param_label, param_desc) + return param_desc, is_null + + + cpdef set_fused_ops_variant_param_pack_attribute( + Descriptor var_pack, int param_label, arr_or_scaler): + cdef size_t scalar_size_t + cdef int64_t scalar_int64_t + cdef double scalar_double + cdef size_t ptr + if param_label == cudnn.CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES: + scalar_size_t = arr_or_scaler + cudnn.setFusedOpsVariantParamPackAttribute(var_pack.value, param_label, + &scalar_size_t) + elif param_label == cudnn.CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT: + scalar_int64_t = arr_or_scaler + cudnn.setFusedOpsVariantParamPackAttribute(var_pack.value, param_label, + &scalar_int64_t) + elif param_label in (cudnn.CUDNN_SCALAR_DOUBLE_BN_EPSILON, + cudnn.CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR): + scalar_double = arr_or_scaler + cudnn.setFusedOpsVariantParamPackAttribute(var_pack.value, param_label, + &scalar_double) + else: + ptr = arr_or_scaler.data.ptr + cudnn.setFusedOpsVariantParamPackAttribute(var_pack.value, param_label, + ptr) + + + cpdef get_fused_ops_variant_param_pack_attribute(size_t var_pack, + int param_label): + cdef size_t scalar_size_t + cdef int64_t scalar_int64_t + cdef double scalar_double + cdef size_t ptr + if param_label == cudnn.CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES: + cudnn.getFusedOpsVariantParamPackAttribute(var_pack, param_label, + &scalar_size_t) + return scalar_size_t + elif param_label == cudnn.CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT: + cudnn.getFusedOpsVariantParamPackAttribute(var_pack, param_label, + &scalar_int64_t) + return scalar_int64_t + elif param_label in (cudnn.CUDNN_SCALAR_DOUBLE_BN_EPSILON, + cudnn.CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR): + cudnn.getFusedOpsVariantParamPackAttribute(var_pack, param_label, + &scalar_double) + return scalar_double + else: + cudnn.getFusedOpsVariantParamPackAttribute(var_pack, param_label, + &ptr) + return ptr diff --git a/install/cupy_builder/_features.py b/install/cupy_builder/_features.py index 4ad926b4249..43c53d43f0d 100644 --- a/install/cupy_builder/_features.py +++ b/install/cupy_builder/_features.py @@ -164,6 +164,8 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'cupyx.cusolver', 'cupy_backends.cuda.libs.curand_hip', 'cupy_backends.cuda.libs.nvrtc_hip', + 'cupy_backends.cuda.libs.cudnn', + 'cupyx.cudnn', ], 'include': [ 'hip/hip_runtime_api.h', @@ -175,6 +177,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'roctx.h', 'rocsolver/rocsolver.h' if rocm_version >= 560 else 'rocsolver.h', 'hipsolver/hipsolver.h' if rocm_version >= 560 else 'hipsolver.h', + 'miopen/miopen.h', ], 'libraries': [ 'amdhip64', # was hiprtc and hip_hcc before ROCm 3.8.0 @@ -188,6 +191,7 @@ def get_features(ctx: Context) -> Dict[str, Feature]: 'rocsolver', 'rocsparse', 'hipsolver', + 'MIOpen', ], 'check_method': build.check_hip_version, 'version_method': build.get_hip_version, diff --git a/tests/cupyx_tests/test_cudnn.py b/tests/cupyx_tests/test_cudnn.py index 84ef7b02071..d45e6f78f46 100644 --- a/tests/cupyx_tests/test_cudnn.py +++ b/tests/cupyx_tests/test_cudnn.py @@ -13,7 +13,7 @@ if cudnn_enabled: modes = [ - libcudnn.CUDNN_ACTIVATION_SIGMOID, + #libcudnn.CUDNN_ACTIVATION_SIGMOID, do a if hip condition libcudnn.CUDNN_ACTIVATION_RELU, libcudnn.CUDNN_ACTIVATION_TANH, ] @@ -40,7 +40,6 @@ 'dtype': [numpy.float32, numpy.float64], 'mode': modes, })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnActivation: @pytest.fixture(autouse=True) @@ -60,7 +59,6 @@ def test_activation_backward(self): 'dtype': [numpy.float32, numpy.float64], 'mode': coef_modes, })) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestCudnnActivationCoef: @pytest.fixture(autouse=True) @@ -136,7 +134,6 @@ def test_dropout_seed(self): 'bias': [True, False], 'layout': layouts, }))) -@pytest.mark.skipif(not cudnn_enabled, reason='cuDNN is not available') class TestConvolutionForward: @pytest.fixture(autouse=True)