diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim index dbce048a3..da1034e75 100644 --- a/constantine/math_compiler/codegen_nvidia.nim +++ b/constantine/math_compiler/codegen_nvidia.nim @@ -71,18 +71,6 @@ export # Cuda Driver API # ------------------------------------------------------------ -template check*(status: CUresult, quitOnFailure = true) = - ## Check the status code of a CUDA operation - ## Exit program with error if failure - - let code = status # ensure that the input expression is evaluated once only - - if code != CUDA_SUCCESS: - writeStackTrace() - stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') - if quitOnFailure: - quit 1 # NOTE: this hides exceptions if they are thrown! - func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.}= cuModuleLoadData(module, sourceCode[0].unsafeAddr) func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.}= @@ -221,277 +209,6 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) = check cuMemFree(aGPU) check cuMemFree(bGPU) -proc getTypes(n: NimNode): seq[NimNode] = - case n.kind - of nnkIdent, nnkSym: result.add getTypeInst(n) - of nnkLiterals: result.add getTypeInst(n) - of nnkBracket, nnkTupleConstr, nnkPar: - for el in n: - result.add getTypes(el) - else: - case n.typeKind - of ntyPtr: result.add getTypeInst(n) - else: - error("Arguments to `execCuda` must be given as a bracket, tuple or typed expression. Instead: " & $n.treerepr) - -proc requiresCopy(n: NimNode): bool = - ## Returns `true` if the given type is not a trivial data type, which implies - ## it will require copying its value manually. - case n.typeKind - of ntyBool, ntyChar, ntyInt .. ntyUint64: # range includes all floats - result = false - else: - result = true - -proc allowsCopy(n: NimNode): bool = - ## Returns `true` if the given type is allowed to be copied. That means it is - ## either `requiresCopy` or a `var` symbol. - result = n.requiresCopy or n.symKind == nskVar - -proc getIdent(n: NimNode): NimNode = - ## Generate a `GPU` suffixed ident - # Note: We want a deterministic name, because we call `getIdent` for the same symbol - # in multiple places atm. - case n.kind - of nnkIdent, nnkSym: result = ident(n.strVal & "GPU") - else: result = ident("`" & n.repr & "`GPU") - -proc determineDevicePtrs(r, i: NimNode, iTypes: seq[NimNode]): seq[(NimNode, NimNode)] = - ## Returns the device pointer ident and its associated original symbol. - for el in r: - if not el.allowsCopy: - error("The argument for `res`: " & $el.repr & " of type: " & $el.getTypeImpl().treerepr & - " does not allow copying. Copying to the address of all result variables is required.") - result.add (getIdent(el), el) - for idx in 0 ..< i.len: - let input = i[idx] - let t = iTypes[idx] - if t.requiresCopy(): - result.add (getIdent(input), input) - -proc assembleParams(r, i: NimNode, iTypes: seq[NimNode]): seq[NimNode] = - ## Returns all parameters. Depending on whether they require copies or - ## are `res` parameters, either the input parameter or the `GPU` parameter. - for el in r: # for `res` we always copy! - result.add getIdent(el) - for idx in 0 ..< i.len: - let input = i[idx] - let t = iTypes[idx] - if t.requiresCopy(): - result.add getIdent(input) - else: - result.add input - -proc sizeArg(n: NimNode): NimNode = - ## The argument to `sizeof` must be the size of the data we copy. If the - ## input type is already given as a `ptr T` type, we need the size of - ## `T` and not `ptr`. - case n.typeKind - of ntyPtr: result = n.getTypeInst()[0] - else: result = n - -# little helper macro constructors -template check(arg): untyped = nnkCall.newTree(ident"check", arg) -template size(arg): untyped = nnkCall.newTree(ident"sizeof", sizeArg arg) -template address(arg): untyped = nnkCall.newTree(ident"addr", arg) -template csize_t(arg): untyped = nnkCall.newTree(ident"csize_t", arg) -template pointer(arg): untyped = nnkCall.newTree(ident"pointer", arg) - -proc maybeAddress(n: NimNode): NimNode = - ## Returns the address of the given node, *IFF* the type is not a - ## pointer type already - case n.typeKind - of ntyPtr: result = n - else: result = address(n) - -proc genParams(pId, r, i: NimNode, iTypes: seq[NimNode]): NimNode = - ## Generates the parameter `params` variable - let ps = assembleParams(r, i, iTypes) - result = nnkBracket.newTree() - for p in ps: - result.add pointer(maybeAddress p) - result = nnkLetSection.newTree( - nnkIdentDefs.newTree(pId, newEmptyNode(), result) - ) - -proc genVar(n: NimNode): (NimNode, NimNode) = - ## Generates a let `tmp` variable and returns its identifier and - ## the let section. - result[0] = genSym(nskLet, "tmp") - result[1] = nnkLetSection.newTree( - nnkIdentDefs.newTree( - result[0], - getTypeInst(n), - n - ) - ) - -proc genLocalVars(inputs: NimNode): (NimNode, NimNode) = - result[0] = newStmtList() # defines local vars - result[1] = nnkBracket.newTree() # returns new bracket of vars for parameters - for el in inputs: - case el.kind - of nnkLiterals, nnkConstDef: # define a local with the value of it - let (s, v) = genVar(el) - result[0].add v - result[1].add s - of nnkSym: - if el.strVal in ["true", "false"]: - let (s, v) = genVar(el) - result[0].add v - result[1].add s - else: - result[1].add el # keep symbol - else: - result[1].add el # keep symbol - -proc maybeWrap(n: NimNode): NimNode = - if n.kind notin {nnkBracket, nnkTupleConstr}: - result = nnkBracket.newTree(n) - else: - result = n - -proc endianCheck(): NimNode = - result = quote do: - static: doAssert cpuEndian == littleEndian, block: - # From https://developer.nvidia.com/cuda-downloads?target_os=Linux - # Supported architectures for Cuda are: - # x86-64, PowerPC 64 little-endian, ARM64 (aarch64) - # which are all little-endian at word-level. - # - # Due to limbs being also stored in little-endian, on little-endian host - # the CPU and GPU will have the same binary representation - # whether we use 32-bit or 64-bit words, so naive memcpy can be used for parameter passing. - - "Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" & - "Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it." - -proc execCudaImpl(jitFn, res, inputs: NimNode): NimNode = - # Maybe wrap individually given arguments in a `[]` bracket, e.g. - # `execCuda(res = foo, inputs = bar)` - let res = maybeWrap res - let inputs = maybeWrap inputs - - result = newStmtList() - result.add endianCheck() - - # get the types of the inputs - let rTypes = getTypes(res) - let iTypes = getTypes(inputs) - - # determine all required `CUdeviceptr` - let devPtrs = determineDevicePtrs(res, inputs, iTypes) - - # generate device pointers, allocate memory and copy data - for x in devPtrs: - # `var rGPU: CUdeviceptr` - result.add nnkVarSection.newTree( - nnkIdentDefs.newTree( - x[0], - ident"CUdeviceptr", - newEmptyNode() - ) - ) - - # `check cuMemAlloc(rGPU, csize_t sizeof(r))` - result.add( - check nnkCall.newTree( - ident"cuMemAlloc", - x[0], - csize_t size(x[1]) - ) - ) - # `check cuMemcpyHtoD(aGPU, a.addr, csize_t sizeof(a))` - result.add( - check nnkCall.newTree( - ident"cuMemcpyHtoD", - x[0], - maybeAddress x[1], - csize_t size(x[1]) - ) - ) - - # Generate local variables - let (decl, vars) = genLocalVars(inputs) - result.add decl - - # assemble the parameters - let pId = ident"params" - let params = genParams(pId, res, vars, iTypes) - result.add params - - # launch the kernel - result.add quote do: - check cuLaunchKernel( - `jitFn`, - 1, 1, 1, # grid(x, y, z) - 1, 1, 1, # block(x, y, z) - sharedMemBytes = 0, - CUstream(nil), - `pId`[0].unsafeAddr, nil) - - # copy back results - let devPtrsRes = determineDevicePtrs(res, nnkBracket.newTree(), @[]) - for x in devPtrsRes: - result.add( - check nnkCall.newTree( - ident"cuMemcpyDtoH", - maybeAddress x[1], - x[0], - csize_t size(x[1]) - ) - ) - - # free memory - for x in devPtrs: - result.add( - check nnkCall.newTree( - ident"cuMemFree", - x[0] - ) - ) - result = quote do: - block: - `result` - -macro execCuda*(jitFn: CUfunction, - res: typed, - inputs: typed): untyped = - ## Given a CUDA function, execute the kernel. Copies all non trivial data types to - ## to the GPU via `cuMemcpyHtoD`. Any argument given as `res` will be copied back - ## from the GPU after kernel execution finishes. - ## - ## IMPORTANT: - ## The arguments passed to the CUDA kernel will be in the order in which they are - ## given to the macro. This especially means `res` arguments will be passed first. - ## - ## Example: - ## ```nim - ## execCuda(fn, res = [r, s], inputs = [a, b, c]) # if all arguments have the same type - ## # or - ## execCuda(fn, res = (r, s), inputs = (a, b, c)) # if different types - ## ``` - ## will pass the parameters as `[r, s, a, b, c]`. - ## - ## For more examples see the test case `tests/gpu/t_exec_literals_consts.nim`. - ## - ## We do not perform any checks on whether the given types are valid as arguments to - ## the CUDA target! Also, all arguments given as `res` are expected to be copied. - ## To return a value for a simple data type, use a `ptr X` type. However, it is allowed - ## to simply pass a `var` symbol as a `res` argument. We automatically copy to the - ## the memory location. - ## - ## We also copy all `res` data to the GPU, so that a return value can also be used - ## as an input. - ## - ## NOTE: This function is mainly intended for convenient execution of a single kernel - result = execCudaImpl(jitFn, res, inputs) - -macro execCuda*(jitFn: CUfunction, - res: typed): untyped = - ## Overload of the above for empty `inputs` - result = execCudaImpl(jitFn, res, nnkBracket.newTree()) - # ############################################################ # # Compilation helper @@ -516,6 +233,12 @@ type NvidiaAssembler* = ref NvidiaAssemblerObj + ## We define a distinct version of the `CUfunction` type to differentiate + ## producing a kernel via the LLVM backend from the more direct approach + ## using NVRTC. This is because the data passing for field elements + ## is more complicated on the LLVM side (requires a manual copy). + CUfunctionLLVM* = distinct CUfunction + proc `=destroy`*(nv: NvidiaAssemblerObj) = ## XXX: Need to also call the finalizer for `asy` in the future! # NOTE: In the destructor we don't want to quit on a `check` failure. @@ -592,7 +315,7 @@ proc initNvAsm*[Name: static Algebra](field: type EC_ShortW_Jac[Fp[Name], G1], w result.fd = result.cd.fd result.asy.definePrimitives(result.cd) -proc compile*(nv: NvidiaAssembler, kernName: string): CUfunction = +proc compile*(nv: NvidiaAssembler, kernName: string): CUfunctionLLVM = ## Overload of `compile` below. ## Call this version if you have manually used the Assembler_LLVM object ## to build instructions and have a kernel name you wish to compile. @@ -617,18 +340,32 @@ proc compile*(nv: NvidiaAssembler, kernName: string): CUfunction = check cuModuleLoadData(nv.cuMod, ptx) # will be cleaned up when `NvidiaAssembler` goes out of scope - result = nv.cuMod.getCudaKernel(kernName) + result = CUfunctionLLVM(nv.cuMod.getCudaKernel(kernName)) -proc compile*(nv: NvidiaAssembler, fn: FieldFnGenerator): CUfunction = +proc compile*(nv: NvidiaAssembler, fn: FieldFnGenerator): CUfunctionLLVM = ## Given a function that generates code for a finite field operation, compile ## that function on the given Nvidia target and return a CUDA function. # execute the `fn` let kernName = nv.asy.fn(nv.fd) - result = nv.compile(kernName) + result = CUfunctionLLVM(nv.compile(kernName)) -proc compile*(nv: NvidiaAssembler, fn: CurveFnGenerator): CUfunction = +proc compile*(nv: NvidiaAssembler, fn: CurveFnGenerator): CUfunctionLLVM = ## Given a function that generates code for an elliptic curve operation, compile ## that function on the given Nvidia target and return a CUDA function. # execute the `fn` let kernName = nv.asy.fn(nv.cd) - result = nv.compile(kernName) + result = CUfunctionLLVM(nv.compile(kernName)) + +import ./experimental/cuda_execute_dsl +macro execCuda*(jitFn: CUfunctionLLVM, + res: typed, + inputs: typed): untyped = + ## See `execCuda` in `constantine/math_compiler/experimental/cuda_execute_dsl.nim` + ## for an explanation. + ## + ## This LLVM overload makes sure we disallow passing simple structs + ## via their pointer and instead always copy them (required due to our + ## type definitions for finite field elements and elliptic curve points + ## on the LLVM target). + execCudaImpl(jitFn, newLit 1, newLit 1, res, inputs, + passStructByPointer = false) diff --git a/constantine/math_compiler/experimental/cuda_execute_dsl.nim b/constantine/math_compiler/experimental/cuda_execute_dsl.nim new file mode 100644 index 000000000..d6618b386 --- /dev/null +++ b/constantine/math_compiler/experimental/cuda_execute_dsl.nim @@ -0,0 +1,356 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import constantine/platforms/abis/nvidia_abi +import std/macros +from std / strutils import normalize + +proc getTypes(n: NimNode): seq[NimNode] = + case n.kind + of nnkIdent, nnkSym: result.add getTypeInst(n) + of nnkLiterals: result.add getTypeInst(n) + of nnkBracket, nnkTupleConstr, nnkPar: + for el in n: + result.add getTypes(el) + of nnkCall: + result.add getType(n) + else: + case n.typeKind + of ntyPtr: result.add getTypeInst(n) + else: + error("Arguments to `execCuda` must be given as a bracket, tuple or typed expression. Instead: " & $n.treerepr) + +proc requiresCopy(n: NimNode, passStructByPointer: bool): bool = + ## Returns `true` if the given type is not a trivial data type, which implies + ## it will require copying its value manually. + ## + ## WARNING: For the moment we determine if something needs to be copied especially + ## based on whether it is an object or ref type. That means *DO NOT* nest ref + ## types in your objects. They *WILL NOT* be deep copied! + ## + ## If `passStructByPointer` is `true` we do *not* copy trivial struct types, e.g. a big int + ## or finite field element. If it is false, we always copy for those. The distinction + ## is needed, because for the CUDA target via LLVM, the array type definitions cause + ## `cudaErrorIllegalAddress` if we directly pass the host pointer of the struct. + case n.typeKind + of ntyBool, ntyChar, ntyInt .. ntyUint64: # range includes all floats + result = false + of ntyObject, ntyArray: + if passStructByPointer: + result = false # regular objects can just be copied! + else: + result = true # struct passing by pointer forbidden + ## NOTE: strictly speaking this is not the case of course! If the object + ## contains refs, it won't hold! + of ntyGenericInst: + if passStructByPointer: + let impl = n.getTypeImpl() + result = impl.kind == nnkRefTy # if a ref, needs to be copied + else: + result = true # for now assume it needs to be copied + of ntyDistinct: + let impl = n.getTypeInst() + if impl.kind in [nnkIdent, nnkSym] and impl.strVal.normalize == "cudeviceptr": + result = false + else: + result = true + else: + result = true + +proc allowsCopy(n: NimNode, passStructByPointer: bool): bool = + ## Returns `true` if the given type is allowed to be copied. That means it is + ## either `requiresCopy` or a `var` symbol. + result = n.requiresCopy(passStructByPointer) or n.symKind == nskVar + +proc getIdent(n: NimNode): NimNode = + ## Generate a `GPU` suffixed ident + # Note: We want a deterministic name, because we call `getIdent` for the same symbol + # in multiple places atm. + case n.kind + of nnkIdent, nnkSym: result = ident(n.strVal & "GPU") + else: result = ident("`" & n.repr & "`GPU") + +proc determineDevicePtrs(r, i: NimNode, iTypes: seq[NimNode], + passStructByPointer: bool): seq[(NimNode, NimNode)] = + ## Returns the device pointer ident and its associated original symbol. + for el in r: + if not el.allowsCopy(passStructByPointer): + error("The argument for `res`: " & $el.repr & " of type: " & $el.getTypeImpl().treerepr & + " does not allow copying. Copying to the address of all result variables is required." & + " It is a `" & $el.symKind & "` variable, but needs to be a `var`.") + result.add (getIdent(el), el) + for idx in 0 ..< i.len: + let input = i[idx] + let t = iTypes[idx] + if t.requiresCopy(passStructByPointer): + result.add (getIdent(input), input) + +proc assembleParams(r, i: NimNode, iTypes: seq[NimNode], passStructByPointer: bool): seq[NimNode] = + ## Returns all parameters. Depending on whether they require copies or + ## are `res` parameters, either the input parameter or the `GPU` parameter. + for el in r: # for `res` we always copy! + result.add getIdent(el) + for idx in 0 ..< i.len: + let input = i[idx] + let t = iTypes[idx] + if t.requiresCopy(passStructByPointer): + result.add getIdent(input) + else: + result.add input + +# little helper macro constructors +template check(arg): untyped = nnkCall.newTree(ident"check", arg) +template size(arg): untyped = nnkCall.newTree(ident"sizeof", arg) +template address(arg): untyped = nnkCall.newTree(ident"addr", arg) +template csize_t(arg): untyped = nnkCall.newTree(ident"csize_t", arg) +template pointer(arg): untyped = nnkCall.newTree(ident"pointer", arg) +template arrayTyp(num, typ): untyped = nnkBracketExpr.newTree(ident"array", newLit num, typ) +template lenOf(arg): untyped = nnkCall.newTree(ident"len", arg) +template mul(x, y): untyped = nnkInfix.newTree(ident"*", x, y) + +proc getSizeOf(arg: NimNode): NimNode = + ## Returns a call to `sizeof` for the given argument. The argument to `sizeof` must + ## be the size of the data we copy. If the argument is a `seq` we take into account + ## the number of elements. If the input type is already given as a `ptr T` type, we + ## need the size of `T` and not `ptr`. + case arg.typeKind + of ntyPtr: result = size(arg.getTypeInst()[0]) + of ntySequence: result = mul(lenOf(arg), size(arg.getTypeInst()[1])) + else: result = size(arg) + +proc maybeAddress(n: NimNode): NimNode = + ## Returns the address of the given node, *IFF* the type is not a + ## pointer type already. In case the input is a `seq[T]`, we return `x[0].addr`. + case n.typeKind + of ntyPtr: result = n + of ntySequence: result = address( nnkBracketExpr.newTree(n, newLit 0) ) + else: result = address(n) + +proc genParams(pId, r, i: NimNode, iTypes: seq[NimNode], passStructByPointer: bool): NimNode = + ## Generates the parameter `params` variable + let ps = assembleParams(r, i, iTypes, passStructByPointer) + result = nnkBracket.newTree() + for p in ps: + result.add pointer(maybeAddress p) + result = nnkLetSection.newTree( + nnkIdentDefs.newTree(pId, arrayTyp(ps.len, ident"pointer"), result) + ) + +proc genVar(n: NimNode): (NimNode, NimNode) = + ## Generates a let `tmp` variable and returns its identifier and + ## the let section. + result[0] = genSym(nskLet, "tmp") + result[1] = nnkLetSection.newTree( + nnkIdentDefs.newTree( + result[0], + getTypeInst(n), + n + ) + ) + +proc genLocalVars(inputs: NimNode): (NimNode, NimNode) = + result[0] = newStmtList() # defines local vars + result[1] = nnkBracket.newTree() # returns new bracket of vars for parameters + for el in inputs: + case el.kind + of nnkLiterals, nnkConstDef: # define a local with the value of it + let (s, v) = genVar(el) + result[0].add v + result[1].add s + of nnkSym: + if el.strVal in ["true", "false"]: + let (s, v) = genVar(el) + result[0].add v + result[1].add s + else: + result[1].add el # keep symbol + else: + result[1].add el # keep symbol + +proc maybeWrap(n: NimNode): NimNode = + if n.kind notin {nnkBracket, nnkTupleConstr}: + result = nnkBracket.newTree(n) + else: + result = n + +proc endianCheck(): NimNode = + result = quote do: + static: doAssert cpuEndian == littleEndian, block: + # From https://developer.nvidia.com/cuda-downloads?target_os=Linux + # Supported architectures for Cuda are: + # x86-64, PowerPC 64 little-endian, ARM64 (aarch64) + # which are all little-endian at word-level. + # + # Due to limbs being also stored in little-endian, on little-endian host + # the CPU and GPU will have the same binary representation + # whether we use 32-bit or 64-bit words, so naive memcpy can be used for parameter passing. + + "Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" & + "Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it." + +proc execCudaImpl*(jitFn, numBlocks, threadsPerBlock, res, inputs, sharedMemSize: NimNode, + passStructByPointer: static bool): NimNode = + # Maybe wrap individually given arguments in a `[]` bracket, e.g. + # `execCuda(res = foo, inputs = bar)` + let res = maybeWrap res + let inputs = maybeWrap inputs + + result = newStmtList() + result.add endianCheck() + + # get the types of the inputs + let rTypes = getTypes(res) + let iTypes = getTypes(inputs) + + # determine all required `CUdeviceptr` + let devPtrs = determineDevicePtrs(res, inputs, iTypes, passStructByPointer) + + # generate device pointers, allocate memory and copy data + for x in devPtrs: + # `var rGPU: CUdeviceptr` + result.add nnkVarSection.newTree( + nnkIdentDefs.newTree( + x[0], + ident"CUdeviceptr", + newEmptyNode() + ) + ) + + # `check cuMemAlloc(rGPU, csize_t sizeof(r))` + result.add( + check nnkCall.newTree( + ident"cuMemAlloc", + x[0], + csize_t getSizeOf(x[1]) + ) + ) + # `check cuMemcpyHtoD(aGPU, a.addr, csize_t sizeof(a))` + result.add( + check nnkCall.newTree( + ident"cuMemcpyHtoD", + x[0], + maybeAddress x[1], + csize_t getSizeOf(x[1]) + ) + ) + + # Generate local variables + let (decl, vars) = genLocalVars(inputs) + result.add decl + + # assemble the parameters + let pId = ident"params" + let params = genParams(pId, res, vars, iTypes, passStructByPointer) + result.add params + + # launch the kernel + result.add quote do: + let pAr = if `pId`.len > 0: `pId`[0].unsafeAddr + else: nil + + # Create timing events + var start, stop: cudaEvent_t + check cudaEventCreate(addr start) + check cudaEventCreate(addr stop) + + check cudaEventRecord(start, nil) + check cuLaunchKernel( + CUfunction(`jitFn`), # dummy conversion on NVRTC, required on LLVM + `numBlocks`, 1, 1, # grid(x, y, z) + `threadsPerBlock`, 1, 1, # block(x, y, z) + sharedMemBytes = `sharedMemSize`.uint32, + CUstream(nil), + pAr, nil) + check cudaDeviceSynchronize() + check cudaEventRecord(stop, nil) + check cudaEventSynchronize(stop) + + var elapsedTime: float32 + check cudaEventElapsedTime(addr elapsedTime, start, stop) + echo "[INFO]: Kernel execution took: ", elapsedTime, " ms" + + check cudaEventDestroy(start) + check cudaEventDestroy(stop) + + + # copy back results + let devPtrsRes = determineDevicePtrs(res, nnkBracket.newTree(), @[], passStructByPointer) + for x in devPtrsRes: + result.add( + check nnkCall.newTree( + ident"cuMemcpyDtoH", + maybeAddress x[1], + x[0], + csize_t getSizeOf(x[1]) + ) + ) + + # free memory + for x in devPtrs: + result.add( + check nnkCall.newTree( + ident"cuMemFree", + x[0] + ) + ) + result = quote do: + block: + `result` + +macro execCuda*(jitFn: CUfunction, + res: typed, + inputs: typed): untyped = + ## Given a CUDA function, execute the kernel. Copies all non trivial data types to + ## to the GPU via `cuMemcpyHtoD`. Any argument given as `res` will be copied back + ## from the GPU after kernel execution finishes. + ## + ## IMPORTANT: + ## The arguments passed to the CUDA kernel will be in the order in which they are + ## given to the macro. This especially means `res` arguments will be passed first. + ## + ## Example: + ## ```nim + ## execCuda(fn, res = [r, s], inputs = [a, b, c]) # if all arguments have the same type + ## # or + ## execCuda(fn, res = (r, s), inputs = (a, b, c)) # if different types + ## ``` + ## will pass the parameters as `[r, s, a, b, c]`. + ## + ## For more examples see the test case `tests/gpu/t_exec_literals_consts.nim`. + ## + ## We do not perform any checks on whether the given types are valid as arguments to + ## the CUDA target! Also, all arguments given as `res` are expected to be copied. + ## To return a value for a simple data type, use a `ptr X` type. However, it is allowed + ## to simply pass a `var` symbol as a `res` argument. We automatically copy to the + ## the memory location. + ## + ## We also copy all `res` data to the GPU, so that a return value can also be used + ## as an input. + ## + ## NOTE: This function is mainly intended for convenient execution of a single kernel + result = execCudaImpl(jitFn, newLit 1, newLit 1, res, inputs, newLit 0, passStructByPointer = false) + +macro execCuda*(jitFn: CUfunction, + numBlocks, threadsPerBlock: int, + res: typed, + inputs: typed): untyped = + ## Overload which takes a target number of threads and blocks + result = execCudaImpl(jitFn, numBlocks, threadsPerBlock, res, inputs, newLit 0, passStructByPointer = false) + +macro execCuda*(jitFn: CUfunction, + numBlocks, threadsPerBlock: int, + res: typed, + inputs: typed, + sharedMemSize: typed): untyped = + ## Overload which takes a target number of threads and blocks and a shared memory size + result = execCudaImpl(jitFn, numBlocks, threadsPerBlock, res, inputs, sharedMemSize, passStructByPointer = false) + +macro execCuda*(jitFn: CUfunction, + res: typed): untyped = + ## Overload of the above for empty `inputs` + result = execCudaImpl(jitFn, newLit 1, newLit 1, res, nnkBracket.newTree(), newLit 0, passStructByPointer = false) diff --git a/constantine/math_compiler/experimental/nim_ast_to_cuda_ast.nim b/constantine/math_compiler/experimental/nim_ast_to_cuda_ast.nim new file mode 100644 index 000000000..4521e4f68 --- /dev/null +++ b/constantine/math_compiler/experimental/nim_ast_to_cuda_ast.nim @@ -0,0 +1,1142 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import std / [macros, strutils, sequtils, options, sugar, tables, strformat] + +type + GpuNodeKind = enum + gpuVoid # Just an empty statement. Useful to not emit anything + gpuProc # Function definition (both device and global) + gpuCall # Function call + gpuTemplateCall # Call to a Nim template + gpuIf # If statement + gpuFor # For loop + gpuWhile # While loop + gpuBinOp # Binary operation + gpuVar # Variable declaration + gpuAssign # Assignment + gpuIdent # Identifier + gpuLit # Literal value + gpuArrayLit # Literal array constructor `[1, 2, 3]` + gpuPrefix # Prefix e.g. `-` + gpuBlock # Block of statements + gpuReturn # Return statement + gpuDot # Member access (a.b) + gpuIndex # Array indexing (a[b]) + gpuTypeDef # Type definition + gpuObjConstr # Object (struct) constructor + gpuInlineAsm # Inline assembly (PTX) + gpuAddr # Address of an expression + gpuDeref # Dereferences an expression + gpuCast # Cast expression + gpuComment # Just a comment + gpuConstexpr # A `constexpr`, i.e. compile time constant (Nim `const`) + + GpuTypeKind = enum + gtVoid, + gtBool, gtUint8, gtUint16, gtInt16, gtUint32, gtInt32, gtUint64, gtInt64, gtFloat32, gtFloat64, gtSize_t, # atomics + gtArray, # Static array `array[N, dtype]` -> `dtype[N]` + gtString, + gtObject, # Struct types + gtPtr, # Pointer type, carries inner type + gtVoidPtr # Opaque void pointer + + GpuTypeField = object + name: string + typ: GpuType + + GpuType = ref object + case kind: GpuTypeKind + of gtPtr: to: GpuType # points to `to` + of gtObject: + name: string + oFields: seq[GpuTypeField] + of gtArray: + aTyp: GpuType # the inner type (must be some atomic base type at the moment) + aLen: int # The length of the array. If `aLen == -1` we look at a generic (static) array. Will be given at instantiation time + else: discard + + GpuAttribute = enum + attDevice = "__device__" + attGlobal = "__global__" + attForceInline = "__forceinline__" + + GpuVarAttribute = enum + atvExtern = "extern" + atvShared = "__shared__" + atvVolatile = "volatile" + atvConstant = "__constant__" # use `{.constant.}` pragma, e.g. `var foo {.constant.}` + + GpuAst = ref object + case kind: GpuNodeKind + of gpuVoid: discard + of gpuProc: + pName: string + pRetType: GpuType + pParams: seq[tuple[name: string, typ: GpuType]] + pBody: GpuAst + pAttributes: set[GpuAttribute] # order not important, hence set + of gpuCall: + cName: string + cArgs: seq[GpuAst] + of gpuTemplateCall: + tcName: string + tcArgs: seq[GpuAst] # Arguments for template instantiation + of gpuIf: + ifCond: GpuAst + ifThen: GpuAst + ifElse: Option[GpuAst] # None if no else branch + of gpuFor: + fVar: string + fStart, fEnd: GpuAst + fBody: GpuAst + of gpuWhile: + wCond: GpuAst + wBody: GpuAst + of gpuBinOp: + bOp: string + bLeft, bRight: GpuAst + of gpuVar: + vName: string + vType: GpuType + vInit: GpuAst + vRequiresMemcpy: bool + vAttributes: seq[GpuVarAttribute] # order is important, hence seq + of gpuAssign: + aLeft, aRight: GpuAst + aRequiresMemcpy: bool + of gpuIdent: + iName: string + of gpuLit: + lValue: string + lType: GpuType + of gpuConstexpr: + cIdent: GpuAst # the identifier + cValue: GpuAst # not just a string to support different types easily + cType: GpuType + of gpuArrayLit: + aValues: seq[string] + aLitType: GpuType # type of first element + of gpuBlock: + blockLabel: string # optional name of the block. If any given, will open a `{ }` scope in CUDA + statements: seq[GpuAst] + of gpuReturn: + rValue: GpuAst + of gpuDot: + dParent: GpuAst + dField: GpuAst #string + of gpuIndex: + iArr: GpuAst + iIndex: GpuAst + of gpuPrefix: + pOp: string + pVal: GpuAst + of gpuTypeDef: + tName: string + tFields: seq[GpuTypeField] + of gpuObjConstr: + ocName: string # type we construct + ## XXX: it would be better if we already fill the fields with default values here + ocFields: seq[GpuFieldInit] # the fields we initialize + of gpuInlineAsm: + stmt: string + of gpuComment: + comment: string + of gpuCast: + cTo: GpuType # type to cast to + cExpr: GpuAst # expression we cast + of gpuAddr: + aOf: GpuAst + of gpuDeref: + dOf: GpuAst + + GpuFieldInit = object + name: string + value: GpuAst + + TemplateInfo = object + params: seq[string] + body: GpuAst + + GpuContext = object + ## XXX: need table for generic invocations. Then when we encounter a type, need to map to + ## the specific version + ## However, also need to keep every *generic procedure*. In their bodies the types are + ## only defined once they are called after all. + skipSemicolon: bool # whether we *currently* add semicolons at the end of a block or not + templates: Table[string, TemplateInfo] # Maps template names to their info + +template nimonly*(): untyped {.pragma.} +template cudaName*(s: string): untyped {.pragma.} + + +proc `$`(x: GpuType): string = + if x == nil: + result = "GpuType(nil)" + else: + result = $x[] + +proc nimToGpuType(n: NimNode): GpuType + +proc initGpuType(kind: GpuTypeKind): GpuType = + ## If `kind` is `gtPtr` `to` must be the type we point to + if kind in [gtObject, gtPtr, gtArray]: raiseAssert "Objects/Pointers/Arrays must be constructed using `initGpuPtr/Object/ArrayType` " + result = GpuType(kind: kind) + +proc initGpuPtrType(to: GpuType): GpuType = + ## If `kind` is `gtPtr` `to` must be the type we point to + result = GpuType(kind: gtPtr, to: to) + +proc initGpuVoidPtr(): GpuType = + result = GpuType(kind: gtVoidPtr) + +proc initGpuObjectType(name: string, flds: seq[GpuTypeField]): GpuType = + ## If `kind` is `gtPtr` `to` must be the type we point to + result = GpuType(kind: gtObject, name: name, oFields: flds) + +proc initGpuArrayType(aTyp: NimNode, len: int): GpuType = + ## Construct an statically sized array type + result = GpuType(kind: gtArray, aTyp: nimToGpuType(aTyp), aLen: len) + +proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind = + case t + #of ntyBool, ntyChar: + # , ntyEmpty, ntyAlias, ntyNil, ntyExpr, ntyStmt, ntyTypeDesc, ntyGenericInvocation, ntyGenericBody, ntyGenericInst, ntyGenericParam, ntyDistinct, ntyEnum, ntyOrdinal, ntyArray, ntyObject, ntyTuple, ntySet, ntyRange, ntyPtr, ntyRef, ntyVar, ntySequence, ntyProc, + #of ntyPointer, ntyUncheckedArray, ntyOpenArray, ntyString, ntyCString + # , ntyForward, ntyInt, ntyInt8, + of ntyBool: gtBool + of ntyInt16: gtInt16 + of ntyInt32: gtInt32 + of ntyInt64: gtInt64 + of ntyInt: gtInt64 + of ntyFloat: gtFloat64 + of ntyFloat32: gtFloat32 + of ntyFloat64: gtFloat64 + #of ntyFloat128: gtFloat128 + of ntyUInt: gtUint64 + of ntyUInt8: gtUint8 + of ntyUInt16: gtUint16 + of ntyUInt32: gtUint32 + of ntyUInt64: gtUint64 + else: + raiseAssert "Not supported yet: " & $t + +proc unpackGenericInst(t: NimNode): NimNode = + let tKind = t.typeKind + if tKind == ntyGenericInst: + let impl = t.getTypeImpl() + case impl.kind + of nnkDistinctTy: # just skip the distinct + result = impl[0] + else: + raiseAssert "Unsupport type so far: " & $t.treerepr & " of impl: " & $impl.treerepr + else: + result = t + +proc toGpuTypeKind(t: NimNode): GpuTypeKind = + result = t.unpackGenericInst().typeKind.toGpuTypeKind() + +proc getInnerPointerType(n: NimNode): GpuType = + doAssert n.typeKind in {ntyPtr, ntyPointer, ntyUncheckedArray, ntyVar} or n.kind == nnkPtrTy, "But was: " & $n.treerepr & " of typeKind " & $n.typeKind + if n.typeKind in {ntyPointer, ntyUncheckedArray}: + let typ = n.getTypeInst() + doAssert typ.kind == nnkBracketExpr, "No, was: " & $typ.treerepr + doAssert typ[0].kind in {nnkIdent, nnkSym} + doAssert typ[0].strVal in ["ptr", "UncheckedArray"] + result = nimToGpuType(typ[1]) + elif n.kind == nnkPtrTy: + result = nimToGpuType(n[0]) + elif n.kind == nnkAddr: + let typ = n.getTypeInst() + result = getInnerPointerType(typ) + elif n.kind == nnkVarTy: + # VarTy + # Sym "BigInt" + result = nimToGpuType(n[0]) + else: + raiseAssert "Found what: " & $n.treerepr + +proc determineArrayLength(n: NimNode): int = + case n[1].kind + of nnkSym: + # likely a constant, try to get its value + result = n[1].getImpl.intVal + of nnkIdent: + let msg = """Found array with length given by identifier: $#! +You might want to create a typed template taking a typed parameter for this +constant to force the Nim compiler to bind the symbol. +""" % n[1].strVal + raiseAssert msg + else: + case n[1].kind + of nnkIntLit: result = n[1].intVal + else: + #doAssert n[1].kind == nnkIntLit, "No is: " & $n.treerepr + doAssert n[1].kind == nnkInfix, "No is: " & $n.treerepr + doAssert n[1][1].kind == nnkIntLit, "No is: " & $n.treerepr + doAssert n[1][1].intVal == 0, "No is: " & $n.treerepr + result = n[1][2].intVal + 1 + +proc getTypeName(n: NimNode): string = + ## Returns the name of the type + case n.kind + of nnkIdent, nnkSym: result = n.strVal + of nnkObjConstr: result = n.getTypeInst.strVal + else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr + +proc parseTypeFields(node: NimNode): seq[GpuTypeField] +proc nimToGpuType(n: NimNode): GpuType = + ## Maps a Nim type to a type on the GPU + case n.kind + of nnkIdentDefs: # extract type for let / var based on explicit or implicit type + if n[n.len - 2].kind != nnkEmpty: # explicit type + result = nimToGpuType(n[n.len - 2]) + else: # take from last element + result = nimToGpuType(n[n.len - 1].getTypeInst()) + of nnkConstDef: + if n[1].kind != nnkEmpty: # has an explicit type + result = nimToGpuType(n[1]) + else: + result = nimToGpuType(n[2]) # derive from the RHS literal + else: + if n.kind == nnkEmpty: return initGpuType(gtVoid) + case n.typeKind + of ntyBool, ntyInt .. ntyUint64: # includes all float types + result = initGpuType(toGpuTypeKind n.typeKind) + of ntyPtr, ntyVar: + result = initGpuPtrType(getInnerPointerType(n)) + of ntyPointer: + result = initGpuVoidPtr() + of ntyUncheckedArray: + ## Note: this is just the internal type of the array. It is only a pointer due to + ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part. + result = getInnerPointerType(n) + of ntyObject: + let impl = n.getTypeImpl + let flds = impl.parseTypeFields() + let typName = getTypeName(n) # might be an object construction + result = initGpuObjectType(typName, flds) + of ntyArray: + # For a generic, static array type, e.g.: + if n.kind == nnkSym: + return nimToGpuType(getTypeImpl(n)) + if n.len == 3: + # BracketExpr + # Sym "array" + # Ident "N" + # Sym "uint32" + doAssert n.len == 3, "Length was not 3, but: " & $n.len & " for node: " & n.treerepr + doAssert n[0].strVal == "array" + let len = determineArrayLength(n) + result = initGpuArrayType(n[2], len) + else: + # just an array literal + # Bracket + # UIntLit 2013265921 + let len = n.len + result = initGpuArrayType(n[0], len) + #of ntyCompositeTypeClass: + # echo n.getTypeImpl.treerepr + # error("o") + of ntyGenericInvocation: + result = initGpuType(gtVoid) + error("Generics are not supported in the CUDA DSL so far.") + of ntyGenericInst: + result = n.unpackGenericInst().nimToGpuType() + else: raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr + +proc assignOp(op: string, isBoolean: bool): string = + ## Returns the correct CUDA operation given the Nim operator. + ## This is to replace things like `shl`, `div` or `mod` + case op + of "div": result = "/" + of "mod": result = "%" + of "shl": result = "<<" + of "shr": result = ">>" + of "and": result = if isBoolean: "&&" else: "&" # bitwise OR + of "or": result = if isBoolean: "||" else: "|" # bitwise OR + of "xor": result = "^" + else: result = op + +proc assignPrefixOp(op: string): string = + ## Returns the correct CUDA operation given the Nim operator. + case op + of "not": result = "!" + else: result = op + +proc parseTypeFields(node: NimNode): seq[GpuTypeField] = + doAssert node.kind == nnkObjectTy + doAssert node[2].kind == nnkRecList + for ch in node[2]: + doAssert ch.kind == nnkIdentDefs and ch.len == 3 + result.add GpuTypeField(name: ch[0].strVal, + typ: nimToGpuType(ch[1])) + +template findIdx(col, el): untyped = + var res = -1 + for i, it in col: + if it.name == el: + res = i + break + res + +proc ensureBlock(ast: GpuAst): GpuAst = + ## Ensures the body is a block, e.g. if single statement in a for loop, we want the + ## body to be a block regardless. + if ast.kind == gpuBlock: ast + else: GpuAst(kind: gpuBlock, statements: @[ast]) + +proc requiresMemcpy(n: NimNode): bool = + ## At the moment we only emit a `memcpy` statement for array types + result = n.typeKind == ntyArray and n.kind != nnkBracket # need to emit a memcpy + +proc getFnName(n: NimNode): string = + ## Returns the name for the function. Either the symbol name _or_ + ## the `{.cudaName.}` pragma argument. + # check if the implementation has a pragma + if n.kind == nnkSym: + # Check if `cudaName` pragma used: + # ProcDef + # Sym "syncthreads" + # Empty + # Empty + # FormalParams + # Empty + # Pragma + # ExprColonExpr + # Sym "cudaName" <- if this exists + # StrLit "__syncthreads" <- use this name + # Empty + # DiscardStmt + # Empty + let impl = n.getImpl + if impl.kind in [nnkProcDef, nnkFuncDef]: + let pragma = impl.pragma + if pragma.kind != nnkEmpty and pragma[0].kind == nnkExprColonExpr: + if pragma[0][0].kind in [nnkIdent, nnkSym] and pragma[0][0].strVal == "cudaName": + return pragma[0][1].strVal # return early to avoid lots of branches + # else we use the str representation (repr for open / closed sym choice nodes) + result = n.repr + + + +proc collectProcAttributes(n: NimNode): set[GpuAttribute] = + doAssert n.kind == nnkPragma + for pragma in n: + doAssert pragma.kind in [nnkIdent, nnkSym], "Unexpected node kind: " & $pragma.treerepr + case pragma.strVal + of "device": result.incl attDevice + of "global": result.incl attGlobal + of "forceinline": result.incl attForceInline + of "nimonly": + # used to fully ignore functions! + return + else: + raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr + +proc collectAttributes(n: NimNode): seq[GpuVarAttribute] = + ## Collects all pragmas associated with the given variable. + ## Takes the `nnkPragma` node of the `nnkIdentDefs` associated with it. + # Example AST with multiple pragmas + # IdentDefs + # PragmaExpr + # Sym "sharedMem" + # Pragma + # Sym "cuExtern" + # Sym "shared" + # BracketExpr + # Sym "array" + # IntLit 0 + # Sym "BigInt" + # Empty + doAssert n.kind == nnkPragma + for pragma in n: + doAssert pragma.kind in [nnkIdent, nnkSym], "Unexpected node kind: " & $pragma.treerepr + # NOTE: We don't use `parseEnum`, because on the Nim side some of the attributes + # do not match the CUDA string we need to emit, which is what the string value of + # the `GpuVarAttribute` enum stores + case pragma.strVal + of "cuExtern", "extern": result.add atvExtern + of "shared": result.add atvShared + of "volatile": result.add atvVolatile + of "constant": result.add atvConstant + else: + raiseAssert "Unexpected pragma: " & $pragma.treerepr + +proc toGpuAst(ctx: var GpuContext, node: NimNode): GpuAst = + ## XXX: things still left to do: + ## - support `result` variable? Currently not supported. Maybe we will won't + + #echo node.treerepr + case node.kind + of nnkEmpty: result = GpuAst(kind: gpuVoid) # nothing to do + of nnkStmtList: + result = GpuAst(kind: gpuBlock) + for el in node: + result.statements.add ctx.toGpuAst(el) + of nnkBlockStmt: + # BlockStmt + # Sym "unrolledIter_i0" <- ignore the block label for now! + # Call + # Sym "printf" + # StrLit "i = %u\n" + # IntLit 0 + let blockLabel = if node[0].kind in {nnkSym, nnkIdent}: node[0].strVal + elif node[0].kind == nnkEmpty: "" + else: raiseAssert "Unexpected node in block label field: " & $node.treerepr + result = GpuAst(kind: gpuBlock, + blockLabel: blockLabel) + for i in 1 ..< node.len: # index 0 is the block label + result.statements.add ctx.toGpuAst(node[i]) + of nnkStmtListExpr: # for statements that return a value. + ## XXX: For CUDA just a block? + result = GpuAst(kind: gpuBlock) + for el in node: + if el.kind != nnkEmpty: + result.statements.add ctx.toGpuAst(el) + of nnkDiscardStmt: + # just process the child node if any + result = ctx.toGpuAst(node[0]) + + of nnkProcDef, nnkFuncDef: + result = GpuAst(kind: gpuProc) + result.pName = node.name.strVal + doAssert node[3].kind == nnkFormalParams + result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type + # Process parameters + for i in 1 ..< node[3].len: + let param = node[3][i] + let numParams = param.len - 2 # 3 if one param, one more for each of same type, example: + let typIdx = param.len - 2 # second to last is the type + # IdentDefs + # Ident "x" + # Ident "y" + # Ident "res" + # PtrTy + # Ident "float32" # `param.len - 2` + # Empty # `param.len - 1` + let paramType = nimToGpuType(param[typIdx]) + for i in 0 ..< numParams: + result.pParams.add((param[i].strVal, paramType)) + + # Process pragmas + if node.pragma.kind != nnkEmpty: + doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?" + result.pAttributes = collectProcAttributes(node.pragma) + if result.pAttributes.len == 0: # means `nimonly` was applied + return GpuAst(kind: gpuVoid) + + result.pBody = ctx.toGpuAst(node.body) + .ensureBlock() # single line procs should be a block to generate `;` + + of nnkLetSection, nnkVarSection: + # For a section with multiple declarations, create a block + result = GpuAst(kind: gpuBlock) + for declaration in node: + # Each declaration gets converted to a gpuVar + var varNode = GpuAst(kind: gpuVar) + case declaration[0].kind + of nnkIdent, nnkSym: + # IdentDefs # declaration + # Sym "res" # declaration[0] + # Sym "uint32" + # Empty + varNode.vName = declaration[0].strVal + of nnkPragmaExpr: + # IdentDefs # declaration + # PragmaExpr # declaration[0] + # Sym "res" # declaration[0][0] + # Pragma # declaration[0][1] + # Ident "volatile" + # Sym "uint32" + # Empty + varNode.vName = declaration[0][0].strVal + doAssert declaration[0][1].kind == nnkPragma + varNode.vAttributes = collectAttributes(declaration[0][1]) + else: raiseAssert "Unexpected node kind for variable: " & $declaration.treeRepr + varNode.vType = nimToGpuType(declaration) + ## XXX: handle initialization for array types. Need a memcpy! + ## In principle should be straightforward. Turn e.g. + ## ```nim + ## let someData: array[8, uint32] = foo() + ## let x = BigInt(limbs: someData) + ## ``` + ## into + ## ```cuda + ## unsigned int someData[8] = foo(); + ## BigInt x = {{}}; + ## memcpy((&x.limbs), (&someData), sizeof(unsigned int) * 8); + ## ``` + ## Or something along those lines. + if declaration.len > 2 and declaration[2].kind != nnkEmpty: # Has initialization + varNode.vInit = ctx.toGpuAst(declaration[2]) + varNode.vRequiresMemcpy = requiresMemcpy(declaration[2]) + result.statements.add(varNode) + + of nnkAsgn: + result = GpuAst(kind: gpuAssign) + result.aLeft = ctx.toGpuAst(node[0]) + result.aRight = ctx.toGpuAst(node[1]) + result.aRequiresMemcpy = requiresMemcpy(node[1]) + + of nnkIfStmt: + result = GpuAst(kind: gpuIf) + let branch = node[0] # First branch + result.ifCond = ctx.toGpuAst(branch[0]) + result.ifThen = ensureBlock ctx.toGpuAst(branch[1]) + if node.len > 1 and node[^1].kind == nnkElse: + result.ifElse = some(ensureBlock ctx.toGpuAst(node[^1][0])) + + of nnkForStmt: + result = GpuAst(kind: gpuFor) + result.fVar = node[0].strVal + # Assuming range expression + result.fStart = ctx.toGpuAst(node[1][1]) + result.fEnd = ctx.toGpuAst(node[1][2]) + result.fBody = ensureBlock ctx.toGpuAst(node[2]) + of nnkWhileStmt: + result = GpuAst(kind: gpuWhile) + result.wCond = ctx.toGpuAst(node[0]) # the condition + result.wBody = ensureBlock ctx.toGpuAst(node[1]) + + of nnkTemplateDef: + ## NOTE: Currently we process templates, but we expect them to be already + ## expanded by the Nim compiler. Thus we could in theory expand them manually + ## but fortunately we don't need to. + let tName = node[0].strVal + + # Extract parameters + var tParams = newSeq[string]() + for i in 1 ..< node[3].len: + let param = node[3][i] + tParams.add param[0].strVal + # and the body + let tBody = ctx.toGpuAst(node.body) + + # Store template in context + ctx.templates[tName] = TemplateInfo( + params: tParams, + body: tBody + ) + + result = GpuAst(kind: gpuVoid) + + of nnkCall, nnkCommand: + # Check if this is a template call + let name = getFnName(node[0]) # cannot use `strVal`, might be a symchoice + let args = node[1..^1].mapIt(ctx.toGpuAst(it)) + # Producing a template call something like this (but problematic due to overloads etc) + # we could then perform manual replacement of the template in the CUDA generation pass. + if false: # name in ctx.templates: # + result = GpuAst(kind: gpuTemplateCall) + result.tcName = name + result.tcArgs = args + else: + result = GpuAst(kind: gpuCall) + result.cName = name + result.cArgs = args + + of nnkInfix: + result = GpuAst(kind: gpuBinOp) + # if left/right is boolean we need logical AND/OR, otherwise + # bitwise + let isBoolean = node[1].typeKind == ntyBool + result.bOp = assignOp(node[0].repr, isBoolean) # repr so that open sym choice gets correct name + result.bLeft = ctx.toGpuAst(node[1]) + result.bRight = ctx.toGpuAst(node[2]) + + of nnkDotExpr: + result = GpuAst(kind: gpuDot) + result.dParent = ctx.toGpuAst(node[0]) + result.dField = ctx.toGpuAst(node[1]) + + of nnkBracketExpr: + result = GpuAst(kind: gpuIndex) + result.iArr = ctx.toGpuAst(node[0]) + result.iIndex = ctx.toGpuAst(node[1]) + + of nnkIdent, nnkSym, nnkOpenSymChoice: + result = GpuAst(kind: gpuIdent) + result.iName = node.repr # for sym choices + + # literal types + of nnkIntLit, nnkInt32Lit: + result = GpuAst(kind: gpuLit) + result.lValue = $node.intVal + result.lType = initGpuType(gtInt32) + of nnkUInt32Lit: + result = GpuAst(kind: gpuLit) + result.lValue = $node.intVal + result.lType = initGpuType(gtUInt32) + of nnkFloat64Lit, nnkFloatLit: + result = GpuAst(kind: gpuLit) + result.lValue = $node.floatVal & "f" + result.lType = initGpuType(gtFloat64) + of nnkFloat32Lit: + result = GpuAst(kind: gpuLit) + result.lValue = $node.floatVal & "f" + result.lType = initGpuType(gtFloat32) + of nnkRStrLit: + result = GpuAst(kind: gpuLit) + result.lValue = node.strVal + result.lType = initGpuType(gtString) + of nnkStrLit: + # For regular string literals escape them (but don't prefix/suffix with `"`) + result = GpuAst(kind: gpuLit) + result.lValue = node.strVal.escape("", "") + result.lType = initGpuType(gtString) + of nnkNilLit: + result = GpuAst(kind: gpuLit) + result.lValue = "NULL" + result.lType = initGpuVoidPtr() + + of nnkPar: + if node.len == 1: # just take body + result = ctx.toGpuAst(node[0]) + else: + error("`nnkPar` with more than one argument currently not supported. Got: " & $node.treerepr) + + of nnkReturnStmt: + if node[0].kind == nnkAsgn and node[0][0].strVal == "result": + # skip the result and just get the RHS + result = GpuAst(kind: gpuReturn, + rValue: ctx.toGpuAst(node[0][1])) + else: + result = GpuAst(kind: gpuReturn, + rValue: ctx.toGpuAst(node[0])) + + of nnkPrefix: + result = GpuAst(kind: gpuPrefix, + pVal: ctx.toGpuAst(node[1])) + result.pOp = assignPrefixOp(node[0].strVal) + + of nnkTypeSection: + result = GpuAst(kind: gpuBlock) + for el in node: # walk each type def + doAssert el.kind == nnkTypeDef + result.statements.add ctx.toGpuAst(el) + of nnkTypeDef: + result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal) + result.tFields = parseTypeFields(node[2]) + of nnkObjConstr: + let typName = getTypeName(node) + result = GpuAst(kind: gpuObjConstr, ocName: typName) + # get all fields of the type + let flds = node[0].getTypeImpl.parseTypeFields() # sym + # find all fields that have been defined by the user + var ocFields: seq[GpuFieldInit] + for i in 1 ..< node.len: # all fields to be init'd + doAssert node[i].kind == nnkExprColonExpr + ocFields.add GpuFieldInit(name: node[i][0].strVal, + value: ctx.toGpuAst(node[i][1])) + # now add fields in order of the type declaration + for i in 0 ..< flds.len: + let idx = findIdx(ocFields, flds[i].name) + if idx >= 0: + result.ocFields.add ocFields[idx] + else: + let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid)) + result.ocFields.add GpuFieldInit(name: flds[i].name, + value: dfl) + + of nnkAsmStmt: + doAssert node.len == 2 + doAssert node[0].kind == nnkEmpty + result = GpuAst(kind: gpuInlineAsm, + stmt: node[1].strVal) + + of nnkBracket: + let aLitTyp = nimToGpuType(node[0]) + var aValues = newSeq[string]() + for el in node: + ## XXX: do not use `repr`, e.g. if `1'u32` we'll get the `'u32` suffix + aValues.add $el.intVal + result = GpuAst(kind: gpuArrayLit, + aValues: aValues, + aLitType: aLitTyp) + + of nnkCommentStmt: + result = GpuAst(kind: gpuComment, comment: node.strVal) + + of nnkHiddenStdConv: + doAssert node[0].kind == nnkEmpty + result = ctx.toGpuAst(node[1]) + of nnkCast, nnkConv: + # also map type conversion, e.g. `let i: int = 5; i.uint32` to a cast + result = GpuAst(kind: gpuCast, cTo: nimToGpuType(node[0]), cExpr: ctx.toGpuAst(node[1])) + + of nnkAddr, nnkHiddenAddr: + # `HiddenAddr` appears for accesses to `var` passed arguments + result = GpuAst(kind: gpuAddr, aOf: ctx.toGpuAst(node[0])) + + of nnkHiddenDeref: + case node.typeKind + of ntyUncheckedArray: + # `getTypeInst(node)` would yield: + # BracketExpr + # Sym "UncheckedArray" + # Sym "uint32" + # i.e. it is a `ptr UncheckedArray[T]` + # In this case we just ignore the deref, because on the CUDA + # side it is just a plain pointer array we index into using + # `foo[i]`. + result = ctx.toGpuAst(node[0]) + else: + # Otherwise we treat it like a regular deref + # HiddenDeref + # Sym "x" + # With e.g. `getTypeInst(node) = Sym "BigInt"` + # and `node.typeKind = ntyObject` + # due to a `var` parameter + result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0])) + of nnkDerefExpr: #, nnkHiddenDeref: + result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0])) + + of nnkConstDef: + result = GpuAst(kind: gpuConstexpr, + cIdent: ctx.toGpuAst(node[0]), + cValue: ctx.toGpuAst(node[2]), + cType: nimToGpuType(node)) + of nnkConstSection: + result = GpuAst(kind: gpuBlock) + for el in node: # walk each type def + doAssert el.kind == nnkConstDef + result.statements.add ctx.toGpuAst(el) + + else: + echo "Unhandled node kind in toGpuAst: ", node.kind + raiseAssert "Unhandled node kind in toGpuAst: " & $node.treerepr + result = GpuAst(kind: gpuBlock) + +proc gpuTypeToString(t: GpuTypeKind): string = + case t + of gtBool: "bool" + of gtUint8: "unsigned char" + of gtUint16: "unsigned short" + of gtUint32: "unsigned int" + of gtUint64: "unsigned long long" + of gtInt16: "short" + of gtInt32: "int" + of gtInt64: "long long" + of gtFloat32: "float" + of gtFloat64: "double" + of gtVoid: "void" + of gtSize_t: "size_t" + of gtPtr: "*" + of gtVoidPtr: "void*" + of gtObject: "struct" + of gtString: "const char*" + else: + raiseAssert "Invalid type : " & $t + + +proc gpuTypeToString(t: GpuType, ident: string = "", allowArrayToPtr = false, allowEmptyIdent = false): string +proc getInnerArrayType(t: GpuType): string = + ## Returns the name of the inner most type for a nested array. + case t.kind + of gtArray: + result = getInnerArrayType(t.aTyp) + else: + result = gpuTypeToString(t) + +proc getInnerArrayLengths(t: GpuType): string = + ## Returns the lengths of the inner array types for a nested array. + case t.kind + of gtArray: + let inner = getInnerArrayLengths(t.aTyp) + result = &"[{$t.aLen}]" + if inner.len > 0: + result.add &"{inner}" + else: + result = "" + +proc gpuTypeToString(t: GpuType, ident: string = "", allowArrayToPtr = false, + allowEmptyIdent = false, + ): string = + ## Given an optional identifier required for array types + ## + ## XXX: we don't support this at the moment, it occured to me as something that + ## could be useful sometimes... + ## If `allowArrayToPtr` we allow casting a statically sized array to a pointer + var skipIdent = false + case t.kind + of gtPtr: + if t.to.kind == gtArray: # ptr to array type + # need to pass `*` for the pointer into the identifier, i.e. + # `state: var array[4, BigInt]` + # must become + # `BigInt (*state)[4]` + # so as our ident we pass `theIdent = (*)` and generate the type for the internal + # array type, which yields e.g. `BigInt [4]`. + let ptrStar = gpuTypeToString(t.kind) + result = gpuTypeToString(t.to, "(" & ptrStar & ident & ")") + skipIdent = true + else: + let typ = gpuTypeToString(t.to, allowEmptyIdent = allowEmptyIdent) + let ptrStar = gpuTypeToString(t.kind) + result = typ & ptrStar + of gtArray: + # empty idents happen in e.g. function return types or casts + if ident.len == 0 and not allowEmptyIdent: # and not allowArrayToPtr: + error("Invalid call, got an array type but don't have an identifier: " & $t) + case t.aTyp.kind + of gtArray: # nested array + let typ = getInnerArrayType(t) # get inner most type + let lengths = getInnerArrayLengths(t) # get lengths as `[X][Y][Z]...` + result = typ & " " & ident & lengths + else: + # NOTE: Nested arrays don't have an inner identifier! + if t.aLen == 0: ## XXX: for the moment for 0 length arrays we generate flexible arrays instead + result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[]" + else: + result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[" & $t.aLen & "]" + skipIdent = true + of gtObject: result = t.name + else: result = gpuTypeToString(t.kind) + + if ident.len > 0 and not skipIdent: # still need to add ident + result.add " " & ident + +proc genFunctionType(typ: GpuType, fn: string, fnArgs: string): string = + ## Returns the correct function with its return type + if typ.kind == gtPtr and typ.to.kind == gtArray: + # crazy stuff. Syntax to return a pointer to a statically sized array: + # `Foo (*fnName(fnArgs))[ArrayLen]` + # where the return type is actually: + # `Foo (*)[ArrayLen]` (which already is hideous) + let arrayTyp = typ.to.aTyp + let innerTyp = gpuTypeToString(arrayTyp, allowEmptyIdent = true) + let innerLen = $typ.to.aLen + result = &"{innerTyp} (*{fn}({fnArgs}))[{innerLen}]" + else: + # normal stuff + result = &"{gpuTypeToString(typ, allowEmptyIdent = true)} {fn}({fnArgs})" + +proc genCuda(ctx: var GpuContext, ast: GpuAst, indent = 0): string + +proc address(a: string): string = "&" & a +proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genCuda(a)) + +proc size(a: string): string = "sizeof(" & a & ")" +proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genCuda(a)) +proc size(ctx: var GpuContext, a: GpuType): string = size(gpuTypeToString(a, allowEmptyIdent = true)) + +proc genMemcpy(lhs, rhs, size: string): string = + result = &"memcpy({lhs}, {rhs}, {size})" + +template withoutSemicolon(ctx: var GpuContext, body: untyped): untyped = + ctx.skipSemicolon = true + body + ctx.skipSemicolon = false + +proc genCuda(ctx: var GpuContext, ast: GpuAst, indent = 0): string = + let indentStr = " ".repeat(indent) + #echo "At: ", ast.repr, " SKIP SEMICOLON: ", ctx.skipSemicolon + + case ast.kind + of gpuVoid: return # nothing to emit + of gpuProc: + let attrs = collect: + for att in ast.pAttributes: + $att + + # Parameters + var params: seq[string] + for (name, typ) in ast.pParams: + params.add gpuTypeToString(typ, name, allowEmptyIdent = false) + let fnArgs = params.join(", ") + let fnSig = genFunctionType(ast.pRetType, ast.pName, fnArgs) + + # extern "C" is needed to avoid name mangling + result = indentStr & "extern \"C\" " & attrs.join(" ") & " " & + fnSig & "{\n" + + result &= ctx.genCuda(ast.pBody, indent + 1) + result &= "\n" & indentStr & "}" + + of gpuBlock: + result = "" + if ast.blockLabel.len > 0: + result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n" + for i, el in ast.statements: + result.add ctx.genCuda(el, indent) + if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added + result.add ";" + if i < ast.statements.high: + result.add "\n" + if ast.blockLabel.len > 0: + result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n" + + of gpuVar: + result = indentStr & ast.vAttributes.join(" ") & " " & gpuTypeToString(ast.vType, ast.vName) + # If there is an initialization, the type might require a memcpy + if ast.vInit != nil and not ast.vRequiresMemcpy: + result &= " = " & ctx.genCuda(ast.vInit) + elif ast.vInit != nil: + result.add ";\n" + result.add indentStr & genMemcpy(address(ast.vName), ctx.address(ast.vInit), + size(ast.vName)) + + of gpuAssign: + if ast.aRequiresMemcpy: + result = indentStr & genMemcpy(ctx.address(ast.aLeft), ctx.address(ast.aRight), + ctx.size(ast.aLeft)) + else: + result = indentStr & ctx.genCuda(ast.aLeft) & " = " & ctx.genCuda(ast.aRight) + + of gpuIf: + # skip semicolon in the condition. Otherwise can lead to problematic code + ctx.withoutSemicolon: # skip semicolon for if bodies + result = indentStr & "if (" & ctx.genCuda(ast.ifCond) & ") {\n" + result &= ctx.genCuda(ast.ifThen, indent + 1) & "\n" + result &= indentStr & "}" + if ast.ifElse.isSome: + result &= " else {\n" + result &= ctx.genCuda(ast.ifElse.get, indent + 1) & "\n" + result &= indentStr & "}" + + of gpuFor: + result = indentStr & "for(int " & ast.fVar & " = " & + ctx.genCuda(ast.fStart) & "; " & + ast.fVar & " < " & ctx.genCuda(ast.fEnd) & "; " & + ast.fVar & "++) {\n" + result &= ctx.genCuda(ast.fBody, indent + 1) & "\n" + result &= indentStr & "}" + of gpuWhile: + ctx.withoutSemicolon: + result = indentStr & "while (" & ctx.genCuda(ast.wCond) & "){\n" + result &= ctx.genCuda(ast.wBody, indent + 1) & "\n" + result &= indentStr & "}" + + of gpuDot: + result = ctx.genCuda(ast.dParent) & "." & ctx.genCuda(ast.dField) + + of gpuIndex: + result = ctx.genCuda(ast.iArr) & "[" & ctx.genCuda(ast.iIndex) & "]" + + of gpuCall: + result = indentStr & ast.cName & "(" & + ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ")" + + of gpuTemplateCall: + error("Template calls are not supported at the moment. In theory there shouldn't even _be_ any template " & + "calls in the expanded body of the `cuda` macro.") + when false: # Template replacement would look something like this: + let templ = ctx.templates[ast.tcName] + let expandedBody = substituteTemplateArgs( + templ.body, + templ.params, + ast.tcArgs + ) + result = ctx.genCuda(expandedBody, indent) + + of gpuBinOp: + result = indentStr & "(" & ctx.genCuda(ast.bLeft) & " " & + ast.bOp & " " & + ctx.genCuda(ast.bRight) & ")" + + of gpuIdent: + result = ast.iName + + of gpuLit: + if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\"" + elif ast.lValue == "DEFAULT": result = "{}" # default initialization, `DEFAULT` placeholder + else: result = ast.lValue + + of gpuArrayLit: + result = "{" + for i, el in ast.aValues: + result.add "(" & gpuTypeToString(ast.aLitType) & ")" & el + if i < ast.aValues.high: + result.add ", " + result.add "}" + + of gpuReturn: + result = indentStr & "return " & ctx.genCuda(ast.rValue) + + of gpuPrefix: + result = ast.pOp & ctx.genCuda(ast.pVal) + + of gpuTypeDef: + result = "struct " & ast.tName & "{\n" + for el in ast.tFields: + result.add " " & gpuTypeToString(el.typ, el.name) & ";\n" + result.add "}" + + of gpuObjConstr: + result = "{" + for i, el in ast.ocFields: + result.add ctx.genCuda(el.value) + if i < ast.ocFields.len - 1: + result.add ", " + result.add "}" + + of gpuInlineAsm: + result = indentStr & "asm(" & ast.stmt.strip & ");" + + of gpuComment: + result = indentStr & "/* " & ast.comment & " */" + + of gpuCast: + result = "(" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.cExpr) + + of gpuAddr: + result = "(&" & ctx.genCuda(ast.aOf) & ")" + + of gpuDeref: + result = "(*" & ctx.genCuda(ast.dOf) & ")" + + of gpuConstexpr: + if ast.cType.kind == gtArray: + result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue) + else: + result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue) + + else: + echo "Unhandled node kind in genCuda: ", ast.kind + raiseAssert "Unhandled node kind in genCuda: " & ast.repr + result = "" + +macro cuda*(body: typed): string = + ## WARNING: The following are *not* supported: + ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between + ## what is a field access and a function call. Hence we assume any `nnkDotExpr` + ## is actually a field access! + ## - most regular Nim features :) + var ctx = GpuContext() + let gpuAst = ctx.toGpuAst(body) + ## NOTE: `header` is currently unused. Not sure yet if we'll ever need it. + let header = """ +// #include "foo.h" +""" + + let body = ctx.genCuda(gpuAst) + result = newLit(header & body) + +when isMainModule: + # Mini example + let kernel = cuda: + proc square(x: float32): float32 {.device.} = + if x < 0.0'f32: + result = 0.0'f32 + else: + result = x * x + + proc computeSquares( + output: ptr float32, + input: ptr float32, + n: int32 + ) {.global.} = + let idx: uint32 = blockIdx.x * blockDim.x + threadIdx.x + if idx < n: + var temp: float32 = 0.0'f32 + for i in 0..<4: + temp += square(input[idx + i * n]) + output[idx] = temp + + echo kernel diff --git a/constantine/math_compiler/experimental/nvrtc_field_ops.nim b/constantine/math_compiler/experimental/nvrtc_field_ops.nim new file mode 100644 index 000000000..b13744cb5 --- /dev/null +++ b/constantine/math_compiler/experimental/nvrtc_field_ops.nim @@ -0,0 +1,617 @@ +import std / strformat + +import + # Internal + constantine/named/algebras, + constantine/math/io/[io_bigints, io_fields], + constantine/math/arithmetic, + constantine/platforms/abstractions {.all.}, + constantine/platforms/abis/nvidia_abi, + constantine/math_compiler/experimental/runtime_compile, + constantine/serialization/io_limbs, + constantine/named/deriv/precompute + +import constantine/platforms/abstractions +export negInvModWord + +import std / macros +macro asm_comment*(msg: typed): untyped = + var msgLit = nnkTripleStrLit.newNimNode() + msgLit.strVal = "\"// " & msg.strVal & "\"" + result = nnkAsmStmt.newTree(newEmptyNode(), msgLit) + +template bigintToUint32Limbs*(b: typed): untyped = + let limbs = b.limbs + when CTT_32: + var res = default(array[b.limbs.len, uint32]) + for i in 0 ..< limbs.len: + res[i] = limbs[i].uint32 + else: + {.error: "Logic to convert 64 bit limbs to 32 bit limbs at compile time still unfinished.".} + # need twice as many limbs to go from 64bit to 32bit + ## XXX: Use number of bits required to check if the + ## last limbs needs to be dropped + var res = default(array[b.limbs.len * 2, uint32]) + for i in 0 ..< b.limbs.len: + res[i*2] = limbs[i].uint32 + res[i*2 + 1] = (limbs[i] shr 32).uint32 + res + +template defBigInt*(N: typed): untyped {.dirty.} = + # Utility for add with carry operations + type + BigInt = object + limbs: array[N, uint32] + template `[]`(x: BigInt, idx: int): untyped = x.limbs[idx] + template `[]=`(x: BigInt, idx: int, val: uint32): untyped = x.limbs[idx] = val + template `[]`(x: ptr BigInt, idx: int): untyped = x[].limbs[idx] + template `[]=`(x: ptr BigInt, idx: int, val: uint32): untyped = x[].limbs[idx] = val + + template len(x: BigInt): int = N + +template defPtxHelpers*(): untyped {.dirty.} = + ## Note: the below would just be generated from a macro of course, similar to + ## `constantine/platforms/llvm/asm_nvidia.nim`. + + ## IMPORTANT NOTE: For the below procs that define inline PTX statements: + ## It is very important (in the current implementation) that each of the + ## return values is marked `{.volatile.}` so that the NVRTC compiler does not + ## eliminate any of the function calls. Despite them being `__forceinline__`, + ## it might do such a thing if the return value is not used. + + proc add_cio(a, b: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"addc.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc add_ci(a, b: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"addc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc add_co(a, b: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"add.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc sub_bo(a, b: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"sub.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc sub_bi(a, b: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"subc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc sub_bio(a, b: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"subc.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc slct(a, b: uint32, pred: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 +# "slct.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(pred) + asm """ +"slct.u32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(pred) +""" + return res + + proc mul_lo(a, b: uint32): uint32 {.device, forceinline.} = + ## Returns the lower 32 bit of the uint32 multiplication, i.e. + ## behaves as unsigned multiplication modulo 2^32 (matches LLVM `mul`). + var res {.volatile.}: uint32 + asm """ +"mul.lo.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc mul_hi(a, b: uint32): uint32 {.device, forceinline.} = + ## Returns the upper 32 bit of the uint32 multiplication + var res {.volatile.}: uint32 + asm """ +"mul.hi.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + + # r <- a * b + c + proc mulloadd(a, b, c: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"mad.lo.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c) +""" + return res + + proc mulloadd_co(a, b, c: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c) +""" + return res + + proc mulloadd_ci(a, b, c: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"madc.lo.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c) +""" + return res + + proc mulloadd_cio(a, b, c: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c) +""" + return res + + # r <- (a * b) >> 32 + c + # r <- (a * b) >> 64 + c + proc mulhiadd(a, b, c: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"mad.hi.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c) +""" + return res + + proc mulhiadd_co(a, b, c: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c) +""" + return res + + proc mulhiadd_ci(a, b, c: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"madc.hi.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c) +""" + return res + + proc mulhiadd_cio(a, b, c: uint32): uint32 {.device, forceinline.} = + var res {.volatile.}: uint32 + asm """ +"madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c) +""" + return res + +template defCoreFieldOps*(T: typed): untyped {.dirty.} = + # Need to get the limbs & spare bits data in a static context + template getM0ninv(): untyped = static: T.getModulus().negInvModWord().uint32 + template spareBits(): untyped = static: (BigInt().limbs.len * WordSize - T.bits()) + + ## TODO: avoid the explicit array size here + proc toBigInt(limbs: array[1, uint32]): BigInt {.nimonly.} = + result.limbs = limbs + + const M = toBigInt(bigintToUint32Limbs(T.getModulus)) + const MontyOne = toBigInt(bigintToUint32Limbs(T.getMontyOne)) + const PP1D2 = toBigInt(bigintToUint32Limbs(T.getPrimePlus1div2)) + const M0NInv = getM0ninv() + + proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} = + ## If a >= Modulus: r <- a-M + ## else: r <- a + ## + ## This is constant-time straightline code. + ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. + ## + ## To be used when the final substraction can + ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) + var scratch: BigInt = BigInt() + + # Contains 0x0001 (if overflowed limbs) or 0x0000 + let overflowedLimbs = add_ci(0'u32, 0'u32) + + # Now substract the modulus, and test a < M with the last borrow + scratch[0] = sub_bo(a[0], M[0]) + staticFor i, 1, N: + scratch[i] = sub_bio(a[i], M[i]) + + # 1. if `overflowedLimbs`, underflowedModulus >= 0 + # 2. if a >= M, underflowedModulus >= 0 + # if underflowedModulus >= 0: a-M else: a + # TODO: predicated mov instead? + ## TODO: Fix this. `slct` needs a negative value for the else branch + let underflowedModulus = sub_bi(overflowedLimbs, 0'u32) + + var r: BigInt = BigInt() + staticFor i, 0, N: + r[i] = slct(scratch[i], a[i], underflowedModulus.int32) + return r + + proc finalSubNoOverflow(a, M: BigInt): BigInt {.device.} = + ## If a >= Modulus: r <- a-M + ## else: r <- a + ## + ## This is constant-time straightline code. + ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. + ## + ## To be used when the modulus does not use the full bitwidth of the storing words + ## (say using 255 bits for the modulus out of 256 available in words) + var scratch: BigInt = BigInt() + + # Now substract the modulus, and test a < M with the last borrow + scratch[0] = sub_bo(a[0], M[0]) + staticFor i, 1, N: + scratch[i] = sub_bio(a[i], M[i]) + + # If it underflows here, `a` was smaller than the modulus, which is what we want + ## TODO: Fix this. `slct` needs a negative value for the else branch + let underflowedModulus = sub_bi(0'u32, 0'u32) + + var r: BigInt = BigInt() + staticFor i, 0, N: + r[i] = slct(scratch[i], a[i], underflowedModulus.int32) + return r + + proc modadd(a, b, M: BigInt): BigInt {.device.} = + ## Generate an optimized modular addition kernel + ## with parameters `a, b, modulus: Limbs -> Limbs` + # try to add two bigints + var t = BigInt() # temporary + + t[0] = add_co(a[0], b[0]) + staticFor i, 1, N: + t[i] = add_cio(a[i], b[i]) + + # can use `when` of course! + when spareBits() >= 1: # if spareBits() >= 1: # would also work + t = finalSubNoOverflow(t, M) + else: + t = finalSubMayOverflow(t, M) + + return t + + proc modsub(a, b, M: BigInt): BigInt {.device.} = + ## Generate an optimized modular substraction kernel + ## with parameters `a, b, modulus: Limbs -> Limbs` + var t = BigInt() + + t[0] = sub_bo(a[0], b[0]) + staticFor i, 1, a.len: + t[i] = sub_bio(a[i], b[i]) + + let underflowMask = sub_bi(0'u32, 0'u32) + + # If underflow + # TODO: predicated mov instead? + var maskedM: BigInt = BigInt() + staticFor i, 0, N: + maskedM[i] = M[i] and underflowMask + + t[0] = add_co(t[0], maskedM[0]) + staticFor i, 1, a.len-1: + t[i] = add_cio(t[i], maskedM[i]) + when N > 1: + t[N-1] = add_ci(t[N-1], maskedM[N-1]) + + return t + + proc mtymul_CIOS_sparebit(a, b, M: BigInt, finalReduce: bool): BigInt {.device.} = + ## Generate an optimized modular multiplication kernel + ## with parameters `a, b, modulus: Limbs -> Limbs` + var t: BigInt = BigInt() + template m0ninv: untyped = M0NInv + + # Algorithm + # ----------------------------------------- + # + # On x86, with a single carry chain and a spare bit: + # + # for i=0 to N-1 + # (A, t[0]) <- a[0] * b[i] + t[0] + # m <- (t[0] * m0ninv) mod 2ʷ + # (C, _) <- m * M[0] + t[0] + # for j=1 to N-1 + # (A, t[j]) <- a[j] * b[i] + A + t[j] + # (C, t[j-1]) <- m * M[j] + C + t[j] + # + # t[N-1] = C + A + # + # with MULX, ADCX, ADOX dual carry chains + # + # for i=0 to N-1 + # for j=0 to N-1 + # (A,t[j]) := t[j] + a[j]*b[i] + A + # m := t[0]*m0ninv mod W + # C,_ := t[0] + m*M[0] + # for j=1 to N-1 + # (C,t[j-1]) := t[j] + m*M[j] + C + # t[N-1] = C + A + # + # In our case, we only have a single carry flag + # but we have a lot of registers + # and a multiply-accumulate instruction + # + # Hence we can use the dual carry chain approach + # one chain after the other instead of interleaved like on x86. + + when N > 1: + staticFor i, 0, N: + # Multiplication + # ------------------------------- + # for j=0 to N-1 + # (A,t[j]) := t[j] + a[j]*b[i] + A + # + # for 4 limbs, implicit column-wise carries + # + # t[0] = t[0] + (a[0]*b[i]).lo + # t[1] = t[1] + (a[1]*b[i]).lo + (a[0]*b[i]).hi + # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi + # t[3] = t[3] + (a[3]*b[i]).lo + (a[2]*b[i]).hi + # overflow = (a[3]*b[i]).hi + # + # or + # + # t[0] = t[0] + (a[0]*b[i]).lo + # t[1] = t[1] + (a[0]*b[i]).hi + (a[1]*b[i]).lo + # t[2] = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi + # t[3] = t[3] + (a[2]*b[i]).hi + (a[3]*b[i]).lo + # overflow = carry + (a[3]*b[i]).hi + # + # Depending if we chain lo/hi or even/odd + # The even/odd carry chain is more likely to be optimized via μops-fusion + # as it's common to compute the full product. That said: + # - it's annoying if the number of limbs is odd with edge conditions. + # - GPUs are RISC architectures and unlikely to have clever instruction rescheduling logic + let bi = b[i] + var A = 0'u32 + + if i == 0: + staticFor j, 0, N: + t[j] = mul_lo(a[j], bi) + else: + t[0] = mulloadd_co(a[0], bi, t[0]) + staticFor j, 1, N: + t[j] = mulloadd_cio(a[j], bi, t[j]) + A = add_ci(0'u32, 0'u32) # assumes N > 1 + t[1] = mulhiadd_co(a[0], bi, t[1]) # assumes N > 1 + staticFor j, 2, N: + t[j] = mulhiadd_cio(a[j-1], bi, t[j]) + A = mulhiadd_ci(a[N-1], bi, A) + # Reduction + # ------------------------------- + # m := t[0]*m0ninv mod W + # + # C,_ := t[0] + m*M[0] + # for j=1 to N-1 + # (C,t[j-1]) := t[j] + m*M[j] + C + # t[N-1] = C + A + # + # for 4 limbs, implicit column-wise carries + # _ = t[0] + (m*M[0]).lo + # t[0] = t[1] + (m*M[1]).lo + (m*M[0]).hi + # t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi + # t[2] = t[3] + (m*M[3]).lo + (m*M[2]).hi + # t[3] = A + carry + (m*M[3]).hi + # + # or + # + # _ = t[0] + (m*M[0]).lo + # t[0] = t[1] + (m*M[0]).hi + (m*M[1]).lo + # t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi + # t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo + # t[3] = A + carry + (m*M[3]).hi + + let m = mul_lo(t[0], m0ninv) + let _ = mulloadd_co(m, M[0], t[0]) + staticFor j, 1, N: + t[j-1] = mulloadd_cio(m, M[j], t[j]) + t[N-1] = add_ci(A, 0) + # assumes N > 1 + t[0] = mulhiadd_co(m, M[0], t[0]) + staticFor j, 1, N-1: + t[j] = mulhiadd_cio(m, M[j], t[j]) + t[N-1] = mulhiadd_ci(m, M[N-1], t[N-1]) + else: # single limb, e.g. BabyBear (N=1) + # 1. Compute t = a * b (low and high, emulates lagged code in N limb branch) + # 2. Compute m = t * m0ninv mod 2^32 + # 3. Compute t = (t + m*M) >> 32 + + # Step 1: t = a * b + let t0 = mul_lo(a[0], b[0]) # lower 32 bit + let t1 = mul_hi(a[0], b[0]) # upper 32 bit + + # Step 2: m = t * m0ninv mod 2^32 + let m = mul_lo(t0, m0ninv) + + # Step 3: t = (t + m*M) >> 32 + let _ = mulloadd_co(m, M[0], t0) # Low word discarded, but calc for possible carry + t[0] = mulhiadd_ci(m, M[0], t1) + + if finalReduce: + t = finalSubNoOverflow(t, M) + + return t + ## More general field operations + + proc setZero(a: var BigInt) {.device.} = + ## Sets all limbs of the field element to zero in place + # Zero all limbs + for i in 0 ..< N: + a[i] = 0'u32 + + proc setOne(a: var BigInt) {.device.} = + ## Sets the field element to one in Montgomery form + ## For a field element to be valid in Montgomery form, + ## we need x · R mod M with R = 2^(WordBitWidth * numWords) + template montyOne: untyped = MontyOne # Get the Montgomery form of 1 from static context + # Copy the Montgomery form of 1 + for i in 0 ..< N: + a[i] = montyOne[i] # .uint32 + + proc add(r: var BigInt, a, b: BigInt) {.device.} = + ## Addition of two finite field elements stored in `a` and `b`. + ## The result is stored in `r`. + r = modadd(a, b, M) + + proc sub(r: var BigInt, a, b: BigInt) {.device.} = + ## Subtraction of two finite field elements stored in `a` and `b`. + ## The result is stored in `r`. + r = modsub(a, b, M) + + proc mul(r: var BigInt, a, b: BigInt) {.device.} = + ## Multiplication of two finite field elements stored in `a` and `b`. + ## The result is stored in `r`. + r = mtymul_CIOS_sparebit(a, b, M, true) + + proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} = + ## Conditional copy in CUDA + ## If condition is true: b is copied into a + ## If condition is false: a is left unmodified + ## + ## Note: This is constant-time + # Use selp instruction for constant-time selection: + # if condition then b else a + ## XXX: add support for `IfExpr`! Requires though. + var cond: int32 + if condition: + cond = 1'i32 + else: + cond = -1'i32 # `slct` checks for `>= 0` as the true branch! + for i in 0 ..< N: + a[i] = slct(b[i], a[i], cond) + + proc csetZero(r: var BigInt, condition: bool) {.device.} = + ## Conditionally set `r` to zero in CUDA + ## + ## Note: This is constant-time + var t = BigInt() + t.setZero() + r.ccopy(t, condition) + + proc csetOne(r: var BigInt, condition: bool) {.device.} = + ## Conditionally set `r` to one in CUDA + ## + ## Note: This is constant-time + template mOne: untyped = MontyOne + r.ccopy(mOne, condition) + + proc cadd(r: var BigInt, a: BigInt, condition: bool) {.device.} = + ## Conditionally add `a` to `r` in place in CUDA. + ## + ## Note: This is constant-time + var t = BigInt() + t.add(r, a) + r.ccopy(t, condition) + + proc csub(r: var BigInt, a: BigInt, condition: bool) {.device.} = + ## Conditionally subtract `a` from `r` in place in CUDA. + ## + ## Note: This is constant-time + var t = BigInt() + t.sub(r, a) + r.ccopy(t, condition) + + proc doubleElement(r: var BigInt, a: BigInt) {.device.} = + ## Double `a` and store it in `r` in CUDA. + ## + ## Note: This is constant-time + r.add(a, a) + + proc nsqr(r: var BigInt, a: BigInt, count: int) {.device.} = + ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r` + ## in CUDA. + ## + ## Note: This is constant-time + ## + ## TODO: Add a `skipFinalSub` argument? + r = a # copy over a + for i in 0 ..< count-1: + r = mtymul_CIOS_sparebit(r, r, M, finalReduce = false) + # last one with reducing + r = mtymul_CIOS_sparebit(r, r, M, finalReduce = true) + + proc isZero(r: var bool, a: BigInt) {.device.} = + ## Checks if `a` is zero in CUDA. Result is written to `r`. + ## + ## Note: This is constant-time + #r = true + #staticFor i, 0, a.len: + # r = r and a[i] == 0'u32 + var isZero = a[0] + staticFor i, 0, a.len: + isZero = isZero or a[i] + r = isZero == 0'u32 + + proc isOdd(r: var bool, a: BigInt) {.device.} = + ## Checks if the Montgomery value of `a` is odd in CUDA. Result is written to `r`. + ## + ## IMPORTANT: The canonical value may or may not be odd if the Montgomery + ## representation is odd (and vice versa!). + ## + ## Note: This is constant-time + # check if least significant byte has first bit set + r = (a[0] and 1'u32).bool + + proc neg(r: var BigInt, a: BigInt) {.device.} = + ## Computes the negation of `a` and stores it in `r` in CUDA. + ## + ## Note: This is constant-time + # Check if input is zero + var isZ: bool = false + isZ.isZero(a) + # Subtraction `M - a` + var t = BigInt() + ## XXX: Is it safe to use `modsub` here? + t.sub(M, a) + # If input zero, we want `r = 0` instead of `r = M`! + t.csetZero(isZ) + r = t + + proc cneg(r: var BigInt, a: BigInt, condition: bool) {.device.} = + ## Conditionally negate `a` and store it in `r` if `condition` is true, otherwise + ## copy over `a` into `r` in CUDA. + ## + ## Note: This is constant-time + r.neg(a) + r.ccopy(a, not condition) + + proc shiftRight(r: var BigInt, k: uint32) {.device.} = + ## Shift `r` right by `k` bits in-nplace in CUDA. + ## + ## k MUST be less than the base word size (2^31) + ## + ## Note: This is constant-time + let wordBitWidth = sizeof(uint32) * 8 + let shiftLeft = wordBitWidth.uint32 - k + + # process all but the last word + staticFor i, 0, r.len - 1: + let current = r[i] + let next = r[i + 1] + + let rightPart = current shr k + let leftPart = next shl shiftLeft + r[i] = rightPart or leftPart + + # handle the last word + let lastIdx = r.len - 1 + r[lastIdx] = r[lastIdx] shr k + + proc div2(r: var BigInt) {.device.} = + ## Divide `r` by 2 in-place in CUDA. + ## + ## Note: This is constant-time + # check if the input is odd + var isO: bool = false + isO.isOdd(r) + + # perform the division using a right shift + r.shiftRight(1) + + # if it was odd, add `M+1/2` to go 'half-way around' + r.cadd(PP1D2, isO) + diff --git a/constantine/math_compiler/experimental/runtime_compile.nim b/constantine/math_compiler/experimental/runtime_compile.nim new file mode 100644 index 000000000..038c8f920 --- /dev/null +++ b/constantine/math_compiler/experimental/runtime_compile.nim @@ -0,0 +1,326 @@ +# Constantine +# Copyright (c) 2018-2019 Status Research & Development GmbH +# Copyright (c) 2020-Present Mamy André-Ratsimbazafy +# Licensed and distributed under either of +# * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT). +# * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0). +# at your option. This file may not be copied, modified, or distributed except according to those terms. + +import + std / [strformat, strutils] + +import constantine/platforms/abis/nvidia_abi + +import ./nim_ast_to_cuda_ast +import ./cuda_execute_dsl +export cuda_execute_dsl +export nim_ast_to_cuda_ast + +## Set to true, if you want some extra output (driver & runtime version for example) +const DebugCuda {.booldefine.} = true + +## Dummy data for the typed nature of the `cuda` macro. These define commonly used +## CUDA specific names so that they produce valid Nim code in the context of a typed macro. +template global*() {.pragma.} +template device*() {.pragma.} +template forceinline*() {.pragma.} + +# If attached to a `var` it will be treated as a +# `__constant__`! Only useful if you want to define a +# constant without initializing it (and then use +# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it +# before executing the kernel) +template constant*() {.pragma.} +type + Dim* = cint ## dummy to have access to math + NvBlockIdx* = object + x*: Dim + y*: Dim + z*: Dim + NvBlockDim = object + x*: Dim + y*: Dim + z*: Dim + NvThreadIdx* = object + x*: Dim + y*: Dim + z*: Dim + NvGridDim = object + x*: Dim + y*: Dim + z*: Dim + + +## These are dummy elements to make CUDA block / thread index / dim +## access possible in the *typed* `cuda` macro. It cannot be `const`, +## because then the typed code would evaluate the values before we +## can work with it from the typed macro. +let blockIdx* = NvBlockIdx() +let blockDim* = NvBlockDim() +let gridDim* = NvGridDim() +let threadIdx* = NvThreadIdx() + +## Similar for procs. They don't need any implementation, as they won't ever be actually called. +proc printf*(fmt: string) {.varargs.} = discard +proc memcpy*(dst, src: pointer, size: int) = discard + +## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its +## own `extern` pragma (due to requiring an argument it cannot be reused): +## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma +template cuExtern*(): untyped {.pragma.} +template shared*(): untyped {.pragma.} +## You would typically use `cuExtern` and `shared` together: +## `var x {.cuExtern, shared.}: array[N, Foo]` +## for example to declare a constant array that is filled by the +## host before kernel execution. + +## While you can use `malloc` on device with small sizes, it is usually not +## recommended to do so. +proc malloc*(size: csize_t): pointer = discard +proc free*(p: pointer) = discard +proc syncthreads*() {.cudaName: "__syncthreads".} = discard + + + +type + NVRTC* = object + numBlocks* = 32 # number of blocks to launch + threadsPerBlock* = 128 # number of threads for each block. Total threads: `numBlocks * threadsPerBlock` + name*: string # Name of the program (of the generated in memory CUDA file) + prog*: nvrtcProgram + log*: string # The compilation log + ptx*: string # PTX of the program + cubin*: pointer + cubinSize*: csize_t + device*: CUdevice + kernel*: CUfunction + module*: CUmodule + context*: CUcontext + moduleLoaded*: bool + +proc `=destroy`(nvrtc: NVRTC) = + if nvrtc.module.pointer != nil: + check cuModuleUnload nvrtc.module + if nvrtc.context.pointer != nil: + check cuCtxDestroy nvrtc.context + +proc initNvrtc*(cuda: string, name = "sample.cu"): NVRTC = + ## Initializes an NVRTC object for the given program `cuda` + when DebugCuda: + var x: cint + check cuDriverGetVersion(x.addr) + echo "Driver version: ", x + + var rtVer: cint + echo cudaRuntimeGetVersion(addr rtVer) + echo "Runtime ver: ", rtVer + + var prop: cudaDeviceProp + echo cudaGetDeviceProperties(addr prop, 0); + echo "Compute capability: ", prop.major, " ", prop.minor + + var + context: CUcontext + device: CUdevice + + check cuInit(0) + check cuDeviceGet(device, 0) + check cuCtxCreate(context, 0, device) + + # Create an instance of nvrtcProgram based on the passed code + var prog: nvrtcProgram + check nvrtcCreateProgram(addr(prog), cstring cuda, cstring name, 0, nil, nil) + + result = NVRTC(prog: prog, name: name, + device: device, + context: context) + + +proc log*(nvrtc: var NVRTC) = + ## Retrieve the compilation log. + var logSize: csize_t + check nvrtcGetProgramLogSize(nvrtc.prog, addr logSize) + + var log = cstring newString(Natural logSize) + + check nvrtcGetProgramLog(nvrtc.prog, log) + nvrtc.log = $log # usually empty if no issues found by the compiler + +proc compile*(nvrtc: var NVRTC) = + # Compile the program with fmad disabled. + # Note: Can specify GPU target architecture explicitly with '-arch' flag. + const + Options = [ + cstring "--gpu-architecture=compute_61", # or whatever your GPU arch is + # "--fmad=false", # and whatever other options for example + ] + + NumberOfOptions = cint Options.len + let compileResult = nvrtcCompileProgram(nvrtc.prog, NumberOfOptions, + cast[cstringArray](addr Options[0])) + + nvrtc.log() + ## XXX: only in `DebugCuda`? + echo "Compilation log:\n------------------------------" + echo nvrtc.log + echo "------------------------------" + check compileResult + +proc getPtx*(nvrtc: var NVRTC) = + ## Obtain PTX from the program. + var ptxSize: csize_t + check nvrtcGetPTXSize(nvrtc.prog, addr ptxSize) + + var ptx = newString(int ptxSize) + check nvrtcGetPTX(nvrtc.prog, ptx) + + check nvrtcDestroyProgram(addr nvrtc.prog) # Destroy the program. + nvrtc.ptx = ptx + + when DebugCuda: + echo "PTX size: ", ptxSize + #echo "-------------------- PTX --------------------\n", nvrtc.ptx + writeFile("/tmp/kernel.ptx", nvrtc.ptx) + +proc load*(nvrtc: var NVRTC) = + # After getting the PTX... + var error_log = newString(8192) + var info_log = newString(8192) + + ## NOTE: if you wish to use the `link` approach, pass `nvrtc.cubin` instead of `PTX` + #let status = cuModuleLoadData(addr nvrtc.module, nvrtc.cubin) + let status = cuModuleLoadData(nvrtc.module, cstring nvrtc.ptx) + if status != CUDA_SUCCESS: + var error_str: cstring #const char* error_str; + check cuGetErrorString(status, cast[cstringArray](addr error_str)); + echo "Module load failed: ", error_str + echo "JIT Error log: ", error_log + echo "JIT Info log: ", info_log + quit(1) + + nvrtc.moduleLoaded = true + +proc link*(nvrtc: var NVRTC) = + ## OPTIONAL STEP. Alternative to passing the PTX to `cuModuleLoadData`. + # Create linker + var linkState: CUlinkState + var linkOptions: array[4, CUjit_option] + var linkOptionValues: array[4, pointer] + var errorLog = newString(4096) + var infoLog = newString(4096) + var walltime: float32 + + linkOptions[0] = CU_JIT_WALL_TIME + linkOptionValues[0] = addr walltime + linkOptions[1] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES + linkOptionValues[1] = cast[pointer](4096) + linkOptions[2] = CU_JIT_ERROR_LOG_BUFFER + linkOptionValues[2] = addr errorLog[0] + linkOptions[3] = CU_JIT_INFO_LOG_BUFFER + linkOptionValues[3] = addr infoLog[0] + + check cuLinkCreate(3, addr linkOptions[0], addr linkOptionValues[0], addr linkState) + + # Add PTX + var res = cuLinkAddData(linkState, CU_JIT_INPUT_PTX, + cast[pointer](cstring nvrtc.ptx), + csize_t nvrtc.ptx.len, + "kernel.ptx", 0, nil, nil) + + var status: CUresult + if res != CUDA_SUCCESS: + var error_str: cstring + #discard cuGetErrorString(res, addr error_str) + check cuGetErrorString(status, cast[cstringArray](addr error_str)) + echo "Link add PTX failed: ", error_str + echo "Error log: ", errorLog + quit(1) + + # Add the device runtime (provides printf support) + ## NOTE: Linking requires yout to pass the path to `libcudadevrt.a` at CT + res = cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY, + "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a", # Adjust path as needed + 0, nil, nil) + if res != CUDA_SUCCESS: + var error_str: cstring + check cuGetErrorString(status, cast[cstringArray](addr error_str)); + echo "Link add device runtime failed: ", error_str + echo "Error log: ", errorLog + quit(1) + + # Complete linking + var cubn: pointer + var cubinSize: csize_t + res = cuLinkComplete(linkState, cubn.addr, cubinSize.addr) + nvrtc.cubinSize = cubinSize + if res != CUDA_SUCCESS: + var error_str: cstring + check cuGetErrorString(status, cast[cstringArray](addr error_str)); + echo "Link complete failed: ", error_str + echo "Error log: ", errorLog + quit(1) + + when DebugCuda: + echo "[INFO]: Writing CUBIN data to file /tmp/test.cubin" + echo "Cubin size: ", cubinSize + var f = open("/tmp/test.cubin", fmWrite) + discard f.writeBuffer(cubn, cubinSize) + f.close() + + # Assign the cubin + nvrtc.cubin = cubn + +proc copyToSymbol*[T](nvrtc: NVRTC, symbol: string, data: T, offset = 0) = + ## Copies `data` to the symbol in the current CUDA kernel. + ## There is absolutely type safety involved here. We only check that the amount of + ## data you wish to copy to the global matches the size of the global storage. + ## This function does help you with automatically copying `seq[T]` for example. + ## + ## `offset` is an optional offset of the number of bytes at the target we want + ## to copy to. Useful to copy only individual elements of a constant array for example. + ## + ## Say you define in a kernel: + ## + ## ```nim + ## let foo = cuda: + ## var data {.constant.}: array[1024, uint32] + ## # ... + ## # later in host code after getting the kernel from the `nvrtc` object: + ## let data = calcSomeArray1024() # runtime calculation + ## copyToSymbols("data", # name of the variable in CUDA code + ## data) + ## ``` + var devPtr: CUdeviceptr + var size: csize_t + check cuModuleGetGlobal(devPtr, size.addr, nvrtc.module, symbol.cstring) + var totSize: int + var srcPtr: pointer + when T is seq: # copy len * sizeof(element) + doAssert data.len > 0, "Input data is empty!" + let elSize = sizeof(data[0]) + totSize = data.len * sizeof(elSize) + srcPtr = data[0].addr + + else: + # For now just copy by `sizeof`! + totSize = sizeof(data) + srcPtr = data.addr + doAssert totSize.csize_t == size, "Input data size does not match size of global to copy to: " & $totSize & " vs. " & $size + check cuMemcpyHtoD(devPtr, srcPtr, csize_t(totSize)) + +template execute*(nvrtc: var NVRTC, fn: string, res, inputs: typed, sharedMemSize: typed) = + ## Load the generated PTX, get the target kernel `fn` and execute it with the `res` and `inputs` + + if not nvrtc.moduleLoaded: + nvrtc.load() + + check cuModuleGetFunction(nvrtc.kernel, nvrtc.module, fn) + + # now execute the kernel + execCuda(nvrtc.kernel, nvrtc.numBlocks, nvrtc.threadsPerBlock, res, inputs, sharedMemSize) + + # synchronize so that e.g. `printf` statements will be printed before we (possibly) quit + check cuCtxSynchronize() # + +template execute*(nvrtc: var NVRTC, fn: string, res, inputs: typed) = + nvrtc.execute(fn, res, inputs, 0) diff --git a/constantine/named/config_fields_and_curves.nim b/constantine/named/config_fields_and_curves.nim index 330d4bf5d..77d9072b4 100644 --- a/constantine/named/config_fields_and_curves.nim +++ b/constantine/named/config_fields_and_curves.nim @@ -65,6 +65,11 @@ declareCurves: testingCurve: true bitwidth: 16 modulus: "0xFFEF" # 65519 in hex + + curve BabyBear: # BabyBear field used in SNARKs and zkVMs + bitwidth: 31 + modulus: "0x78000001" # `0b1111000000000000000000000000001 = 15·2²⁷ + 1 = 2³¹ - 2²⁷ + 1` + curve Mersenne61: testingCurve: true bitwidth: 61 diff --git a/constantine/platforms/abis/libpaths.nim b/constantine/platforms/abis/libpaths.nim new file mode 100644 index 000000000..54ddff076 --- /dev/null +++ b/constantine/platforms/abis/libpaths.nim @@ -0,0 +1,166 @@ +# This file is taken from `nimcuda`. + +##[This module implements some auto-detection of cuda installation locations, + as well as communication with the c compilers about this info. + + If you want to manually overide the autodetection, pass the nim compiler + `-d:CudaLib="PATH_TO_CUDA_DYN_LIBS"` and/or + `-d:CudaIncludes="PATH_TO_CUDA_HEADERS"`. +]## + +#[The following is a rip of std/distros, slightly modified for compile-time + use. + The extra specificity compared to normal `defined` tests or `hostOS` + is needed because some linux distros install cuda in very different places + (im looking at you, arch!) +]# + + +from std/distros import Distribution +import std/[os, strutils, macros, macrocache] +when NimMajor == 2: + import std/envvars + + +# we cache the result of the 'cmdRelease' +# execution for faster platform detections. +var + unameRes {.compileTime.}: string + osReleaseIDRes {.compileTime.}: string + releaseRes {.compileTime.}: string + hostnamectlRes {.compileTime.}: string + +template cmdRelease(cmd, cache): untyped = + if cache.len == 0: + # cache = (when defined(nimscript): gorge(cmd) else: execProcess(cmd)) + cache = gorge(cmd) + cache + +template uname(): untyped = cmdRelease("uname -a", unameRes) +template osReleaseID(): untyped = + cmdRelease("cat /etc/os-release | grep ^ID=", osReleaseIDRes) +template release(): untyped = cmdRelease("lsb_release -d", releaseRes) +template hostnamectl(): untyped = cmdRelease("hostnamectl", hostnamectlRes) + +proc detectOsWithAllCmd(d: Distribution): bool {.compileTime.} = + let dd = toLowerAscii($d) + result = dd in toLowerAscii(osReleaseID()) or dd in toLowerAscii(release()) or + dd in toLowerAscii(uname()) or ("operating system: " & dd) in + toLowerAscii(hostnamectl()) + +proc detectOsImpl(d: Distribution): bool {.compileTime.} = + case d + of Distribution.Windows: result = defined(windows) + of Distribution.Posix: result = defined(posix) + of Distribution.MacOSX: result = defined(macosx) + of Distribution.Linux: result = defined(linux) + of Distribution.BSD: result = defined(bsd) + else: + when defined(bsd): + case d + of Distribution.FreeBSD, Distribution.NetBSD, Distribution.OpenBSD: + result = $d in uname() + else: + result = false + elif defined(linux): + const EasyLinux = when (NimMajor, NimMinor) >= (1, 6): + {Distribution.Elementary, Distribution.Ubuntu, Distribution.Debian, + Distribution.Fedora, Distribution.OpenMandriva, Distribution.CentOS, + Distribution.Alpine, Distribution.Mageia, Distribution.Zorin, + Distribution.Void} + else: + {Distribution.Elementary, Distribution.Ubuntu, Distribution.Debian, + Distribution.Fedora, Distribution.OpenMandriva, Distribution.CentOS, + Distribution.Alpine, Distribution.Mageia, Distribution.Zorin} + + case d + of Distribution.Gentoo: + result = ("-" & $d & " ") in uname() + of EasyLinux: + result = toLowerAscii($d) in osReleaseID() + of Distribution.RedHat: + result = "rhel" in osReleaseID() + of Distribution.ArchLinux: + result = "arch" in osReleaseID() + # when (NimMajor, NimMinor) >= (1, 6): + # of Distribution.Artix: + # result = "artix" in osReleaseID() + of Distribution.NixOS: + # Check if this is a Nix build or NixOS environment + result = existsEnv("NIX_BUILD_TOP") or + existsEnv("__NIXOS_SET_ENVIRONMENT_DONE") + of Distribution.OpenSUSE: + result = "suse" in toLowerAscii(uname()) or + "suse" in toLowerAscii(release()) + of Distribution.GoboLinux: + result = "-Gobo " in uname() + of Distribution.Solaris: + let uname = toLowerAscii(uname()) + result = ("sun" in uname) or ("solaris" in uname) + of Distribution.Haiku: + result = defined(haiku) + else: + result = detectOsWithAllCmd(d) + else: + result = false + +template detectOs(d: untyped): bool = + ## Distro/OS detection. For convenience, the + ## required `Distribution.` qualifier is added to the + ## enum value. + detectOsImpl(Distribution.d) + + + +# begin actual detection +when detectOs(Windows): + from std/os import getEnv, `/` + const + CudaPath = getEnv("CUDA_PATH") + CudaIncludes* {.strdefine.} = CudaPath / "include" + CudaLib* {.strdefine.} = CudaPath / "lib64" + +elif detectOs(ArchLinux): + from std/os import `/` + const + CudaPath = "/opt/cuda" + CudaIncludes* {.strdefine.} = CudaPath / "include" + CudaLib* {.strdefine.} = CudaPath / "lib64" + +elif detectOs(Linux): + # Generic linux catch-all. + # This includes anyone following the cuda installation guide. + const + CudaPath = "/usr/local/cuda" + CudaIncludes* {.strdefine.} = CudaPath / "include" + CudaLib* {.strdefine.} = CudaPath / "lib64" + +else: + # Some wild operating system! + const + CudaIncludes* {.strdefine.} = "unknown" + CudaLib* {.strdefine.} = "unknown" + + +# check for validity +when not dirExists(CudaIncludes): + {.error: "Could not find the cuda source headers! Please specify the " & + "location of the cuda includes directory by passing " & + "`-d:CudaIncludes=\"YOUR_PATH\"` to the nim compiler.".} +elif not dirExists(CudaLib): + {.error: "Could not find the cuda shared libraries! Please specify the " & + "location of the cuda library directory by passing " & + "`-d:CudaLib=\"YOUR_PATH\"` to the nim compiler.".} + + + +macro tellCompilerToUseCuda*(): untyped = + ## Tells the compiler and linker to use cuda libraries. + # we'll use macrocaching so that we dont unneccessarily emit a million times + + const ToldCompilerCount = CacheCounter"ToldCompilerToUseCudaCount" + if ToldCompilerCount.value == 0: + result = quote do: + {.passC: "-I" & CudaIncludes.} + {.passL: "-L" & CudaLib & " -lcuda".} + inc ToldCompilerCount diff --git a/constantine/platforms/abis/nvidia_abi.nim b/constantine/platforms/abis/nvidia_abi.nim index 02dd8808f..f10c31071 100644 --- a/constantine/platforms/abis/nvidia_abi.nim +++ b/constantine/platforms/abis/nvidia_abi.nim @@ -20,9 +20,9 @@ import ./c_abi # # ############################################################ -const libPath = "/opt/cuda/lib64/" # For now, only support Linux -static: echo "[Constantine] Will search Cuda runtime in $LD_LIBRARY_PATH and " & libPath & "libcuda.so" -const libCuda = "(libcuda.so|" & libPath & "libcuda.so)" +import ./libpaths +tellCompilerToUseCuda() +const libCuda = "(libcuda.so|libcuda.so)" # Cuda offers 2 APIs: # - cuda.h the driver API @@ -483,7 +483,331 @@ type CUstream* = distinct pointer CUdeviceptr* = distinct pointer -{.push noconv, importc, dynlib: libCuda.} + CUlinkState_st = object + CUlinkState* = ptr CUlinkState_st + +###################################################################### +################################ cuda.h ############################## +###################################################################### + +type ## + ## Max number of registers that a thread may use.\n + ## Option type: unsigned int\n + ## Applies to: compiler only + ## + CUjit_option* {.size: sizeof(cint).} = enum + CU_JIT_MAX_REGISTERS = 0, ## + ## IN: Specifies minimum number of threads per block to target compilation + ## for\n + ## OUT: Returns the number of threads the compiler actually targeted. + ## This restricts the resource utilization of the compiler (e.g. max + ## registers) such that a block with the given number of threads should be + ## able to launch based on register limitations. Note, this option does not + ## currently take into account any other resource limitations, such as + ## shared memory utilization.\n + ## Cannot be combined with ::CU_JIT_TARGET.\n + ## Option type: unsigned int\n + ## Applies to: compiler only + ## + CU_JIT_THREADS_PER_BLOCK = 1, ## + ## Overwrites the option value with the total wall clock time, in + ## milliseconds, spent in the compiler and linker\n + ## Option type: float\n + ## Applies to: compiler and linker + ## + CU_JIT_WALL_TIME = 2, ## + ## Pointer to a buffer in which to print any log messages + ## that are informational in nature (the buffer size is specified via + ## option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + ## Option type: char *\n + ## Applies to: compiler and linker + ## + CU_JIT_INFO_LOG_BUFFER = 3, ## + ## IN: Log buffer size in bytes. Log messages will be capped at this size + ## (including null terminator)\n + ## OUT: Amount of log buffer filled with messages\n + ## Option type: unsigned int\n + ## Applies to: compiler and linker + ## + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4, ## + ## Pointer to a buffer in which to print any log messages that + ## reflect errors (the buffer size is specified via option + ## ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + ## Option type: char *\n + ## Applies to: compiler and linker + ## + CU_JIT_ERROR_LOG_BUFFER = 5, ## + ## IN: Log buffer size in bytes. Log messages will be capped at this size + ## (including null terminator)\n + ## OUT: Amount of log buffer filled with messages\n + ## Option type: unsigned int\n + ## Applies to: compiler and linker + ## + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6, ## + ## Level of optimizations to apply to generated code (0 - 4), with 4 + ## being the default and highest level of optimizations.\n + ## Option type: unsigned int\n + ## Applies to: compiler only + ## + CU_JIT_OPTIMIZATION_LEVEL = 7, ## + ## No option value required. Determines the target based on the current + ## attached context (default)\n + ## Option type: No option value needed\n + ## Applies to: compiler and linker + ## + CU_JIT_TARGET_FROM_CUCONTEXT = 8, ## + ## Target is chosen based on supplied ::CUjit_target. Cannot be + ## combined with ::CU_JIT_THREADS_PER_BLOCK.\n + ## Option type: unsigned int for enumerated type ::CUjit_target\n + ## Applies to: compiler and linker + ## + CU_JIT_OPTION_TARGET = 9, ## + ## Specifies choice of fallback strategy if matching cubin is not found. + ## Choice is based on supplied ::CUjit_fallback. This option cannot be + ## used with cuLink* APIs as the linker requires exact matches.\n + ## Option type: unsigned int for enumerated type ::CUjit_fallback\n + ## Applies to: compiler only + ## + CU_JIT_FALLBACK_STRATEGY = 10, ## + ## Specifies whether to create debug information in output (-g) + ## (0: false, default)\n + ## Option type: int\n + ## Applies to: compiler and linker + ## + CU_JIT_GENERATE_DEBUG_INFO = 11, ## + ## Generate verbose log messages (0: false, default)\n + ## Option type: int\n + ## Applies to: compiler and linker + ## + CU_JIT_LOG_VERBOSE = 12, ## + ## Generate line number information (-lineinfo) (0: false, default)\n + ## Option type: int\n + ## Applies to: compiler only + ## + CU_JIT_GENERATE_LINE_INFO = 13, ## + ## Specifies whether to enable caching explicitly (-dlcm) \n + ## Choice is based on supplied ::CUjit_cacheMode_enum.\n + ## Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n + ## Applies to: compiler only + ## + CU_JIT_OPTION_CACHE_MODE = 14, ## + ## \deprecated + ## This jit option is deprecated and should not be used. + ## + CU_JIT_NEW_SM3X_OPT = 15, ## + ## This jit option is used for internal purpose only. + ## + CU_JIT_FAST_COMPILE = 16, ## + ## Array of device symbol names that will be relocated to the corresponding + ## host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n + ## Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + ## When loading a device module, driver will relocate all encountered + ## unresolved symbols to the host addresses.\n + ## It is only allowed to register symbols that correspond to unresolved + ## global variables.\n + ## It is illegal to register the same device symbol at multiple addresses.\n + ## Option type: const char **\n + ## Applies to: dynamic linker only + ## + CU_JIT_GLOBAL_SYMBOL_NAMES = 17, ## + ## Array of host addresses that will be used to relocate corresponding + ## device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n + ## Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + ## Option type: void **\n + ## Applies to: dynamic linker only + ## + CU_JIT_GLOBAL_SYMBOL_ADDRESSES = 18, ## + ## Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and + ## ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n + ## Option type: unsigned int\n + ## Applies to: dynamic linker only + ## + CU_JIT_GLOBAL_SYMBOL_COUNT = 19, ## + ## \deprecated + ## Enable link-time optimization (-dlto) for device code (Disabled by default).\n + ## This option is not supported on 32-bit platforms.\n + ## Option type: int\n + ## Applies to: compiler and linker + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_LTO = 20, ## + ## \deprecated + ## Control single-precision denormals (-ftz) support (0: false, default). + ## 1 : flushes denormal values to zero + ## 0 : preserves denormal values + ## Option type: int\n + ## Applies to: link-time optimization specified with CU_JIT_LTO + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_FTZ = 21, ## + ## \deprecated + ## Control single-precision floating-point division and reciprocals + ## (-prec-div) support (1: true, default). + ## 1 : Enables the IEEE round-to-nearest mode + ## 0 : Enables the fast approximation mode + ## Option type: int\n + ## Applies to: link-time optimization specified with CU_JIT_LTO + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_PREC_DIV = 22, ## + ## \deprecated + ## Control single-precision floating-point square root + ## (-prec-sqrt) support (1: true, default). + ## 1 : Enables the IEEE round-to-nearest mode + ## 0 : Enables the fast approximation mode + ## Option type: int\n + ## Applies to: link-time optimization specified with CU_JIT_LTO + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_PREC_SQRT = 23, ## + ## \deprecated + ## Enable/Disable the contraction of floating-point multiplies + ## and adds/subtracts into floating-point multiply-add (-fma) + ## operations (1: Enable, default; 0: Disable). + ## Option type: int\n + ## Applies to: link-time optimization specified with CU_JIT_LTO + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_FMA = 24, ## + ## \deprecated + ## Array of kernel names that should be preserved at link time while others + ## can be removed.\n + ## Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n + ## Note that kernel names can be mangled by the compiler in which case the + ## mangled name needs to be specified.\n + ## Wildcard "*" can be used to represent zero or more characters instead of + ## specifying the full or mangled name.\n + ## It is important to note that the wildcard "*" is also added implicitly. + ## For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + ## thus preserve all kernels with those names. This can be avoided by providing + ## a more specific name like "barfoobaz".\n + ## Option type: const char **\n + ## Applies to: dynamic linker only + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_REFERENCED_KERNEL_NAMES = 25, ## + ## \deprecated + ## Number of entries in ::CU_JIT_REFERENCED_KERNEL_NAMES array.\n + ## Option type: unsigned int\n + ## Applies to: dynamic linker only + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_REFERENCED_KERNEL_COUNT = 26, ## + ## \deprecated + ## Array of variable names (__device__ and/or __constant__) that should be + ## preserved at link time while others can be removed.\n + ## Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n + ## Note that variable names can be mangled by the compiler in which case the + ## mangled name needs to be specified.\n + ## Wildcard "*" can be used to represent zero or more characters instead of + ## specifying the full or mangled name.\n + ## It is important to note that the wildcard "*" is also added implicitly. + ## For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + ## thus preserve all variables with those names. This can be avoided by providing + ## a more specific name like "barfoobaz".\n + ## Option type: const char **\n + ## Applies to: link-time optimization specified with CU_JIT_LTO + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_REFERENCED_VARIABLE_NAMES = 27, ## + ## \deprecated + ## Number of entries in ::CU_JIT_REFERENCED_VARIABLE_NAMES array.\n + ## Option type: unsigned int\n + ## Applies to: link-time optimization specified with CU_JIT_LTO + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_REFERENCED_VARIABLE_COUNT = 28, ## + ## \deprecated + ## This option serves as a hint to enable the JIT compiler/linker + ## to remove constant (__constant__) and device (__device__) variables + ## unreferenced in device code (Disabled by default).\n + ## Note that host references to constant and device variables using APIs like + ## ::cuModuleGetGlobal() with this option specified may resultNotKeyWord in undefined behavior unless + ## the variables are explicitly specified using ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n + ## Option type: int\n + ## Applies to: link-time optimization specified with CU_JIT_LTO + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES = 29, ## + ## Generate position independent code (0: false)\n + ## Option type: int\n + ## Applies to: compiler only + ## + CU_JIT_POSITION_INDEPENDENT_CODE = 30, ## + ## This option hints to the JIT compiler the minimum number of CTAs from the + ## kernel’s grid to be mapped to a SM. This option is ignored when used together + ## with ::CU_JIT_MAX_REGISTERS or ::CU_JIT_THREADS_PER_BLOCK. + ## Optimizations based on this option need ::CU_JIT_MAX_THREADS_PER_BLOCK to + ## be specified as well. For kernels already using PTX directive .minnctapersm, + ## this option will be ignored by default. Use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES + ## to let this option take precedence over the PTX directive. + ## Option type: unsigned int\n + ## Applies to: compiler only + ## + CU_JIT_MIN_CTA_PER_SM = 31, ## + ## Maximum number threads in a thread block, computed as the product of + ## the maximum extent specifed for each dimension of the block. This limit + ## is guaranteed not to be exeeded in any invocation of the kernel. Exceeding + ## the the maximum number of threads results in runtime error or kernel launch + ## failure. For kernels already using PTX directive .maxntid, this option will + ## be ignored by default. Use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES to let this + ## option take precedence over the PTX directive. + ## Option type: int\n + ## Applies to: compiler only + ## + CU_JIT_MAX_THREADS_PER_BLOCK = 32, ## + ## This option lets the values specified using ::CU_JIT_MAX_REGISTERS, + ## ::CU_JIT_THREADS_PER_BLOCK, ::CU_JIT_MAX_THREADS_PER_BLOCK and + ## ::CU_JIT_MIN_CTA_PER_SM take precedence over any PTX directives. + ## (0: Disable, default; 1: Enable) + ## Option type: int\n + ## Applies to: compiler only + ## + CU_JIT_OVERRIDE_DIRECTIVE_VALUES = 33, CU_JIT_NUM_OPTIONS + +type ## + ## Compiled device-class-specific device code\n + ## Applicable options: none + ## + CUjitInputType* {.size: sizeof(cint).} = enum + CU_JIT_INPUT_CUBIN = 0, ## + ## PTX source code\n + ## Applicable options: PTX compiler options + ## + CU_JIT_INPUT_PTX = 1, ## + ## Bundle of multiple cubins and/or PTX of some device code\n + ## Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + ## + CU_JIT_INPUT_FATBINARY = 2, ## + ## Host object with embedded device code\n + ## Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + ## + CU_JIT_INPUT_OBJECT = 3, ## + ## Archive of host objects with embedded device code\n + ## Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + ## + CU_JIT_INPUT_LIBRARY = 4, ## + ## \deprecated + ## High-level intermediate code for link-time optimization\n + ## Applicable options: NVVM compiler options, PTX compiler options + ## + ## Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0 + ## + CU_JIT_INPUT_NVVM = 5, CU_JIT_NUM_INPUT_TYPES = 6 + + + +{.push noconv, importc, dynlib: libCuda, header: "cuda.h".} proc cuInit*(flags: uint32): CUresult @@ -495,12 +819,15 @@ proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevi proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult proc cuCtxDestroy*(ctx: CUcontext): CUresult proc cuCtxSynchronize*(ctx: CUcontext): CUresult +proc cuCtxSynchronize*(): CUresult -proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult {.used.} proc cuModuleUnload*(module: CUmodule): CUresult proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.} +proc cuModuleLoadData*(module: var CUmodule; image: pointer): CUresult +proc cuModuleGetFunction*(hfunc: var CUfunction; hmod: CUmodule; name: cstring): CUresult +proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult -proc cuLaunchKernel( +proc cuLaunchKernel*( kernel: CUfunction, gridDimX, gridDimY, gridDimZ: uint32, blockDimX, blockDimY, blockDimZ: uint32, @@ -516,4 +843,1026 @@ proc cuMemFree*(devptr: CUdeviceptr): CUresult proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult -{.pop.} # {.push noconv, importc, dynlib: "libcuda.so".} +proc cuDriverGetVersion*(driverVersion: ptr cint): CUresult + +proc cuLinkCreate*(numOptions: cuint; options: ptr CUjit_option; + optionValues: ptr pointer; stateOut: ptr CUlinkState): CUresult +proc cuLinkAddData*(state: CUlinkState; `type`: CUjitInputType; data: pointer; + size: csize_t; name: cstring; numOptions: cuint; + options: ptr CUjit_option; optionValues: ptr pointer): CUresult +proc cuLinkComplete*(state: CUlinkState; cubinOut: ptr pointer; sizeOut: ptr csize_t): CUresult + + +proc cuGetErrorString*(error: CUresult; pStr: cstringArray): CUresult + +proc cuLinkAddFile*(state: CUlinkState; `type`: CUjitInputType; path: cstring; + numOptions: cuint; options: ptr CUjit_option; + optionValues: ptr pointer): CUresult + + + + +{.pop.} # {.push noconv, importc, dynlib: "libcuda.so", header: "cuda.h"..} + + +###################################################################### +################################ nvrtc.h ############################# +###################################################################### + + +when defined(windows): + const + libNvrtc = "nvrtc64.dll" +elif defined(macosx): + const + libNvrtc = "libnvrtc.dylib" +else: + const + libNvrtc = "libnvrtc.so" + + +type + nvrtcProgramObj {.noDecl, incompleteStruct.} = object + nvrtcProgram* = ptr nvrtcProgramObj + + nvrtcResult* {.size: sizeof(cint).} = enum + NVRTC_SUCCESS = 0, NVRTC_ERROR_OUT_OF_MEMORY = 1, + NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, NVRTC_ERROR_INVALID_INPUT = 3, + NVRTC_ERROR_INVALID_PROGRAM = 4, NVRTC_ERROR_INVALID_OPTION = 5, + NVRTC_ERROR_COMPILATION = 6, NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7, + NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8, + NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9, + NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, NVRTC_ERROR_INTERNAL_ERROR = 11, + NVRTC_ERROR_TIME_FILE_WRITE_FAILED = 12 + +proc nvrtcCreateProgram*(prog: ptr nvrtcProgram; src: cstring; name: cstring; + numHeaders: cint; headers: cstringArray; + includeNames: cstringArray): nvrtcResult {.discardable, cdecl, + importc: "nvrtcCreateProgram", dynlib: libNvrtc.} + +proc nvrtcDestroyProgram*(prog: ptr nvrtcProgram): nvrtcResult {.discardable, cdecl, + importc: "nvrtcDestroyProgram", dynlib: libNvrtc.} + +proc nvrtcCompileProgram*(prog: nvrtcProgram; numOptions: cint; options: cstringArray): nvrtcResult {.discardable, + cdecl, importc: "nvrtcCompileProgram", dynlib: libNvrtc.} + +proc nvrtcGetPTXSize*(prog: nvrtcProgram; ptxSizeRet: ptr csize_t): nvrtcResult {.discardable, + cdecl, importc: "nvrtcGetPTXSize", dynlib: libNvrtc.} + +proc nvrtcGetPTX*(prog: nvrtcProgram; ptx: cstring): nvrtcResult {.discardable, cdecl, + importc: "nvrtcGetPTX", dynlib: libNvrtc.} + +proc nvrtcGetProgramLogSize*(prog: nvrtcProgram; logSizeRet: ptr csize_t): nvrtcResult {.discardable, + cdecl, importc: "nvrtcGetProgramLogSize", dynlib: libNvrtc.} + +proc nvrtcGetProgramLog*(prog: nvrtcProgram; log: cstring): nvrtcResult {.discardable, cdecl, + importc: "nvrtcGetProgramLog", dynlib: libNvrtc.} + + +###################################################################### +################################ libcudart ############################# +###################################################################### + +when defined(windows): + const + libCudaRT = "cudart.dll" +elif defined(macosx): + const + libCudaRT = "libcudart.dylib" +else: + const + libCudaRT = "libcudart.so" + +type + cudaError* = enum ## + ## The API call returned with no errors. In the case of query calls, this + ## also means that the operation being queried is complete (see + ## ::cudaEventQuery() and ::cudaStreamQuery()). + ## + cudaSuccess = 0, ## + ## This indicates that one or more of the parameters passed to the API call + ## is not within an acceptable range of values. + ## + cudaErrorInvalidValue = 1, ## + ## The API call failed because it was unable to allocate enough memory or + ## other resources to perform the requested operation. + ## + cudaErrorMemoryAllocation = 2, ## + ## The API call failed because the CUDA driver and runtime could not be + ## initialized. + ## + cudaErrorInitializationError = 3, ## + ## This indicates that a CUDA Runtime API call cannot be executed because + ## it is being called during process shut down, at a point in time after + ## CUDA driver has been unloaded. + ## + cudaErrorCudartUnloading = 4, ## + ## This indicates profiler is not initialized for this run. This can + ## happen when the application is running with external profiling tools + ## like visual profiler. + ## + cudaErrorProfilerDisabled = 5, ## + ## \deprecated + ## This error return is deprecated as of CUDA 5.0. It is no longer an error + ## to attempt to enable/disable the profiling via ::cudaProfilerStart or + ## ::cudaProfilerStop without initialization. + ## + cudaErrorProfilerNotInitialized = 6, ## + ## \deprecated + ## This error return is deprecated as of CUDA 5.0. It is no longer an error + ## to call cudaProfilerStart() when profiling is already enabled. + ## + cudaErrorProfilerAlreadyStarted = 7, ## + ## \deprecated + ## This error return is deprecated as of CUDA 5.0. It is no longer an error + ## to call cudaProfilerStop() when profiling is already disabled. + ## + cudaErrorProfilerAlreadyStopped = 8, ## + ## This indicates that a kernel launch is requesting resources that can + ## never be satisfied by the current device. Requesting more shared memory + ## per block than the device supports will trigger this error, as will + ## requesting too many threads or blocks. See ::cudaDeviceProp for more + ## device limitations. + ## + cudaErrorInvalidConfiguration = 9, ## + ## This indicates that one or more of the pitch-related parameters passed + ## to the API call is not within the acceptable range for pitch. + ## + cudaErrorInvalidPitchValue = 12, ## + ## This indicates that the symbol name/identifier passed to the API call + ## is not a valid name or identifier. + ## + cudaErrorInvalidSymbol = 13, ## + ## This indicates that at least one host pointer passed to the API call is + ## not a valid host pointer. + ## \deprecated + ## This error return is deprecated as of CUDA 10.1. + ## + cudaErrorInvalidHostPointer = 16, ## + ## This indicates that at least one device pointer passed to the API call is + ## not a valid device pointer. + ## \deprecated + ## This error return is deprecated as of CUDA 10.1. + ## + cudaErrorInvalidDevicePointer = 17, ## + ## This indicates that the texture passed to the API call is not a valid + ## texture. + ## + cudaErrorInvalidTexture = 18, ## + ## This indicates that the texture binding is not valid. This occurs if you + ## call ::cudaGetTextureAlignmentOffset() with an unbound texture. + ## + cudaErrorInvalidTextureBinding = 19, ## + ## This indicates that the channel descriptor passed to the API call is not + ## valid. This occurs if the format is not one of the formats specified by + ## ::cudaChannelFormatKind, or if one of the dimensions is invalid. + ## + cudaErrorInvalidChannelDescriptor = 20, ## + ## This indicates that the direction of the copyMem passed to the API call is + ## not one of the types specified by ::cudaMemcpyKind. + ## + cudaErrorInvalidMemcpyDirection = 21, ## + ## This indicated that the user has taken the address of a constant variable, + ## which was forbidden up until the CUDA 3.1 release. + ## \deprecated + ## This error return is deprecated as of CUDA 3.1. Variables in constant + ## memory may now have their address taken by the runtime via + ## ::cudaGetSymbolAddress(). + ## + cudaErrorAddressOfConstant = 22, ## + ## This indicated that a texture fetch was not able to be performed. + ## This was previously used for device emulation of texture operations. + ## \deprecated + ## This error return is deprecated as of CUDA 3.1. Device emulation mode was + ## removed with the CUDA 3.1 release. + ## + cudaErrorTextureFetchFailed = 23, ## + ## This indicated that a texture was not bound for access. + ## This was previously used for device emulation of texture operations. + ## \deprecated + ## This error return is deprecated as of CUDA 3.1. Device emulation mode was + ## removed with the CUDA 3.1 release. + ## + cudaErrorTextureNotBound = 24, ## + ## This indicated that a synchronization operation had failed. + ## This was previously used for some device emulation functions. + ## \deprecated + ## This error return is deprecated as of CUDA 3.1. Device emulation mode was + ## removed with the CUDA 3.1 release. + ## + cudaErrorSynchronizationError = 25, ## + ## This indicates that a non-float texture was being accessed with linear + ## filtering. This is not supported by CUDA. + ## + cudaErrorInvalidFilterSetting = 26, ## + ## This indicates that an attempt was made to read a non-float texture as a + ## normalized float. This is not supported by CUDA. + ## + cudaErrorInvalidNormSetting = 27, ## + ## Mixing of device and device emulation code was not allowed. + ## \deprecated + ## This error return is deprecated as of CUDA 3.1. Device emulation mode was + ## removed with the CUDA 3.1 release. + ## + cudaErrorMixedDeviceExecution = 28, ## + ## This indicates that the API call is not yet implemented. Production + ## releases of CUDA will never return this error. + ## \deprecated + ## This error return is deprecated as of CUDA 4.1. + ## + cudaErrorNotYetImplemented = 31, ## + ## This indicated that an emulated device pointer exceeded the 32-bit address + ## range. + ## \deprecated + ## This error return is deprecated as of CUDA 3.1. Device emulation mode was + ## removed with the CUDA 3.1 release. + ## + cudaErrorMemoryValueTooLarge = 32, ## + ## This indicates that the CUDA driver that the application has loaded is a + ## stub library. Applications that run with the stub rather than a real + ## driver loaded will resultNotKeyWord in CUDA API returning this error. + ## + cudaErrorStubLibrary = 34, ## + ## This indicates that the installed NVIDIA CUDA driver is older than the + ## CUDA runtime library. This is not a supported configuration. Users should + ## install an updated NVIDIA display driver to allow the application to run. + ## + cudaErrorInsufficientDriver = 35, ## + ## This indicates that the API call requires a newer CUDA driver than the one + ## currently installed. Users should install an updated NVIDIA CUDA driver + ## to allow the API call to succeed. + ## + cudaErrorCallRequiresNewerDriver = 36, ## + ## This indicates that the surface passed to the API call is not a valid + ## surface. + ## + cudaErrorInvalidSurface = 37, ## + ## This indicates that multiple global or constant variables (across separate + ## CUDA source files in the application) share the same string name. + ## + cudaErrorDuplicateVariableName = 43, ## + ## This indicates that multiple textures (across separate CUDA source + ## files in the application) share the same string name. + ## + cudaErrorDuplicateTextureName = 44, ## + ## This indicates that multiple surfaces (across separate CUDA source + ## files in the application) share the same string name. + ## + cudaErrorDuplicateSurfaceName = 45, ## + ## This indicates that all CUDA devices are busy or unavailable at the current + ## time. Devices are often busy/unavailable due to use of + ## ::cudaComputeModeProhibited, ::cudaComputeModeExclusiveProcess, or when long + ## running CUDA kernels have filled up the GPU and are blocking new work + ## from starting. They can also be unavailable due to memory constraints + ## on a device that already has active CUDA work being performed. + ## + cudaErrorDevicesUnavailable = 46, ## + ## This indicates that the current context is not compatible with this + ## the CUDA Runtime. This can only occur if you are using CUDA + ## Runtime/Driver interoperability and have created an existing Driver + ## context using the driver API. The Driver context may be incompatible + ## either because the Driver context was created using an older version + ## of the API, because the Runtime API call expects a primary driver + ## context and the Driver context is not primary, or because the Driver + ## context has been destroyed. Please see \ref CUDART_DRIVER "Interactions + ## with the CUDA Driver API" for more information. + ## + cudaErrorIncompatibleDriverContext = 49, ## + ## The device function being invoked (usually via ::cudaLaunchKernel()) was not + ## previously configured via the ::cudaConfigureCall() function. + ## + cudaErrorMissingConfiguration = 52, ## + ## This indicated that a previous kernel launch failed. This was previously + ## used for device emulation of kernel launches. + ## \deprecated + ## This error return is deprecated as of CUDA 3.1. Device emulation mode was + ## removed with the CUDA 3.1 release. + ## + cudaErrorPriorLaunchFailure = 53, ## + ## This error indicates that a device runtime grid launch did not occur + ## because the depth of the child grid would exceed the maximum supported + ## number of nested grid launches. + ## + cudaErrorLaunchMaxDepthExceeded = 65, ## + ## This error indicates that a grid launch did not occur because the kernel + ## uses file-scoped textures which are unsupported by the device runtime. + ## Kernels launched via the device runtime only support textures created with + ## the Texture Object API's. + ## + cudaErrorLaunchFileScopedTex = 66, ## + ## This error indicates that a grid launch did not occur because the kernel + ## uses file-scoped surfaces which are unsupported by the device runtime. + ## Kernels launched via the device runtime only support surfaces created with + ## the Surface Object API's. + ## + cudaErrorLaunchFileScopedSurf = 67, ## + ## This error indicates that a call to ::cudaDeviceSynchronize made from + ## the device runtime failed because the call was made at grid depth greater + ## than than either the default (2 levels of grids) or user specified device + ## limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on + ## launched grids at a greater depth successfully, the maximum nested + ## depth at which ::cudaDeviceSynchronize will be called must be specified + ## with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit + ## api before the host-side launch of a kernel using the device runtime. + ## Keep in mind that additional levels of sync depth require the runtime + ## to reserve large amounts of device memory that cannot be used for + ## user allocations. Note that ::cudaDeviceSynchronize made from device + ## runtime is only supported on devices of compute capability < 9.0. + ## + cudaErrorSyncDepthExceeded = 68, ## + ## This error indicates that a device runtime grid launch failed because + ## the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount. + ## For this launch to proceed successfully, ::cudaDeviceSetLimit must be + ## called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher + ## than the upper bound of outstanding launches that can be issued to the + ## device runtime. Keep in mind that raising the limit of pending device + ## runtime launches will require the runtime to reserve device memory that + ## cannot be used for user allocations. + ## + cudaErrorLaunchPendingCountExceeded = 69, ## + ## The requested device function does not exist or is not compiled for the + ## proper device architecture. + ## + cudaErrorInvalidDeviceFunction = 98, ## + ## This indicates that no CUDA-capable devices were detected by the installed + ## CUDA driver. + ## + cudaErrorNoDevice = 100, ## + ## This indicates that the device ordinal supplied by the user does not + ## correspond to a valid CUDA device or that the action requested is + ## invalid for the specified device. + ## + cudaErrorInvalidDevice = 101, ## + ## This indicates that the device doesn't have a valid Grid License. + ## + cudaErrorDeviceNotLicensed = 102, ## + ## By default, the CUDA runtime may perform a minimal set of self-tests, + ## as well as CUDA driver tests, to establish the validity of both. + ## Introduced in CUDA 11.2, this error return indicates that at least one + ## of these tests has failed and the validity of either the runtime + ## or the driver could not be established. + ## + cudaErrorSoftwareValidityNotEstablished = 103, ## + ## This indicates an internal startup failure in the CUDA runtime. + ## + cudaErrorStartupFailure = 127, ## + ## This indicates that the device kernel image is invalid. + ## + cudaErrorInvalidKernelImage = 200, ## + ## This most frequently indicates that there is no context bound to the + ## current thread. This can also be returned if the context passed to an + ## API call is not a valid handle (such as a context that has had + ## ::cuCtxDestroy() invoked on it). This can also be returned if a user + ## mixes different API versions (i.e. 3010 context with 3020 API calls). + ## See ::cuCtxGetApiVersion() for more details. + ## + cudaErrorDeviceUninitialized = 201, ## + ## This indicates that the buffer object could not be mapped. + ## + cudaErrorMapBufferObjectFailed = 205, ## + ## This indicates that the buffer object could not be unmapped. + ## + cudaErrorUnmapBufferObjectFailed = 206, ## + ## This indicates that the specified array is currently mapped and thus + ## cannot be destroyed. + ## + cudaErrorArrayIsMapped = 207, ## + ## This indicates that the resource is already mapped. + ## + cudaErrorAlreadyMapped = 208, ## + ## This indicates that there is no kernel image available that is suitable + ## for the device. This can occur when a user specifies code generation + ## options for a particular CUDA source file that do not include the + ## corresponding device configuration. + ## + cudaErrorNoKernelImageForDevice = 209, ## + ## This indicates that a resource has already been acquired. + ## + cudaErrorAlreadyAcquired = 210, ## + ## This indicates that a resource is not mapped. + ## + cudaErrorNotMapped = 211, ## + ## This indicates that a mapped resource is not available for access as an + ## array. + ## + cudaErrorNotMappedAsArray = 212, ## + ## This indicates that a mapped resource is not available for access as a + ## pointer. + ## + cudaErrorNotMappedAsPointer = 213, ## + ## This indicates that an uncorrectable ECC error was detected during + ## execution. + ## + cudaErrorECCUncorrectable = 214, ## + ## This indicates that the ::cudaLimit passed to the API call is not + ## supported by the active device. + ## + cudaErrorUnsupportedLimit = 215, ## + ## This indicates that a call tried to access an exclusive-thread device that + ## is already in use by a different thread. + ## + cudaErrorDeviceAlreadyInUse = 216, ## + ## This error indicates that P2P access is not supported across the given + ## devices. + ## + cudaErrorPeerAccessUnsupported = 217, ## + ## A PTX compilation failed. The runtime may fall back to compiling PTX if + ## an application does not contain a suitable binary for the current device. + ## + cudaErrorInvalidPtx = 218, ## + ## This indicates an error with the OpenGL or DirectX context. + ## + cudaErrorInvalidGraphicsContext = 219, ## + ## This indicates that an uncorrectable NVLink error was detected during the + ## execution. + ## + cudaErrorNvlinkUncorrectable = 220, ## + ## This indicates that the PTX JIT compiler library was not found. The JIT Compiler + ## library is used for PTX compilation. The runtime may fall back to compiling PTX + ## if an application does not contain a suitable binary for the current device. + ## + cudaErrorJitCompilerNotFound = 221, ## + ## This indicates that the provided PTX was compiled with an unsupported toolchain. + ## The most common reason for this, is the PTX was generated by a compiler newer + ## than what is supported by the CUDA driver and PTX JIT compiler. + ## + cudaErrorUnsupportedPtxVersion = 222, ## + ## This indicates that the JIT compilation was disabled. The JIT compilation compiles + ## PTX. The runtime may fall back to compiling PTX if an application does not contain + ## a suitable binary for the current device. + ## + cudaErrorJitCompilationDisabled = 223, ## + ## This indicates that the provided execution affinity is not supported by the device. + ## + cudaErrorUnsupportedExecAffinity = 224, ## + ## This indicates that the code to be compiled by the PTX JIT contains + ## unsupported call to cudaDeviceSynchronize. + ## + cudaErrorUnsupportedDevSideSync = 225, ## + ## This indicates that the device kernel source is invalid. + ## + cudaErrorInvalidSource = 300, ## + ## This indicates that the file specified was not found. + ## + cudaErrorFileNotFound = 301, ## + ## This indicates that a link to a shared object failed to resolve. + ## + cudaErrorSharedObjectSymbolNotFound = 302, ## + ## This indicates that initialization of a shared object failed. + ## + cudaErrorSharedObjectInitFailed = 303, ## + ## This error indicates that an OS call failed. + ## + cudaErrorOperatingSystem = 304, ## + ## This indicates that a resource handle passed to the API call was not + ## valid. Resource handles are opaque types like ::cudaStream_t and + ## ::cudaEvent_t. + ## + cudaErrorInvalidResourceHandle = 400, ## + ## This indicates that a resource required by the API call is not in a + ## valid state to perform the requested operation. + ## + cudaErrorIllegalState = 401, ## + ## This indicates an attempt was made to introspect an object in a way that + ## would discard semantically important information. This is either due to + ## the object using funtionality newer than the API version used to + ## introspect it or omission of optional return arguments. + ## + cudaErrorLossyQuery = 402, ## + ## This indicates that a named symbol was not found. Examples of symbols + ## are global/constant variable names, driver function names, texture names, + ## and surface names. + ## + cudaErrorSymbolNotFound = 500, ## + ## This indicates that asynchronous operations issued previously have not + ## completed yet. This resultNotKeyWord is not actually an error, but must be indicated + ## differently than ::cudaSuccess (which indicates completion). Calls that + ## may return this value include ::cudaEventQuery() and ::cudaStreamQuery(). + ## + cudaErrorNotReady = 600, ## + ## The device encountered a load or store instruction on an invalid memory address. + ## This leaves the process in an inconsistent state and any further CUDA work + ## will return the same error. To continue using CUDA, the process must be terminated + ## and relaunched. + ## + cudaErrorIllegalAddress = 700, ## + ## This indicates that a launch did not occur because it did not have + ## appropriate resources. Although this error is similar to + ## ::cudaErrorInvalidConfiguration, this error usually indicates that the + ## user has attempted to pass too many arguments to the device kernel, or the + ## kernel launch specifies too many threads for the kernel's register count. + ## + cudaErrorLaunchOutOfResources = 701, ## + ## This indicates that the device kernel took too long to execute. This can + ## only occur if timeouts are enabled - see the device property + ## \ref + ## ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled" + ## for more information. + ## This leaves the process in an inconsistent state and any further CUDA work + ## will return the same error. To continue using CUDA, the process must be terminated + ## and relaunched. + ## + cudaErrorLaunchTimeout = 702, ## + ## This error indicates a kernel launch that uses an incompatible texturing + ## mode. + ## + cudaErrorLaunchIncompatibleTexturing = 703, ## + ## This error indicates that a call to ::cudaDeviceEnablePeerAccess() is + ## trying to re-enable peer addressing on from a context which has already + ## had peer addressing enabled. + ## + cudaErrorPeerAccessAlreadyEnabled = 704, ## + ## This error indicates that ::cudaDeviceDisablePeerAccess() is trying to + ## disable peer addressing which has not been enabled yet via + ## ::cudaDeviceEnablePeerAccess(). + ## + cudaErrorPeerAccessNotEnabled = 705, ## + ## This indicates that the user has called ::cudaSetValidDevices(), + ## ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), + ## ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or + ## ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by + ## calling non-device management operations (allocating memory and + ## launching kernels are examples of non-device management operations). + ## This error can also be returned if using runtime/driver + ## interoperability and there is an existing ::CUcontext active on the + ## host thread. + ## + cudaErrorSetOnActiveProcess = 708, ## + ## This error indicates that the context current to the calling thread + ## has been destroyed using ::cuCtxDestroy, or is a primary context which + ## has not yet been initialized. + ## + cudaErrorContextIsDestroyed = 709, ## + ## An assert triggered in device code during kernel execution. The device + ## cannot be used again. All existing allocations are invalid. To continue + ## using CUDA, the process must be terminated and relaunched. + ## + cudaErrorAssert = 710, ## + ## This error indicates that the hardware resources required to enable + ## peer access have been exhausted for one or more of the devices + ## passed to ::cudaEnablePeerAccess(). + ## + cudaErrorTooManyPeers = 711, ## + ## This error indicates that the memory range passed to ::cudaHostRegister() + ## has already been registered. + ## + cudaErrorHostMemoryAlreadyRegistered = 712, ## + ## This error indicates that the pointer passed to ::cudaHostUnregister() + ## does not correspond to any currently registered memory region. + ## + cudaErrorHostMemoryNotRegistered = 713, ## + ## Device encountered an error in the call stack during kernel execution, + ## possibly due to stack corruption or exceeding the stack size limit. + ## This leaves the process in an inconsistent state and any further CUDA work + ## will return the same error. To continue using CUDA, the process must be terminated + ## and relaunched. + ## + cudaErrorHardwareStackError = 714, ## + ## The device encountered an illegal instruction during kernel execution + ## This leaves the process in an inconsistent state and any further CUDA work + ## will return the same error. To continue using CUDA, the process must be terminated + ## and relaunched. + ## + cudaErrorIllegalInstruction = 715, ## + ## The device encountered a load or store instruction + ## on a memory address which is not aligned. + ## This leaves the process in an inconsistent state and any further CUDA work + ## will return the same error. To continue using CUDA, the process must be terminated + ## and relaunched. + ## + cudaErrorMisalignedAddress = 716, ## + ## While executing a kernel, the device encountered an instruction + ## which can only operate on memory locations in certain address spaces + ## (global, shared, or local), but was supplied a memory address not + ## belonging to an allowed address space. + ## This leaves the process in an inconsistent state and any further CUDA work + ## will return the same error. To continue using CUDA, the process must be terminated + ## and relaunched. + ## + cudaErrorInvalidAddressSpace = 717, ## + ## The device encountered an invalid program counter. + ## This leaves the process in an inconsistent state and any further CUDA work + ## will return the same error. To continue using CUDA, the process must be terminated + ## and relaunched. + ## + cudaErrorInvalidPc = 718, ## + ## An exception occurred on the device while executing a kernel. Common + ## causes include dereferencing an invalid device pointer and accessing + ## out of bounds shared memory. Less common cases can be system specific - more + ## information about these cases can be found in the system specific user guide. + ## This leaves the process in an inconsistent state and any further CUDA work + ## will return the same error. To continue using CUDA, the process must be terminated + ## and relaunched. + ## + cudaErrorLaunchFailure = 719, ## + ## This error indicates that the number of blocks launched per grid for a kernel that was + ## launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice + ## exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + ## or + ## ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + ## as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + ## + cudaErrorCooperativeLaunchTooLarge = 720, ## + ## This error indicates the attempted operation is not permitted. + ## + cudaErrorNotPermitted = 800, ## + ## This error indicates the attempted operation is not supported + ## on the current system or device. + ## + cudaErrorNotSupported = 801, ## + ## This error indicates that the system is not yet ready to start any CUDA + ## work. To continue using CUDA, verify the system configuration is in a + ## valid state and all required driver daemons are actively running. + ## More information about this error can be found in the system specific + ## user guide. + ## + cudaErrorSystemNotReady = 802, ## + ## This error indicates that there is a mismatch between the versions of + ## the display driver and the CUDA driver. Refer to the compatibility documentation + ## for supported versions. + ## + cudaErrorSystemDriverMismatch = 803, ## + ## This error indicates that the system was upgraded to run with forward compatibility + ## but the visible hardware detected by CUDA does not support this configuration. + ## Refer to the compatibility documentation for the supported hardware matrix or ensure + ## that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES + ## environment variable. + ## + cudaErrorCompatNotSupportedOnDevice = 804, ## + ## This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. + ## + cudaErrorMpsConnectionFailed = 805, ## + ## This error indicates that the remote procedural call between the MPS server and the MPS client failed. + ## + cudaErrorMpsRpcFailure = 806, ## + ## This error indicates that the MPS server is not ready to accept new MPS client requests. + ## This error can be returned when the MPS server is in the process of recovering from a fatal failure. + ## + cudaErrorMpsServerNotReady = 807, ## + ## This error indicates that the hardware resources required to create MPS client have been exhausted. + ## + cudaErrorMpsMaxClientsReached = 808, ## + ## This error indicates the the hardware resources required to device connections have been exhausted. + ## + cudaErrorMpsMaxConnectionsReached = 809, ## + ## This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched. + ## + cudaErrorMpsClientTerminated = 810, ## + ## This error indicates, that the program is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it. + ## + cudaErrorCdpNotSupported = 811, ## + ## This error indicates, that the program contains an unsupported interaction between different versions of CUDA Dynamic Parallelism. + ## + cudaErrorCdpVersionMismatch = 812, ## + ## The operation is not permitted when the stream is capturing. + ## + cudaErrorStreamCaptureUnsupported = 900, ## + ## The current capture sequence on the stream has been invalidated due to + ## a previous error. + ## + cudaErrorStreamCaptureInvalidated = 901, ## + ## The operation would have resulted in a merge of two independent capture + ## sequences. + ## + cudaErrorStreamCaptureMerge = 902, ## + ## The capture was not initiated in this stream. + ## + cudaErrorStreamCaptureUnmatched = 903, ## + ## The capture sequence contains a fork that was not joined to the primary + ## stream. + ## + cudaErrorStreamCaptureUnjoined = 904, ## + ## A dependency would have been created which crosses the capture sequence + ## boundary. Only implicit in-stream ordering dependencies are allowed to + ## cross the boundary. + ## + cudaErrorStreamCaptureIsolation = 905, ## + ## The operation would have resulted in a disallowed implicit dependency on + ## a current capture sequence from cudaStreamLegacy. + ## + cudaErrorStreamCaptureImplicit = 906, ## + ## The operation is not permitted on an event which was last recorded in a + ## capturing stream. + ## + cudaErrorCapturedEvent = 907, ## + ## A stream capture sequence not initiated with the ::cudaStreamCaptureModeRelaxed + ## argument to ::cudaStreamBeginCapture was passed to ::cudaStreamEndCapture in a + ## different thread. + ## + cudaErrorStreamCaptureWrongThread = 908, ## + ## This indicates that the wait operation has timed out. + ## + cudaErrorTimeout = 909, ## + ## This error indicates that the graph update was not performed because it included + ## changes which violated constraints specific to instantiated graph update. + ## + cudaErrorGraphExecUpdateFailure = 910, ## + ## This indicates that an async error has occurred in a device outside of CUDA. + ## If CUDA was waiting for an external device's signal before consuming shared data, + ## the external device signaled an error indicating that the data is not valid for + ## consumption. This leaves the process in an inconsistent state and any further CUDA + ## work will return the same error. To continue using CUDA, the process must be + ## terminated and relaunched. + ## + cudaErrorExternalDevice = 911, ## + ## This indicates that a kernel launch error has occurred due to cluster + ## misconfiguration. + ## + cudaErrorInvalidClusterSize = 912, ## + ## This indicates that an unknown internal error has occurred. + ## + cudaErrorUnknown = 999, ## + ## Any unhandled CUDA driver error is added to this value and returned via + ## the runtime. Production releases of CUDA should not return such errors. + ## \deprecated + ## This error return is deprecated as of CUDA 4.1. + ## + cudaErrorApiFailureBase = 10000 + + cudaError_t* = cudaError + + CUstream_st = object + cudaStream_t* = ptr CUstream_st + + CUevent_st = object + cudaEvent_t* = ptr CUevent_st + + + CUuuid_st* {.bycopy.} = object + ## < CUDA definition of UUID + bytes*: array[16, char] + + CUuuid* = CUuuid_st + cudaUUID_t* = CUuuid_st + + cudaDeviceProp* {.bycopy.} = object + name*: array[256, char] + ## < ASCII string identifying device + uuid*: cudaUUID_t + ## < 16-byte unique identifier + luid*: array[8, char] + ## < 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms + luidDeviceNodeMask*: cuint + ## < LUID device node mask. Value is undefined on TCC and non-Windows platforms + totalGlobalMem*: csize_t + ## < Global memory available on device in bytes + sharedMemPerBlock*: csize_t + ## < Shared memory available per block in bytes + regsPerBlock*: cint + ## < 32-bit registers available per block + warpSize*: cint + ## < Warp size in threads + memPitch*: csize_t + ## < Maximum pitch in bytes allowed by memory copies + maxThreadsPerBlock*: cint + ## < Maximum number of threads per block + maxThreadsDim*: array[3, cint] + ## < Maximum size of each dimension of a block + maxGridSize*: array[3, cint] + ## < Maximum size of each dimension of a grid + clockRate*: cint + ## < Deprecated, Clock frequency in kilohertz + totalConstMem*: csize_t + ## < Constant memory available on device in bytes + major*: cint + ## < Major compute capability + minor*: cint + ## < Minor compute capability + textureAlignment*: csize_t + ## < Alignment requirement for textures + texturePitchAlignment*: csize_t + ## < Pitch alignment requirement for texture references bound to pitched memory + deviceOverlap*: cint + ## < Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. + multiProcessorCount*: cint + ## < Number of multiprocessors on device + kernelExecTimeoutEnabled*: cint + ## < Deprecated, Specified whether there is a run time limit on kernels + integrated*: cint + ## < Device is integrated as opposed to discrete + canMapHostMemory*: cint + ## < Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer + computeMode*: cint + ## < Deprecated, Compute mode (See ::cudaComputeMode) + maxTexture1D*: cint + ## < Maximum 1D texture size + maxTexture1DMipmap*: cint + ## < Maximum 1D mipmapped texture size + maxTexture1DLinear*: cint + ## < Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. + maxTexture2D*: array[2, cint] + ## < Maximum 2D texture dimensions + maxTexture2DMipmap*: array[2, cint] + ## < Maximum 2D mipmapped texture dimensions + maxTexture2DLinear*: array[3, cint] + ## < Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory + maxTexture2DGather*: array[2, cint] + ## < Maximum 2D texture dimensions if texture gather operations have to be performed + maxTexture3D*: array[3, cint] + ## < Maximum 3D texture dimensions + maxTexture3DAlt*: array[3, cint] + ## < Maximum alternate 3D texture dimensions + maxTextureCubemap*: cint + ## < Maximum Cubemap texture dimensions + maxTexture1DLayered*: array[2, cint] + ## < Maximum 1D layered texture dimensions + maxTexture2DLayered*: array[3, cint] + ## < Maximum 2D layered texture dimensions + maxTextureCubemapLayered*: array[2, cint] + ## < Maximum Cubemap layered texture dimensions + maxSurface1D*: cint + ## < Maximum 1D surface size + maxSurface2D*: array[2, cint] + ## < Maximum 2D surface dimensions + maxSurface3D*: array[3, cint] + ## < Maximum 3D surface dimensions + maxSurface1DLayered*: array[2, cint] + ## < Maximum 1D layered surface dimensions + maxSurface2DLayered*: array[3, cint] + ## < Maximum 2D layered surface dimensions + maxSurfaceCubemap*: cint + ## < Maximum Cubemap surface dimensions + maxSurfaceCubemapLayered*: array[2, cint] + ## < Maximum Cubemap layered surface dimensions + surfaceAlignment*: csize_t + ## < Alignment requirements for surfaces + concurrentKernels*: cint + ## < Device can possibly execute multiple kernels concurrently + ECCEnabled*: cint + ## < Device has ECC support enabled + pciBusID*: cint + ## < PCI bus ID of the device + pciDeviceID*: cint + ## < PCI device ID of the device + pciDomainID*: cint + ## < PCI domain ID of the device + tccDriver*: cint + ## < 1 if device is a Tesla device using TCC driver, 0 otherwise + asyncEngineCount*: cint + ## < Number of asynchronous engines + unifiedAddressing*: cint + ## < Device shares a unified address space with the host + memoryClockRate*: cint + ## < Deprecated, Peak memory clock frequency in kilohertz + memoryBusWidth*: cint + ## < Global memory bus width in bits + l2CacheSize*: cint + ## < Size of L2 cache in bytes + persistingL2CacheMaxSize*: cint + ## < Device's maximum l2 persisting lines capacity setting in bytes + maxThreadsPerMultiProcessor*: cint + ## < Maximum resident threads per multiprocessor + streamPrioritiesSupported*: cint + ## < Device supports stream priorities + globalL1CacheSupported*: cint + ## < Device supports caching globals in L1 + localL1CacheSupported*: cint + ## < Device supports caching locals in L1 + sharedMemPerMultiprocessor*: csize_t + ## < Shared memory available per multiprocessor in bytes + regsPerMultiprocessor*: cint + ## < 32-bit registers available per multiprocessor + managedMemory*: cint + ## < Device supports allocating managed memory on this system + isMultiGpuBoard*: cint + ## < Device is on a multi-GPU board + multiGpuBoardGroupID*: cint + ## < Unique identifier for a group of devices on the same multi-GPU board + hostNativeAtomicSupported*: cint + ## < Link between the device and the host supports native atomic operations + singleToDoublePrecisionPerfRatio*: cint + ## < Deprecated, Ratio of single precision performance (in floating-point operations per second) to double precision performance + pageableMemoryAccess*: cint + ## < Device supports coherently accessing pageable memory without calling cudaHostRegister on it + concurrentManagedAccess*: cint + ## < Device can coherently access managed memory concurrently with the CPU + computePreemptionSupported*: cint + ## < Device supports Compute Preemption + canUseHostPointerForRegisteredMem*: cint + ## < Device can access host registered memory at the same virtual address as the CPU + cooperativeLaunch*: cint + ## < Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel + cooperativeMultiDeviceLaunch*: cint + ## < Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. + sharedMemPerBlockOptin*: csize_t + ## < Per device maximum shared memory per block usable by special opt in + pageableMemoryAccessUsesHostPageTables*: cint + ## < Device accesses pageable memory via the host's page tables + directManagedMemAccessFromHost*: cint + ## < Host can directly access managed memory on the device without migration. + maxBlocksPerMultiProcessor*: cint + ## < Maximum number of resident blocks per multiprocessor + accessPolicyMaxWindowSize*: cint + ## < The maximum value of ::cudaAccessPolicyWindow::num_bytes. + reservedSharedMemPerBlock*: csize_t + ## < Shared memory reserved by CUDA driver per block in bytes + hostRegisterSupported*: cint + ## < Device supports host memory registration via ::cudaHostRegister. + sparseCudaArraySupported*: cint + ## < 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays, 0 otherwise + hostRegisterReadOnlySupported*: cint + ## < Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU + timelineSemaphoreInteropSupported*: cint + ## < External timeline semaphore interop is supported on the device + memoryPoolsSupported*: cint + ## < 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, 0 otherwise + gpuDirectRDMASupported*: cint + ## < 1 if the device supports GPUDirect RDMA APIs, 0 otherwise + gpuDirectRDMAFlushWritesOptions*: cuint + ## < Bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum + gpuDirectRDMAWritesOrdering*: cint + ## < See the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values + memoryPoolSupportedHandleTypes*: cuint + ## < Bitmask of handle types supported with mempool-based IPC + deferredMappingCudaArraySupported*: cint + ## < 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays + ipcEventSupported*: cint + ## < Device supports IPC Events. + clusterLaunch*: cint + ## < Indicates device supports cluster launch + unifiedFunctionPointers*: cint + ## < Indicates device supports unified pointers + reserved2*: array[2, cint] + reserved1*: array[1, cint] + ## < Reserved for future use + reserved*: array[60, cint] + ## < Reserved for future use + + ## + ## CUDA memory copy types + ## + cudaMemcpyKind* = enum + cudaMemcpyHostToHost = 0, ## < Host -> Host + cudaMemcpyHostToDevice = 1, ## < Host -> Device + cudaMemcpyDeviceToHost = 2, ## < Device -> Host + cudaMemcpyDeviceToDevice = 3, ## < Device -> Device + cudaMemcpyDefault = 4 ## < Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing + + +proc cudaRuntimeGetVersion*(runtimeVersion: ptr cint): cudaError_t {.cdecl, + importc: "cudaRuntimeGetVersion", dynlib: libCudaRT.} + +proc cudaGetDeviceProperties*(prop: ptr cudaDeviceProp; device: cint): cudaError_t {. + cdecl, importc: "cudaGetDeviceProperties", dynlib: libCudaRT.} + +proc cudaEventCreate*(event: ptr cudaEvent_t): cudaError_t {.cdecl, + importc: "cudaEventCreate", dynlib: libCudaRT.} + +proc cudaEventRecord*(event: cudaEvent_t; stream: cudaStream_t): cudaError_t {. + cdecl, importc: "cudaEventRecord", dynlib: libCudaRT.} + +proc cudaEventSynchronize*(event: cudaEvent_t): cudaError_t {.cdecl, + importc: "cudaEventSynchronize", dynlib: libCudaRT.} + +proc cudaDeviceSynchronize*(): cudaError_t {.cdecl, + importc: "cudaDeviceSynchronize", dynlib: libCudaRT.} + +proc cudaEventElapsedTime*(ms: ptr cfloat; start: cudaEvent_t; `end`: cudaEvent_t): cudaError_t {. + cdecl, importc: "cudaEventElapsedTime", dynlib: libCudaRT.} + +proc cudaEventDestroy*(event: cudaEvent_t): cudaError_t {.cdecl, + importc: "cudaEventDestroy", dynlib: libCudaRT.} + +proc cudaMemcpyToSymbol*(symbol: pointer, + src: pointer, + count, offset: csize_t, + kind: cudaMemcpyKind = cudaMemcpyHostToDevice): cudaError_t {. + cdecl, importc: "cudaMemcpyToSymbol", dynlib: libCudaRT.} + +proc cudaGetSymbolAddress*(devPtr: ptr CUdeviceptr, + symbol: pointer): cudaError_t {.cdecl, + importc: "cudaGetSymbolAddress", dynlib: libCudaRT.} + + + +###################################################################### +################################ Utilities ########################### +###################################################################### + + +template check*(status: CUresult, quitOnFailure = true) = + ## Check the status code of a CUDA operation + ## Exit program with error if failure + + let code = status # ensure that the input expression is evaluated once only + if code != CUDA_SUCCESS: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + if quitOnFailure: + quit 1 + +template check*(a: sink nvrtcResult, quitOnFailure = true) = + let code = a + if code != NVRTC_SUCCESS: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + if quitOnFailure: + quit 1 + +template check*(a: sink cudaError_t, quitOnFailure = true) = + let code = a + if code != cudaSuccess: + writeStackTrace() + stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n') + if quitOnFailure: + quit 1 diff --git a/constantine/platforms/static_for.nim b/constantine/platforms/static_for.nim index c83ba3630..6f9179fdd 100644 --- a/constantine/platforms/static_for.nim +++ b/constantine/platforms/static_for.nim @@ -34,6 +34,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped ident("unrolledIter_" & $idx & $i), body.replaceNodes(idx, newLit i)) +macro staticForStepped*(idx: untyped{nkIdent}, start, stopEx, increment: static int, body: untyped): untyped = + ## Version of `staticFor` which takes an increment != 1. + result = newStmtList() + for i in countup(start, stopEx - increment, increment): + result.add nnkBlockStmt.newTree( + ident("unrolledIter_" & $idx & $i), + body.replaceNodes(idx, newLit i)) + macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped = result = newStmtList() for i in countdown(start, stopIncl): diff --git a/tests/gpu/t_mul.nim b/tests/gpu/t_mul.nim index d93538545..9c17362a4 100644 --- a/tests/gpu/t_mul.nim +++ b/tests/gpu/t_mul.nim @@ -42,4 +42,4 @@ testName(Fp[BN254_Snarks], 32, a, b) # We get incorrect result for modular multiplication with 64-bit limbs due to a fused-multiuply-add with carry bug. # # - https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py -# - https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 \ No newline at end of file +# - https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067 diff --git a/tests/gpu/t_nvrtc_bigint_example.nim b/tests/gpu/t_nvrtc_bigint_example.nim new file mode 100644 index 000000000..8a457598a --- /dev/null +++ b/tests/gpu/t_nvrtc_bigint_example.nim @@ -0,0 +1,252 @@ +import std / strformat + +import + # Internal + constantine/named/algebras, + constantine/math/io/[io_bigints, io_fields], + constantine/math/arithmetic, + constantine/platforms/abstractions {.all.}, + constantine/platforms/abis/nvidia_abi, + constantine/math_compiler/experimental/runtime_compile, + constantine/serialization/io_limbs + +type T = Fp[BN254_Snarks] +const WordSize = 32 +# Example showing warp behavior with different thread counts +const BigIntExample = cuda: + # Utility for add with carry operations + type + BigInt = object + limbs: array[8, uint32] + template `[]`(x: BigInt, idx: int): untyped = x.limbs[idx] + template `[]=`(x: BigInt, idx: int, val: uint32): untyped = x.limbs[idx] = val + template len(x: BigInt): int = 8 # static: BigInt().limbs.len + + # Need to get the limbs & spare bits data in a static context + template getFieldModulus(): untyped = static: T.getModulus().limbs + template spareBits(): untyped = static: (BigInt().limbs.len * WordSize - T.bits()) + + ## Note: the below would just be generated from a macro of course, similar to + ## `constantine/platforms/llvm/asm_nvidia.nim`. + proc add_cio(a, b: uint32): uint32 {.device, forceinline.} = + var res: uint32 + asm """ +"addc.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc add_ci(a, b: uint32): uint32 {.device, forceinline.} = + var res: uint32 + asm """ +"addc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc add_co(a, b: uint32): uint32 {.device, forceinline.} = + var res: uint32 + asm """ +"add.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc sub_bo(a, b: uint32): uint32 {.device, forceinline.} = + var res: uint32 + asm """ +"sub.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc sub_bi(a, b: uint32): uint32 {.device, forceinline.} = + var res: uint32 + asm """ +"subc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc sub_bio(a, b: uint32): uint32 {.device, forceinline.} = + var res: uint32 + asm """ +"subc.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b) +""" + return res + + proc slct(a, b: uint32, pred: uint32): uint32 {.device, forceinline.} = + var res: uint32 +# "slct.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(pred) + asm """ +"slct.u32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(pred) +""" + return res + + proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} = + ## If a >= Modulus: r <- a-M + ## else: r <- a + ## + ## This is constant-time straightline code. + ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. + ## + ## To be used when the final substraction can + ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256) + let N = a.len + var scratch: BigInt + + # Contains 0x0001 (if overflowed limbs) or 0x0000 + let overflowedLimbs = add_ci(0'u32, 0'u32) + + # Now substract the modulus, and test a < M with the last borrow + scratch[0] = sub_bo(a[0], M[0]) + for i in 1 ..< N: + scratch[i] = sub_bio(a[i], M[i]) + + # 1. if `overflowedLimbs`, underflowedModulus >= 0 + # 2. if a >= M, underflowedModulus >= 0 + # if underflowedModulus >= 0: a-M else: a + # TODO: predicated mov instead? + let underflowedModulus = sub_bi(overflowedLimbs, 0'u32) + + var r: BigInt + for i in 0 ..< N: + r[i] = slct(scratch[i], a[i], underflowedModulus) + return r + + proc finalSubNoOverflow(a, M: BigInt): BigInt {.device.} = + ## If a >= Modulus: r <- a-M + ## else: r <- a + ## + ## This is constant-time straightline code. + ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU. + ## + ## To be used when the modulus does not use the full bitwidth of the storing words + ## (say using 255 bits for the modulus out of 256 available in words) + let N = a.len + var scratch: BigInt + + # Now substract the modulus, and test a < M with the last borrow + scratch[0] = sub_bo(a[0], M[0]) + for i in 1 ..< N: + scratch[i] = sub_bio(a[i], M[i]) + + # If it underflows here, `a` was smaller than the modulus, which is what we want + let underflowedModulus = sub_bi(0'u32, 0'u32) + + var r: BigInt + for i in 0 ..< N: + r[i] = slct(scratch[i], a[i], underflowedModulus) + return r + + proc modadd(a, b, M: BigInt): BigInt {.device.} = + # try to add two bigints + let N = a.len + #var res: BigInt[N] + var res: BigInt + + var t: BigInt # temporary + + t[0] = add_co(a[0], b[0]) + for i in 1 ..< N: + t[i] = add_cio(a[i], b[i]) + printf("element i %d = %d\n", i, t[i]) + + # can use `when` of course! + when spareBits() >= 1: # if spareBits() >= 1: # would also work + t = finalSubNoOverflow(t, M) + else: + t = finalSubMayOverflow(t, M) + + return t + + #proc bigintTest(output: ptr UncheckedArray[uint32], aIn, bIn: ptr BigInt) {.global.} = + proc bigintTest(output: ptr UncheckedArray[uint32], a, b: BigInt) {.global.} = + # Get global thread ID for example + let tid = blockIdx.x * blockDim.x + threadIdx.x + # or warp ID and lane ID + let warp_id = threadIdx.x div 32 + let lane_id = threadIdx.x mod 32 + + ## Example: Construct via static array would work: + #let b = BigInt(limbs: [1'u32, 2, 3, 4, 5, 6, 7, 8]) + + ## If the bigints are passed as ptrs: + #let a = aIn[] + #let b = bIn[] + for i in 0 ..< 8: # print an input + printf("b: %d = %u\n", i, b[i]) + + let M64 = getFieldModulus() # need a let variable, otherwise modulus does not have an address + # Cast the 64bit limbs of field modulus to 32bit limbs to copy + var data = cast[ptr UncheckedArray[uint32]](addr M64[0]) + ## NOTE: you cannot do `BigInt(limbs: data)`. Leads to invalid C/CUDA code. We might turn calls + ## like that into memcpy in the future. + var M = BigInt() + ## Or copy data from a runtime array + for i in 0 ..< 8: + M[i] = data[i] + ## also works of course (in which case cast is not needed) + #memcpy(addr M[0], addr M64[0], sizeof(M64)) + + for i in 0 ..< 4: # let's print M64 as uint64 values + printf("M64: %d = %llu\n", i, M64[i]) + for i in 0 ..< 8: # let's print m64 as 32 bit data and assigned + printf("M: %d = %u\n", i, M[i]) + printf("M64 as 32: %d = %u\n", i, data[i]) + + # Call `modadd` and assign to result + let res = modadd(a, b, M) + for i in 0 ..< b.len: + output[i] = res[i] + +proc getBigints(): (Fp[BN254_Snarks], Fp[BN254_Snarks]) = + # return some bigint values of a finite field + let a = Fp[BN254_Snarks].fromUInt(1'u32) + let b = Fp[BN254_Snarks].fromHex("0x2beb0d0d6115007676f30bcc462fe814bf81198848f139621a3e9fa454fe8e6a") + + result = (a, b) + +proc main = + var nvrtc = initNvrtc(BigIntExample) + # echo the generated CUDA code + echo BigIntExample + + nvrtc.compile() + nvrtc.getPtx() + + var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead) + nvrtc.numBlocks = 1 + nvrtc.threadsPerBlock = 1 + + let (a, b) = getBigInts() + + ## If arguments were `prt BigInt`: + #nvrtc.execute("bigintTest", (hOut), (addr a, addr b)) + # for regular `BigInt` arguments + nvrtc.execute("bigintTest", (hOut), (a, b)) + + ## Compare with expected + let exp = a + b + # Get expected as array of 8 uint32 + let expU32: array[8, uint32] = cast[ptr array[8, uint32]](exp.mres.limbs[0].addr)[] + + # both arrays must match + doAssert hOut == expU32 + + # now compare as field elements + # Things to note: + # - the `modadd` `hOut` data is in Montgomery representation + # - we need to convert `array[8, uint32]` into `array[32, byte]` + # to unmarshal into a `BigInt[254]` + # - need to undo Montgomery representation on the `BigInt[254]` before + # constructing the finite field element + var res: Fp[BN254_Snarks] + var resBI: matchingBigInt(BN254_Snarks) + var hOutBytes: array[32, byte] + hOutBytes.marshal(hOut, 32, littleEndian) # convert to 32 bytes + resBI.unmarshal(hOutBytes, littleEndian) # convert bytes to BigInt[254] + type T = Fp[BN254_Snarks] + # undo Montgomery representation + resBI.fromMont(resBI, T.getModulus(), T.getNegInvModWord(), T.getSpareBits()) + res.fromBig(resBI) # convert `BigInt[254]` to finite field element + + doAssert bool(res == exp) + +when isMainModule: + main() diff --git a/tests/gpu/t_nvrtc_inbuilt_modadd.nim b/tests/gpu/t_nvrtc_inbuilt_modadd.nim new file mode 100644 index 000000000..bdf8f87bd --- /dev/null +++ b/tests/gpu/t_nvrtc_inbuilt_modadd.nim @@ -0,0 +1,135 @@ +import std / strformat + +import + # Internal + constantine/named/algebras, + constantine/math/io/[io_bigints, io_fields], + constantine/math/arithmetic, + constantine/platforms/abstractions {.all.}, + constantine/platforms/abis/nvidia_abi, + constantine/math_compiler/experimental/runtime_compile, + constantine/serialization/io_limbs + +import constantine/math_compiler/experimental/nvrtc_field_ops + +const N = 8 +type T = Fp[BN254_Snarks] +const WordSize = 32 +const BigIntExample* = cuda: + + defBigInt(N) + defPtxHelpers() + defCoreFieldOps(T) + + #proc bigintTest(output: ptr UncheckedArray[uint32], aIn, bIn: ptr BigInt) {.global.} = + proc modaddTest(output: ptr UncheckedArray[uint32], a, b: BigInt) {.global.} = + let M64 = getFieldModulus() # need a let variable, otherwise modulus does not have an address + # Cast the 64bit limbs of field modulus to 32bit limbs to copy + var data = cast[ptr UncheckedArray[uint32]](addr M64[0]) + ## NOTE: you cannot do `BigInt(limbs: data)`. Leads to invalid C/CUDA code. We might turn calls + ## like that into memcpy in the future. + var M = BigInt() + ## Or copy data from a runtime array + for i in 0 ..< 8: + M[i] = data[i] + ## also works of course (in which case cast is not needed) + #memcpy(addr M[0], addr M64[0], sizeof(M64)) + + # Call `modadd` and assign to result + let res = modadd(a, b, M) + for i in 0 ..< b.len: + output[i] = res[i] + + proc modsubTest(output: ptr UncheckedArray[uint32], a, b: BigInt) {.global.} = + let M64 = getFieldModulus() # need a let variable, otherwise modulus does not have an address + var M = BigInt() + ## TODO: avoid this memcopy + memcpy(addr M[0], addr M64[0], sizeof(M64)) + + # Call `modadd` and assign to result + let res = modsub(a, b, M) + for i in 0 ..< b.len: + output[i] = res[i] + + proc mtymulTest(output: ptr UncheckedArray[uint32], a, b: BigInt) {.global.} = + let M64 = getFieldModulus() # need a let variable, otherwise modulus does not have an address + var M = BigInt() + ## TODO: avoid this memcopy + memcpy(addr M[0], addr M64[0], sizeof(M64)) + + # Call `modadd` and assign to result + let res = mtymul_CIOS_sparebit(a, b, M, true) + #let res = mtymul_CIOS_concise(a, b, M, true) + for i in 0 ..< b.len: + output[i] = res[i] + + +proc getBigints(): (Fp[BN254_Snarks], Fp[BN254_Snarks]) = + # return some bigint values of a finite field + let a = Fp[BN254_Snarks].fromUInt(1'u32) + let b = Fp[BN254_Snarks].fromHex("0x2beb0d0d6115007676f30bcc462fe814bf81198848f139621a3e9fa454fe8e6a") + #let b = Fp[BN254_Snarks].fromUint(1'u32) + + result = (a, b) + +template checkOp(kernel, exp, hOut, a, b: untyped): untyped = + nvrtc.execute(kernel, (hOut), (a, b)) + + ## Compare with expected + # Get expected as array of 8 uint32 + let expU32: array[8, uint32] = cast[ptr array[8, uint32]](exp.mres.limbs[0].addr)[] + + # both arrays must match + #doAssert hOut == expU32 + # now compare as field elements + # Things to note: + # - the `modadd` `hOut` data is in Montgomery representation + # - we need to convert `array[8, uint32]` into `array[32, byte]` + # to unmarshal into a `BigInt[254]` + # - need to undo Montgomery representation on the `BigInt[254]` before + # constructing the finite field element + var res: Fp[BN254_Snarks] + var resBI: matchingBigInt(BN254_Snarks) + var hOutBytes: array[32, byte] + hOutBytes.marshal(hOut, 32, littleEndian) # convert to 32 bytes + resBI.unmarshal(hOutBytes, littleEndian) # convert bytes to BigInt[254] + type T = Fp[BN254_Snarks] + # undo Montgomery representation + resBI.fromMont(resBI, T.getModulus(), T.getNegInvModWord(), T.getSpareBits()) + res.fromBig(resBI) # convert `BigInt[254]` to finite field element + + echo "Res = ", res.toHex() + echo "Exp = ", exp.toHex() + doAssert bool(res == exp) + +proc main = + var nvrtc = initNvrtc(BigIntExample) + # echo the generated CUDA code + echo BigIntExample + + nvrtc.compile() + nvrtc.getPtx() + + var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead) + nvrtc.numBlocks = 1 + nvrtc.threadsPerBlock = 1 + + let (a, b) = getBigInts() + + ## If arguments were `prt BigInt`: + #nvrtc.execute("bigintTest", (hOut), (addr a, addr b)) + # for regular `BigInt` arguments + + echo "M0NINV::: ", Fp[BN254_Snarks].getModulus().negInvModWord() + echo "M0NINV::: ", Fp[BN254_Snarks].getModulus().negInvModWord().sizeof() + + let exp1 = a + b + checkOp("modaddTest", exp1, hOut, a, b) + let exp2 = a - b + checkOp("modsubTest", exp2, hOut, a, b) + let exp3 = a * b + checkOp("mtymulTest", exp3, hOut, a, b) + + +when isMainModule: + main() diff --git a/tests/gpu/t_nvrtc_misc_ops.nim b/tests/gpu/t_nvrtc_misc_ops.nim new file mode 100644 index 000000000..bea3943e0 --- /dev/null +++ b/tests/gpu/t_nvrtc_misc_ops.nim @@ -0,0 +1,379 @@ +import std / strformat + +import + # Internal + constantine/named/algebras, + constantine/math/io/[io_bigints, io_fields], + constantine/math/arithmetic, + constantine/math/arithmetic/bigints, + constantine/platforms/abstractions {.all.}, + constantine/platforms/abis/nvidia_abi, + constantine/math_compiler/experimental/runtime_compile, + constantine/serialization/io_limbs + +import constantine/math_compiler/experimental/nvrtc_field_ops + +proc toFp[Name: static Algebra](FF: type Fp[Name], ar: array[8, uint32]): Fp[Name] = + var resBI: matchingBigInt(BN254_Snarks) + var arBytes: array[32, byte] + arBytes.marshal(ar, 32, littleEndian) # convert to 32 bytes + resBI.unmarshal(arBytes, littleEndian) # convert bytes to BigInt[254] + # undo Montgomery representation + resBI.fromMont(resBI, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits()) + result.fromBig(resBI) # convert `BigInt[254]` to finite field element + +const N = 8 +type T = Fp[BN254_Snarks] +const WordSize = 32 +const BigIntExample* = cuda: + + defBigInt(N) + defPtxHelpers() + defCoreFieldOps(T) + + proc testPassBigInt(output: ptr BigInt) {.global.} = + # Call `modadd` and assign to result + for i in 0 ..< 8: + output[i] = i.uint32 + + proc testSetZero(output: ptr BigInt) {.global.} = + output[].setZero() + + proc testSetOne(output: ptr BigInt) {.global.} = + output[].setOne() + + proc testAdd(output: ptr BigInt, a, b: BigInt) {.global.} = + output[].add(a, b) + + proc testSub(output: ptr BigInt, a, b: BigInt) {.global.} = + output[].sub(a, b) + + proc testMul(output: ptr BigInt, a, b: BigInt) {.global.} = + output[].mul(a, b) + + proc testCcopy(output: ptr BigInt, a, b: BigInt, c: bool) {.global.} = + output[] = a + output[].ccopy(b, c) + + proc testCsetOne(output: ptr BigInt, a: BigInt, c: bool) {.global.} = + output[] = a + output[].cSetOne(c) + + proc testCsetZero(output: ptr BigInt, a, b: BigInt, c: uint32) {.global.} = + output[] = a + output[].cSetZero(c.bool) + + proc testCadd(output: ptr BigInt, a, b: BigInt, c: bool) {.global.} = + output[] = a + output[].cadd(b, c) + + proc testCsub(output: ptr BigInt, a, b: BigInt, c: bool) {.global.} = + output[] = a + output[].csub(b, c) + + proc testDouble(output: ptr BigInt, a: BigInt) {.global.} = + output[].doubleElement(a) + + proc testNsqr(output: ptr BigInt, a: BigInt, count: int) {.global.} = + output[].nsqr(a, count) + + proc testIsZero(output: ptr bool, a: BigInt) {.global.} = + output[].isZero(a) + + proc testIsOdd(output: ptr bool, a: BigInt) {.global.} = + output[].isOdd(a) + + proc testNeg(output: ptr BigInt, a: BigInt) {.global.} = + output[].neg(a) + + proc testCneg(output: ptr BigInt, a: BigInt, c: bool) {.global.} = + output[].cneg(a, c) + + proc testShiftRight(output: ptr BigInt, a: BigInt, k: uint32) {.global.} = + output[] = a + output[].shiftRight(k) + + proc testDiv2(output: ptr BigInt, a: BigInt) {.global.} = + output[] = a + output[].div2() + +from std / sequtils import mapIt +proc main = + var nvrtc = initNvrtc(BigIntExample) + # echo the generated CUDA code + echo BigIntExample + writeFile("/tmp/kernel.cu", BigIntExample) + + nvrtc.compile() + nvrtc.getPtx() + + var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead) + nvrtc.numBlocks = 1 + nvrtc.threadsPerBlock = 1 + + block PassBigInt: + hOut.reset() + nvrtc.execute("testPassBigInt", (hOut), ()) + let exp = [0'u32, 1, 2, 3, 4, 5, 6, 7] + for i in 0 ..< 8: + doAssert exp[i] == hOut[i] + + block SetZero: + # now use `setZero` to reset to zero + # `hOut` should be zero, but let's change some numbers + hOut.reset() + hOut[0] = 123 + hOut[5] = 321 + nvrtc.execute("testSetZero", (hOut), ()) + let exp = [0'u32, 0, 0, 0, 0, 0, 0, 0] + for i in 0 ..< 8: + doAssert exp[i] == hOut[i] + + block SetOne: + # and `setOne` to set to Montgomery representation of 1 + hOut.reset() + nvrtc.execute("testSetOne", (hOut), ()) + let expFp = T.fromUInt(1'u32) + doAssert bool(expFp == toFp(T, hOut)) + + block Add: + # add one and one + hOut.reset() + let inFp = T.fromUInt(1'u32) + nvrtc.execute("testAdd", (hOut), (inFp, inFp)) # inputs 1 and 1 + let expFp = T.fromUInt(2'u32) + doAssert bool(expFp == toFp(T, hOut)) + + block Sub: + hOut.reset() + let inFp = T.fromUInt(1'u32) + nvrtc.execute("testSub", (hOut), (inFp, inFp)) # inputs 1 and 1 + let expFp1 = T.fromUInt(0'u32) + doAssert bool(expFp1 == toFp(T, hOut)) + + hOut.reset() + let inFp1 = T.fromUInt(5'u32) + let inFp2 = T.fromUInt(2'u32) + nvrtc.execute("testSub", (hOut), (inFp1, inFp2)) # inputs 1 and 1 + let expFp2 = T.fromUInt(3'u32) + doAssert bool(expFp2 == toFp(T, hOut)) + + block Mul: + # mul 2 and 2 + hOut.reset() + let inFp = T.fromUInt(2'u32) + nvrtc.execute("testMul", (hOut), (inFp, inFp)) # inputs 2 and 2 + let expFp = T.fromUInt(4'u32) + doAssert bool(expFp == toFp(T, hOut)) + + block Ccopy: + # ccopy based on true (false means we do not copy b into a) + hOut.reset() + let one = T.fromUInt(1'u32) + let two = T.fromUInt(2'u32) + nvrtc.execute("testCcopy", (hOut), (one, two, false)) # inputs 1 and 2 + doAssert bool(one == toFp(T, hOut)) + # and based on false (true means we do copy b into a) + nvrtc.execute("testCcopy", (hOut), (one, two, true)) # inputs 1 and 2 + doAssert bool(two == toFp(T, hOut)) + + block CsetZero: + hOut.reset() + hOut[0] = 123 + hOut[5] = 321 + var input: array[8, uint32] + input = hOut + nvrtc.execute("testCsetZero", (hOut), (input, input, 0'u32)) + let expF = [123'u32, 0, 0, 0, 0, 321, 0, 0] + for i in 0 ..< 8: + doAssert expF[i] == hOut[i] + + nvrtc.execute("testCsetZero", (hOut), (input, 1'u32)) + let expT = [0'u32, 0, 0, 0, 0, 0, 0, 0] + for i in 0 ..< 8: + doAssert expT[i] == hOut[i] + + block CsetOne: + hOut.reset() + hOut[0] = 123 + hOut[5] = 321 + var input: array[8, uint32] + input = hOut + nvrtc.execute("testCsetOne", (hOut), (input, false)) + let expF = [123'u32, 0, 0, 0, 0, 321, 0, 0] + for i in 0 ..< 8: + doAssert expF[i] == hOut[i] + + nvrtc.execute("testCsetOne", (hOut), (input, true)) + let expT = T.fromUInt(1'u32) + doAssert bool(expT == toFp(T, hOut)) + + block Cadd: + # add one and one + hOut.reset() + let inFp = T.fromUInt(1'u32) + nvrtc.execute("testCadd", (hOut), (inFp, inFp, false)) # inputs 1 and 1 + let expF = T.fromUInt(1'u32) + doAssert bool(expF == toFp(T, hOut)) + + hOut.reset() + nvrtc.execute("testCadd", (hOut), (inFp, inFp, true)) # inputs 1 and 1 + let expT = T.fromUInt(2'u32) + doAssert bool(expT == toFp(T, hOut)) + + block Csub: + hOut.reset() + # add one and one + let inFp1 = T.fromUInt(5'u32) + let inFp2 = T.fromUInt(2'u32) + nvrtc.execute("testCsub", (hOut), (inFp1, inFp2, false)) + let expF = T.fromUInt(5'u32) + doAssert bool(expF == toFp(T, hOut)) + + hOut.reset() + nvrtc.execute("testCsub", (hOut), (inFp1, inFp2, true)) + let expT = T.fromUInt(3'u32) + doAssert bool(expT == toFp(T, hOut)) + + block DoubleElement: + # add one and one + hOut.reset() + let inFp = T.fromUInt(6'u32) + nvrtc.execute("testDouble", (hOut), (inFp)) + let exp = T.fromUInt(12'u32) + doAssert bool(exp == toFp(T, hOut)) + + block Nsqr: + # add one and one + hOut.reset() + let inFp = T.fromUInt(2'u32) + nvrtc.execute("testNsqr", (hOut), (inFp, 2)) + let exp1 = T.fromUInt(16'u32) + doAssert bool(exp1 == toFp(T, hOut)) + + hOut.reset() + nvrtc.execute("testNsqr", (hOut), (inFp, 4)) + let exp2 = T.fromUInt(65536'u32) + doAssert bool(exp2 == toFp(T, hOut)) + + block IsZero: + hOut.reset() + let inFp1 = T.fromUInt(132'u32) + var res: bool + nvrtc.execute("testIsZero", (res), (inFp1)) + doAssert res == false + + hOut.reset() + let inFp2 = T.fromUInt(0'u32) + nvrtc.execute("testIsZero", (res), (inFp2)) + doAssert res == true + + hOut.reset() + var inFp3: array[8, uint32] # zero initialized + nvrtc.execute("testIsZero", (res), (inFp3)) + doAssert res == true + + block IsOdd: + hOut.reset() + var inp: array[8, uint32] + inp[0] = 2 # even + var res: bool + nvrtc.execute("testIsOdd", (res), (inp)) + doAssert res == false + + hOut.reset() + inp[0] = 0 # even + inp[5] = 555 + nvrtc.execute("testIsOdd", (res), (inp)) + doAssert res == false + + hOut.reset() + inp[0] = 123 # odd + nvrtc.execute("testIsOdd", (res), (inp)) + doAssert res == true + + block Neg: + hOut.reset() + let inFp1 = T.fromUInt(2'u32) + nvrtc.execute("testNeg", (hOut), (inFp1)) + var exp = inFp1 + exp.neg() + doAssert bool(exp == toFp(T, hOut)) + + hOut.reset() + let inFp2 = T.fromUInt(123547'u32) + nvrtc.execute("testNeg", (hOut), (inFp2)) + exp = inFp2 + exp.neg() + doAssert bool(exp == toFp(T, hOut)) + + block CNeg: + hOut.reset() + let inFp1 = T.fromUInt(2'u32) + nvrtc.execute("testCneg", (hOut), (inFp1, false)) + doAssert bool(inFp1 == toFp(T, hOut)) + + hOut.reset() + nvrtc.execute("testCneg", (hOut), (inFp1, true)) + var exp1 = inFp1 + exp1.neg() + doAssert bool(exp1 == toFp(T, hOut)) + + hOut.reset() + let inFp2 = T.fromUInt(123547'u32) + nvrtc.execute("testCneg", (hOut), (inFp2, false)) + doAssert bool(inFp2 == toFp(T, hOut)) + + hOut.reset() + nvrtc.execute("testCneg", (hOut), (inFp2, true)) + var exp2 = inFp2 + exp2.neg() + doAssert bool(exp2 == toFp(T, hOut)) + + block ShiftRight: + #let inFp1 = T.fromUInt(8'u32) + hOut.reset() + var inp: array[8, uint32] + inp[0] = 8 + nvrtc.execute("testShiftRight", (hOut), (inp, 2)) + var exp: array[8, uint32] + exp[0] = 2 + doAssert exp == hOut + + hOut.reset() + nvrtc.execute("testShiftRight", (hOut), (inp, 3)) + exp[0] = 1 + doAssert exp == hOut + + hOut.reset() + inp[0] = 15 + nvrtc.execute("testShiftRight", (hOut), (inp, 1)) + var exp3 = matchingBigInt(T.Name).fromUInt(15'u32) + exp3.shiftRight(1) + doAssert exp3.limbs.mapIt(it.uint32) == hOut.mapIt(it.uint32) + + block Div2: + hOut.reset() + let inFp1 = T.fromUInt(8'u32) + nvrtc.execute("testDiv2", (hOut), (inFp1)) + let exp1 = T.fromUInt(4'u32) + var expAF = inFp1 + expAF.div2() + doAssert bool(exp1 == toFp(T, hOut)) + + hOut.reset() + let inFp2 = T.fromUInt(4096'u32) + nvrtc.execute("testDiv2", (hOut), (inFp2)) + let exp2 = T.fromUInt(2048'u32) + doAssert bool(exp2 == toFp(T, hOut)) + + hOut.reset() + let inFp3 = T.fromUInt(15'u32) + nvrtc.execute("testDiv2", (hOut), (inFp3)) + var exp3 = T.fromUInt(15'u32) + exp3.div2() + doAssert bool(exp3 == toFp(T, hOut)) + + +when isMainModule: + main() diff --git a/tests/gpu/t_nvrtc_pass_pointer.nim b/tests/gpu/t_nvrtc_pass_pointer.nim new file mode 100644 index 000000000..8a0e8dc57 --- /dev/null +++ b/tests/gpu/t_nvrtc_pass_pointer.nim @@ -0,0 +1,57 @@ +import std / strformat + +import + # Internal + constantine/named/algebras, + constantine/math/io/[io_bigints, io_fields], + constantine/math/arithmetic, + constantine/platforms/abstractions {.all.}, + constantine/platforms/abis/nvidia_abi, + constantine/math_compiler/experimental/runtime_compile, + constantine/serialization/io_limbs + +import constantine/math_compiler/experimental/nvrtc_field_ops + +const N = 8 +type T = Fp[BN254_Snarks] +const WordSize = 32 +const BigIntExample* = cuda: + + defBigInt(N) + defPtxHelpers() + defCoreFieldOps() + + proc testPointer(x: ptr BigInt) {.device.} = + ## just write data to the bigint + for i in 0 ..< 8: + x[][i] = i.uint32 + + proc test(output: ptr UncheckedArray[uint32]) {.global.} = + # Call `modadd` and assign to result + + var t = BigInt() + + testPointer(t.addr) + + for i in 0 ..< 8: + output[i] = t[i] + +proc main = + var nvrtc = initNvrtc(BigIntExample) + # echo the generated CUDA code + echo BigIntExample + + nvrtc.compile() + nvrtc.getPtx() + + var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead) + nvrtc.numBlocks = 1 + nvrtc.threadsPerBlock = 1 + + nvrtc.execute("test", (hOut), ()) + for i in 0 ..< 8: + echo hOut[i] + + +when isMainModule: + main() diff --git a/tests/gpu/t_nvrtc_pass_var.nim b/tests/gpu/t_nvrtc_pass_var.nim new file mode 100644 index 000000000..a7e0dcfd6 --- /dev/null +++ b/tests/gpu/t_nvrtc_pass_var.nim @@ -0,0 +1,57 @@ +import std / strformat + +import + # Internal + constantine/named/algebras, + constantine/math/io/[io_bigints, io_fields], + constantine/math/arithmetic, + constantine/platforms/abstractions {.all.}, + constantine/platforms/abis/nvidia_abi, + constantine/math_compiler/experimental/runtime_compile, + constantine/serialization/io_limbs + +import constantine/math_compiler/experimental/nvrtc_field_ops + +const N = 8 +type T = Fp[BN254_Snarks] +const WordSize = 32 +const BigIntExample* = cuda: + + defBigInt(N) + defPtxHelpers() + defCoreFieldOps() + + proc testPointer(x: var BigInt) {.device.} = + ## just write data to the bigint + for i in 0 ..< 8: + x[i] = i.uint32 + + proc test(output: ptr UncheckedArray[uint32]) {.global.} = + # Call `modadd` and assign to result + + var t = BigInt() + + testPointer(t) + + for i in 0 ..< 8: + output[i] = t[i] + +proc main = + var nvrtc = initNvrtc(BigIntExample) + # echo the generated CUDA code + echo BigIntExample + + nvrtc.compile() + nvrtc.getPtx() + + var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead) + nvrtc.numBlocks = 1 + nvrtc.threadsPerBlock = 1 + + nvrtc.execute("test", (hOut), ()) + for i in 0 ..< 8: + echo hOut[i] + + +when isMainModule: + main()