Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
6b98782
commit the initial Nim ⇒ CUDA DSL, NVRTC & CUDA execution helpers
Vindaar Nov 29, 2024
353fdb0
[tests/examples] add example for a BigInt modular addition
Vindaar Nov 29, 2024
30f26d5
remove `nimcuda` dependency, wrap everything we need manually
Vindaar Feb 18, 2025
c5d0fc7
merge `execCuda` logic for LLVM & NVRTC
Vindaar Feb 18, 2025
690cdf2
add `quitOnFailure` for the `check` calls to avoid regression
Vindaar Feb 18, 2025
b1af33d
[tests] turn big int `modadd` example into real test
Vindaar Feb 18, 2025
f015584
copy `libpaths.nim` over from nimcuda
Vindaar Feb 19, 2025
f75343a
[cuda] add partial support for `const` in CUDA generator
Vindaar Feb 20, 2025
2b16798
[cuda] fix minor issue in `if` statements in CUDA generator
Vindaar Feb 20, 2025
f91f3f2
[cuda] add support for named blocks
Vindaar Feb 20, 2025
b315729
[cuda] add support for `bool`
Vindaar Feb 20, 2025
2de8850
[cuda] remove unnecessary semicolon
Vindaar Feb 20, 2025
4fbe3ba
[cuda] add support for `{.volatile.}` variables
Vindaar Feb 20, 2025
9e49246
[nvrtc] add `modadd`, `modsub`, `mtymul` implementations using inline…
Vindaar Feb 20, 2025
f37fa53
[cuda] support basic type conversions
Vindaar Feb 21, 2025
46016a9
[cuda] support `var` parameters in procs
Vindaar Feb 21, 2025
c1a1efb
[cuda] make sure proc body is a block
Vindaar Feb 21, 2025
675284d
[cuda] support boolean / bitwise AND/OR and XOR, NOT
Vindaar Feb 21, 2025
851fcc1
[cuda] support int32 literals
Vindaar Feb 21, 2025
b264d8a
[cuda] handle prefix `not`
Vindaar Feb 21, 2025
5f5995c
[cuda] make sure to pass `array` types by pointer instead of copy
Vindaar Feb 21, 2025
fbb43ef
[nvrtc] add more helpers, add TODO to investigate `slct` calls
Vindaar Feb 21, 2025
2f68999
[nvrtc] add many more field arithmetic / bigint operations
Vindaar Feb 21, 2025
3e6a877
[tests] add test to pass by pointer and `var`
Vindaar Feb 21, 2025
9d6d745
[tests] add test case for modadd/sub/mtymul
Vindaar Feb 21, 2025
bbd63d4
[tests] add basic test cases for all new NVRTC operations
Vindaar Feb 21, 2025
f40ceee
[tests] update modadd/sub, mtymul test for new `getFieldModulus`
Vindaar Feb 21, 2025
cbc0d32
add BabyBear field
Vindaar Feb 24, 2025
d3c1cce
[nvrtc] handle `mtymul` for fields with 1 limb
Vindaar Feb 24, 2025
1691d35
[cuda] support nested array types, unpack generic instantiations
Vindaar Feb 24, 2025
92d8f69
[cuda] correctly generate ptr to array & ptr to array return types
Vindaar Feb 24, 2025
0f88289
[cuda] automatically generate `memcpy` for static array types
Vindaar Feb 25, 2025
64c9a75
[nvrtc] use `const` for field modulus and other CT constants
Vindaar Feb 26, 2025
6723373
[cuda] extend error message for non copyable inputs
Vindaar Feb 26, 2025
10e060f
[cuda] disable passStructByPointer also for CUDA
Vindaar Feb 26, 2025
9a1d441
[cuda] add `{.nimonly.}` pragma one can use in `cuda` block
Vindaar Feb 26, 2025
027a1f5
[cuda] better logic for detection of type names
Vindaar Feb 26, 2025
5a50d84
[cuda] support `const` by mapping it to a `__constant__`
Vindaar Feb 26, 2025
169a5ad
[cuda] allow type determination from array literal
Vindaar Feb 26, 2025
7d7c61e
[nvrtc] get rid of complexity with custom uint32 constants
Vindaar Feb 26, 2025
d9817b9
[staticFor] add stepped variant of `staticFor`
Vindaar Feb 26, 2025
93242b2
[cuda] support func, discard and command nnkCommand
Vindaar Feb 27, 2025
52cd8cc
[cuda] better handle required semicolons
Vindaar Feb 27, 2025
1249bbc
[cuda] extract type from `getType` for execution helper
Vindaar Mar 3, 2025
6c9faf7
[cuda] special case `CUdeviceptr` as a type that *must not* be copied
Vindaar Mar 3, 2025
a0d15d6
[cuda] allow passing in shared memory size for a kernel
Vindaar Mar 3, 2025
843b497
[nvidia ABI] wrap cuModuleGetGlobal, cudaMemcpyKind and a couple more
Vindaar Mar 3, 2025
995b009
[cuda] support while loops
Vindaar Mar 3, 2025
f045af8
[cuda] support void pointers and `nil` literals
Vindaar Mar 3, 2025
952d1b8
[cuda] refactor out module loading from execution
Vindaar Mar 3, 2025
39de985
[cuda] store PTX before echoing it
Vindaar Mar 3, 2025
69621cc
[cuda] add `copyToSymbol` helper to copy to constant symbol in CUDA code
Vindaar Mar 3, 2025
0fafdcb
[cuda] generalize `volatile` annotation to support other pragmas
Vindaar Mar 3, 2025
e8a30a9
[cuda] `cudaName` pragma for custom name for a proc, eg __syncthreads
Vindaar Mar 3, 2025
b1db01a
[cuda] support float literals
Vindaar Mar 3, 2025
f5a97ca
[cuda] map arrays of explicit length 0 to `[]` arrays in CUDA
Vindaar Mar 3, 2025
cafe8fe
[cuda] explicitly support constants, mapped to `__constant__`
Vindaar Mar 3, 2025
d30ade3
[cuda] minor cleanup
Vindaar Mar 3, 2025
5604027
[cuda] add `gridDim`, `cuExtern` and `share` + device malloc/free
Vindaar Mar 3, 2025
ee46b36
force compilation with `-d:CTT_32` for the moment
Vindaar Mar 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
315 changes: 26 additions & 289 deletions constantine/math_compiler/codegen_nvidia.nim
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,6 @@ export
# Cuda Driver API
# ------------------------------------------------------------

template check*(status: CUresult, quitOnFailure = true) =
  ## Verify the result code of a CUDA driver call.
  ##
  ## If the code differs from `CUDA_SUCCESS`, the current stack trace and a
  ## diagnostic line (checked expression, instantiation site, result code)
  ## are written out. Afterwards the program exits with status 1, unless
  ## `quitOnFailure` is `false`.

  # Bind the argument once: `status` may be an arbitrary expression and must
  # not be evaluated a second time.
  let code = status

  if code != CUDA_SUCCESS:
    writeStackTrace()
    let msg = astToStr(status) & " " & $instantiationInfo() &
              " exited with error: " & $code & '\n'
    stderr.write(msg)
    if quitOnFailure:
      # NOTE: quitting here hides exceptions if they are thrown!
      quit(1)

func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.} =
  ## Convenience overload taking the module source as an `openArray[char]`.
  ## It forwards the address of the first character to the `cuModuleLoadData`
  ## overload that takes an address.
  ## `sourceCode` must not be empty, because element 0 is accessed.
  result = cuModuleLoadData(module, unsafeAddr sourceCode[0])
func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.}=
Expand Down Expand Up @@ -221,277 +209,6 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =
check cuMemFree(aGPU)
check cuMemFree(bGPU)

proc getTypes(n: NimNode): seq[NimNode] =
  ## Collects the type instance of every argument found in `n`.
  ##
  ## - identifiers, symbols and literals contribute their own type,
  ## - brackets, tuple constructors and parentheses are flattened recursively,
  ## - any other node is accepted only if its type is a `ptr` type;
  ##   otherwise a compile time error is raised.
  if n.kind in {nnkIdent, nnkSym} + nnkLiterals:
    result.add getTypeInst(n)
  elif n.kind in {nnkBracket, nnkTupleConstr, nnkPar}:
    for child in n:
      result.add getTypes(child)
  elif n.typeKind == ntyPtr:
    result.add getTypeInst(n)
  else:
    error("Arguments to `execCuda` must be given as a bracket, tuple or typed expression. Instead: " & $n.treerepr)

proc requiresCopy(n: NimNode): bool =
  ## `true` if the type of `n` is not a trivial data type, i.e. its value
  ## has to be copied manually.
  ##
  ## Trivial types are `bool`, `char` and everything in the type kind range
  ## `ntyInt .. ntyUint64` (that range also contains all float kinds).
  n.typeKind notin {ntyBool, ntyChar, ntyInt .. ntyUint64}

proc allowsCopy(n: NimNode): bool =
  ## Returns `true` if the given node is allowed to be copied. That is the
  ## case if it either `requiresCopy` or is a `var` symbol.
  ##
  ## `symKind` is only defined for symbol nodes and aborts compilation with
  ## "node is not a symbol" for anything else (literals, dot expressions, ...).
  ## The node kind is therefore checked first, so that such arguments simply
  ## yield `false` and the caller can report a descriptive error.
  result = n.requiresCopy or (n.kind == nnkSym and n.symKind == nskVar)

proc getIdent(n: NimNode): NimNode =
  ## Builds the identifier with a `GPU` suffix that belongs to `n`.
  ##
  ## The name has to be deterministic, because `getIdent` is called for the
  ## same symbol in multiple places. Identifiers and symbols use their name;
  ## any other node uses its `repr` wrapped in backticks.
  let base =
    if n.kind in {nnkIdent, nnkSym}: n.strVal
    else: "`" & n.repr & "`"
  ident(base & "GPU")

proc determineDevicePtrs(r, i: NimNode, iTypes: seq[NimNode]): seq[(NimNode, NimNode)] =
  ## Returns pairs of (device pointer identifier, original argument).
  ##
  ## Every element of `r` (the results) gets an entry and must allow copying;
  ## otherwise a compile time error is raised. An element of `i` (the inputs)
  ## only gets an entry if its type in `iTypes` requires a copy.
  for resArg in r:
    if not resArg.allowsCopy:
      error("The argument for `res`: " & $resArg.repr & " of type: " & $resArg.getTypeImpl().treerepr &
        " does not allow copying. Copying to the address of all result variables is required.")
    result.add (getIdent(resArg), resArg)
  for idx, input in i:
    if iTypes[idx].requiresCopy():
      result.add (getIdent(input), input)

proc assembleParams(r, i: NimNode, iTypes: seq[NimNode]): seq[NimNode] =
  ## Returns the kernel parameters in order: first all of `r`, then all of `i`.
  ##
  ## Results are always represented by their `GPU` identifier. Inputs use the
  ## `GPU` identifier only if their type requires a copy, otherwise the input
  ## node itself is used.
  for resArg in r:
    result.add getIdent(resArg)
  for idx, input in i:
    result.add(
      if iTypes[idx].requiresCopy(): getIdent(input)
      else: input
    )

proc sizeArg(n: NimNode): NimNode =
  ## Yields the node that is handed to `sizeof` for the data being copied.
  ## For an argument of type `ptr T` that is `T` (the pointee), not the
  ## pointer itself; any other argument is returned unchanged.
  if n.typeKind == ntyPtr:
    n.getTypeInst()[0]
  else:
    n

# Small AST constructors. Each one wraps its argument in a call node of the
# correspondingly named routine, e.g. `address(x)` builds the AST of `addr(x)`.
template check(arg): untyped = newCall(ident"check", arg)
template size(arg): untyped = newCall(ident"sizeof", sizeArg arg)
template address(arg): untyped = newCall(ident"addr", arg)
template csize_t(arg): untyped = newCall(ident"csize_t", arg)
template pointer(arg): untyped = newCall(ident"pointer", arg)

proc maybeAddress(n: NimNode): NimNode =
  ## Yields an expression for the address of `n`. A node whose type is
  ## already a `ptr` type is returned as is, everything else is wrapped
  ## in an `addr` call.
  if n.typeKind == ntyPtr: n
  else: address(n)

proc genParams(pId, r, i: NimNode, iTypes: seq[NimNode]): NimNode =
  ## Generates `let <pId> = [pointer(addr p0), pointer(addr p1), ...]`, i.e.
  ## the array of parameter addresses, one entry per assembled parameter.
  let ptrs = nnkBracket.newTree()
  for p in assembleParams(r, i, iTypes):
    ptrs.add pointer(maybeAddress p)
  result = newLetStmt(pId, ptrs)

proc genVar(n: NimNode): (NimNode, NimNode) =
  ## Creates `let tmp: <type of n> = n` with a fresh symbol and returns
  ## the symbol together with the generated let section.
  let sym = genSym(nskLet, "tmp")
  let section = nnkLetSection.newTree(
    nnkIdentDefs.newTree(sym, getTypeInst(n), n)
  )
  (sym, section)

proc genLocalVars(inputs: NimNode): (NimNode, NimNode) =
  ## Splits `inputs` into
  ## 1. a statement list declaring a local `let` for every literal, const
  ##    definition and `true` / `false` symbol, and
  ## 2. a bracket with the parameters to use: the generated local for those
  ##    elements, the untouched node for everything else.
  let decls = newStmtList()
  let params = nnkBracket.newTree()
  for arg in inputs:
    let isBoolLit = arg.kind == nnkSym and arg.strVal in ["true", "false"]
    if arg.kind in (nnkLiterals + {nnkConstDef}) or isBoolLit:
      # define a local holding the value
      let (tmpSym, tmpDecl) = genVar(arg)
      decls.add tmpDecl
      params.add tmpSym
    else:
      params.add arg # keep the node as is
  result = (decls, params)

proc maybeWrap(n: NimNode): NimNode =
  ## Brackets and tuple constructors are returned unchanged; any other node
  ## is wrapped into a one-element bracket.
  if n.kind in {nnkBracket, nnkTupleConstr}: n
  else: nnkBracket.newTree(n)

proc endianCheck(): NimNode =
  ## Generates a `static` assertion that the host CPU is little-endian.
  ##
  ## Rationale (from https://developer.nvidia.com/cuda-downloads?target_os=Linux):
  ## the architectures supported by Cuda are x86-64, PowerPC 64 little-endian
  ## and ARM64 (aarch64), which are all little-endian at word-level.
  ## As limbs are also stored in little-endian, a little-endian host has the
  ## same binary representation as the GPU, whether 32-bit or 64-bit words
  ## are used, so a naive memcpy can be used for parameter passing.
  quote do:
    static: doAssert cpuEndian == littleEndian, block:
      "Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" &
      "Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it."

proc execCudaImpl(jitFn, res, inputs: NimNode): NimNode =
  ## Builds the code that runs the kernel `jitFn`. The generated block, in order:
  ##
  ## 1. asserts at compile time that the host is little-endian,
  ## 2. declares a `CUdeviceptr` per argument that needs one, allocates it
  ##    and copies the host data to it,
  ## 3. declares locals for literal inputs,
  ## 4. declares the `params` array of parameter addresses,
  ## 5. launches the kernel with a 1x1x1 grid, 1x1x1 block and no shared memory,
  ## 6. copies all `res` arguments back to the host,
  ## 7. frees every device pointer.

  # Arguments given individually are treated like a one-element bracket, e.g.
  # `execCuda(res = foo, inputs = bar)`
  let resArgs = maybeWrap res
  let inArgs = maybeWrap inputs

  let body = newStmtList()
  body.add endianCheck()

  # `getTypes` rejects unsupported argument nodes. For `res` only this
  # validation matters, the resulting types are not used.
  discard getTypes(resArgs)
  let inTypes = getTypes(inArgs)

  # all arguments that need a `CUdeviceptr`
  let devPtrs = determineDevicePtrs(resArgs, inArgs, inTypes)

  # declare the device pointers, allocate memory and copy the data over
  for entry in devPtrs:
    let devId = entry[0]
    let hostArg = entry[1]
    # `var rGPU: CUdeviceptr`
    body.add nnkVarSection.newTree(newIdentDefs(devId, ident"CUdeviceptr"))
    # `check cuMemAlloc(rGPU, csize_t sizeof(r))`
    body.add check(newCall(ident"cuMemAlloc", devId, csize_t(size(hostArg))))
    # `check cuMemcpyHtoD(aGPU, a.addr, csize_t sizeof(a))`
    body.add check(newCall(ident"cuMemcpyHtoD",
                           devId,
                           maybeAddress(hostArg),
                           csize_t(size(hostArg))))

  # locals for literal inputs
  let (localDecls, launchArgs) = genLocalVars(inArgs)
  body.add localDecls

  # the array of parameter addresses
  let pId = ident"params"
  body.add genParams(pId, resArgs, launchArgs, inTypes)

  # launch the kernel
  body.add quote do:
    check cuLaunchKernel(
      `jitFn`,
      1, 1, 1, # grid(x, y, z)
      1, 1, 1, # block(x, y, z)
      sharedMemBytes = 0,
      CUstream(nil),
      `pId`[0].unsafeAddr, nil)

  # copy back the results (only the `res` arguments)
  for entry in determineDevicePtrs(resArgs, nnkBracket.newTree(), @[]):
    body.add check(newCall(ident"cuMemcpyDtoH",
                           maybeAddress(entry[1]),
                           entry[0],
                           csize_t(size(entry[1]))))

  # free the device memory
  for entry in devPtrs:
    body.add check(newCall(ident"cuMemFree", entry[0]))

  # wrap everything in a block, so the generated names stay local
  result = quote do:
    block:
      `body`

macro execCuda*(jitFn: CUfunction,
                res: typed,
                inputs: typed): untyped =
  ## Executes the CUDA kernel `jitFn`. All non trivial data types are copied
  ## to the GPU via `cuMemcpyHtoD`. Every argument given as `res` is copied
  ## back from the GPU once the kernel has finished.
  ##
  ## IMPORTANT:
  ## The kernel receives its arguments in the order in which they are given
  ## to the macro. In particular, the `res` arguments are passed first.
  ##
  ## Example:
  ## ```nim
  ## execCuda(fn, res = [r, s], inputs = [a, b, c]) # if all arguments have the same type
  ## # or
  ## execCuda(fn, res = (r, s), inputs = (a, b, c)) # if different types
  ## ```
  ## passes the parameters as `[r, s, a, b, c]`.
  ##
  ## More examples can be found in the test case
  ## `tests/gpu/t_exec_literals_consts.nim`.
  ##
  ## No checks are performed on whether the given types are valid as arguments
  ## to the CUDA target! All arguments given as `res` are expected to be
  ## copied. To return a value of a simple data type, use a `ptr X` type.
  ## It is also allowed to pass a `var` symbol as a `res` argument, in which
  ## case the result is copied to its memory location automatically.
  ##
  ## The `res` data is copied to the GPU as well, so a return value can also
  ## serve as an input.
  ##
  ## NOTE: This macro is mainly intended for convenient execution of a single kernel
  execCudaImpl(jitFn, res, inputs)

macro execCuda*(jitFn: CUfunction,
                res: typed): untyped =
  ## Overload of `execCuda` for kernels without `inputs`: an empty bracket
  ## is passed in their place.
  let noInputs = nnkBracket.newTree()
  execCudaImpl(jitFn, res, noInputs)

# ############################################################
#
# Compilation helper
Expand All @@ -516,6 +233,12 @@ type

NvidiaAssembler* = ref NvidiaAssemblerObj

## We define a distinct version of the `CUfunction` type to differentiate
## producing a kernel via the LLVM backend from the more direct approach
## using NVRTC. This is because the data passing for field elements
## is more complicated on the LLVM side (requires a manual copy).
CUfunctionLLVM* = distinct CUfunction

proc `=destroy`*(nv: NvidiaAssemblerObj) =
## XXX: Need to also call the finalizer for `asy` in the future!
# NOTE: In the destructor we don't want to quit on a `check` failure.
Expand Down Expand Up @@ -592,7 +315,7 @@ proc initNvAsm*[Name: static Algebra](field: type EC_ShortW_Jac[Fp[Name], G1], w
result.fd = result.cd.fd
result.asy.definePrimitives(result.cd)

proc compile*(nv: NvidiaAssembler, kernName: string): CUfunction =
proc compile*(nv: NvidiaAssembler, kernName: string): CUfunctionLLVM =
## Overload of `compile` below.
## Call this version if you have manually used the Assembler_LLVM object
## to build instructions and have a kernel name you wish to compile.
Expand All @@ -617,18 +340,32 @@ proc compile*(nv: NvidiaAssembler, kernName: string): CUfunction =
check cuModuleLoadData(nv.cuMod, ptx)
# will be cleaned up when `NvidiaAssembler` goes out of scope

result = nv.cuMod.getCudaKernel(kernName)
result = CUfunctionLLVM(nv.cuMod.getCudaKernel(kernName))

proc compile*(nv: NvidiaAssembler, fn: FieldFnGenerator): CUfunction =
proc compile*(nv: NvidiaAssembler, fn: FieldFnGenerator): CUfunctionLLVM =
## Given a function that generates code for a finite field operation, compile
## that function on the given Nvidia target and return a CUDA function.
# execute the `fn`
let kernName = nv.asy.fn(nv.fd)
result = nv.compile(kernName)
result = CUfunctionLLVM(nv.compile(kernName))

proc compile*(nv: NvidiaAssembler, fn: CurveFnGenerator): CUfunction =
proc compile*(nv: NvidiaAssembler, fn: CurveFnGenerator): CUfunctionLLVM =
## Given a function that generates code for an elliptic curve operation, compile
## that function on the given Nvidia target and return a CUDA function.
# execute the `fn`
let kernName = nv.asy.fn(nv.cd)
result = nv.compile(kernName)
result = CUfunctionLLVM(nv.compile(kernName))

import ./experimental/cuda_execute_dsl
macro execCuda*(jitFn: CUfunctionLLVM,
                res: typed,
                inputs: typed): untyped =
  ## Overload of `execCuda` for kernels produced by the LLVM backend. See
  ## `execCuda` in `constantine/math_compiler/experimental/cuda_execute_dsl.nim`
  ## for an explanation.
  ##
  ## This overload disallows passing simple structs via their pointer and
  ## always copies them instead. That is required due to our type definitions
  ## for finite field elements and elliptic curve points on the LLVM target.
  # NOTE(review): the two `newLit(1)` arguments are presumably the launch
  # dimensions expected by `execCudaImpl` - confirm against its definition.
  result = execCudaImpl(jitFn, newLit(1), newLit(1), res, inputs,
                        passStructByPointer = false)
Loading
Loading