diff --git a/constantine/math_compiler/codegen_nvidia.nim b/constantine/math_compiler/codegen_nvidia.nim
index dbce048a3..da1034e75 100644
--- a/constantine/math_compiler/codegen_nvidia.nim
+++ b/constantine/math_compiler/codegen_nvidia.nim
@@ -71,18 +71,6 @@ export
 # Cuda Driver API
 # ------------------------------------------------------------
 
-template check*(status: CUresult, quitOnFailure = true) =
-  ## Check the status code of a CUDA operation
-  ## Exit program with error if failure
-
-  let code = status # ensure that the input expression is evaluated once only
-
-  if code != CUDA_SUCCESS:
-    writeStackTrace()
-    stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
-    if quitOnFailure:
-      quit 1 # NOTE: this hides exceptions if they are thrown!
-
 func cuModuleLoadData*(module: var CUmodule, sourceCode: openArray[char]): CUresult {.inline.}=
   cuModuleLoadData(module, sourceCode[0].unsafeAddr)
 func cuModuleGetFunction*(kernel: var CUfunction, module: CUmodule, fnName: openArray[char]): CUresult {.inline.}=
@@ -221,277 +209,6 @@ proc exec*[T](jitFn: CUfunction, r: var T, a, b: T) =
   check cuMemFree(aGPU)
   check cuMemFree(bGPU)
 
-proc getTypes(n: NimNode): seq[NimNode] =
-  case n.kind
-  of nnkIdent, nnkSym: result.add getTypeInst(n)
-  of nnkLiterals: result.add getTypeInst(n)
-  of nnkBracket, nnkTupleConstr, nnkPar:
-    for el in n:
-      result.add getTypes(el)
-  else:
-    case n.typeKind
-    of ntyPtr: result.add getTypeInst(n)
-    else:
-      error("Arguments to `execCuda` must be given as a bracket, tuple or typed expression. Instead: " & $n.treerepr)
-
-proc requiresCopy(n: NimNode): bool =
-  ## Returns `true` if the given type is not a trivial data type, which implies
-  ## it will require copying its value manually.
-  case n.typeKind
-  of ntyBool, ntyChar, ntyInt .. ntyUint64: # range includes all floats
-    result = false
-  else:
-    result = true
-
-proc allowsCopy(n: NimNode): bool =
-  ## Returns `true` if the given type is allowed to be copied. That means it is
-  ## either `requiresCopy` or a `var` symbol.
-  result = n.requiresCopy or n.symKind == nskVar
-
-proc getIdent(n: NimNode): NimNode =
-  ## Generate a `GPU` suffixed ident
-  # Note: We want a deterministic name, because we call `getIdent` for the same symbol
-  # in multiple places atm.
-  case n.kind
-  of nnkIdent, nnkSym: result = ident(n.strVal & "GPU")
-  else: result = ident("`" & n.repr & "`GPU")
-
-proc determineDevicePtrs(r, i: NimNode, iTypes: seq[NimNode]): seq[(NimNode, NimNode)] =
-  ## Returns the device pointer ident and its associated original symbol.
-  for el in r:
-    if not el.allowsCopy:
-      error("The argument for `res`: " & $el.repr & " of type: " & $el.getTypeImpl().treerepr &
-        " does not allow copying. Copying to the address of all result variables is required.")
-    result.add (getIdent(el), el)
-  for idx in 0 ..< i.len:
-    let input = i[idx]
-    let t = iTypes[idx]
-    if t.requiresCopy():
-      result.add (getIdent(input), input)
-
-proc assembleParams(r, i: NimNode, iTypes: seq[NimNode]): seq[NimNode] =
-  ## Returns all parameters. Depending on whether they require copies or
-  ## are `res` parameters, either the input parameter or the `GPU` parameter.
-  for el in r: # for `res` we always copy!
-    result.add getIdent(el)
-  for idx in 0 ..< i.len:
-    let input = i[idx]
-    let t = iTypes[idx]
-    if t.requiresCopy():
-      result.add getIdent(input)
-    else:
-      result.add input
-
-proc sizeArg(n: NimNode): NimNode =
-  ## The argument to `sizeof` must be the size of the data we copy. If the
-  ## input type is already given as a `ptr T` type, we need the size of
-  ## `T` and not `ptr`.
-  case n.typeKind
-  of ntyPtr: result = n.getTypeInst()[0]
-  else: result = n
-
-# little helper macro constructors
-template check(arg): untyped = nnkCall.newTree(ident"check", arg)
-template size(arg): untyped = nnkCall.newTree(ident"sizeof", sizeArg arg)
-template address(arg): untyped = nnkCall.newTree(ident"addr", arg)
-template csize_t(arg): untyped = nnkCall.newTree(ident"csize_t", arg)
-template pointer(arg): untyped = nnkCall.newTree(ident"pointer", arg)
-
-proc maybeAddress(n: NimNode): NimNode =
-  ## Returns the address of the given node, *IFF* the type is not a
-  ## pointer type already
-  case n.typeKind
-  of ntyPtr: result = n
-  else: result = address(n)
-
-proc genParams(pId, r, i: NimNode, iTypes: seq[NimNode]): NimNode =
-  ## Generates the parameter `params` variable
-  let ps = assembleParams(r, i, iTypes)
-  result = nnkBracket.newTree()
-  for p in ps:
-    result.add pointer(maybeAddress p)
-  result = nnkLetSection.newTree(
-    nnkIdentDefs.newTree(pId, newEmptyNode(), result)
-  )
-
-proc genVar(n: NimNode): (NimNode, NimNode) =
-  ## Generates a let `tmp` variable and returns its identifier and
-  ## the let section.
-  result[0] = genSym(nskLet, "tmp")
-  result[1] = nnkLetSection.newTree(
-    nnkIdentDefs.newTree(
-      result[0],
-      getTypeInst(n),
-      n
-    )
-  )
-
-proc genLocalVars(inputs: NimNode): (NimNode, NimNode) =
-  result[0] = newStmtList() # defines local vars
-  result[1] = nnkBracket.newTree() # returns new bracket of vars for parameters
-  for el in inputs:
-    case el.kind
-    of nnkLiterals, nnkConstDef: # define a local with the value of it
-      let (s, v) = genVar(el)
-      result[0].add v
-      result[1].add s
-    of nnkSym:
-      if el.strVal in ["true", "false"]:
-        let (s, v) = genVar(el)
-        result[0].add v
-        result[1].add s
-      else:
-        result[1].add el # keep symbol
-    else:
-      result[1].add el # keep symbol
-
-proc maybeWrap(n: NimNode): NimNode =
-  if n.kind notin {nnkBracket, nnkTupleConstr}:
-    result = nnkBracket.newTree(n)
-  else:
-    result = n
-
-proc endianCheck(): NimNode =
-  result = quote do:
-    static: doAssert cpuEndian == littleEndian, block:
-      # From https://developer.nvidia.com/cuda-downloads?target_os=Linux
-      # Supported architectures for Cuda are:
-      # x86-64, PowerPC 64 little-endian, ARM64 (aarch64)
-      # which are all little-endian at word-level.
-      #
-      # Due to limbs being also stored in little-endian, on little-endian host
-      # the CPU and GPU will have the same binary representation
-      # whether we use 32-bit or 64-bit words, so naive memcpy can be used for parameter passing.
-
-      "Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" &
-      "Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it."
-
-proc execCudaImpl(jitFn, res, inputs: NimNode): NimNode =
-  # Maybe wrap individually given arguments in a `[]` bracket, e.g.
-  # `execCuda(res = foo, inputs = bar)`
-  let res = maybeWrap res
-  let inputs = maybeWrap inputs
-
-  result = newStmtList()
-  result.add endianCheck()
-
-  # get the types of the inputs
-  let rTypes = getTypes(res)
-  let iTypes = getTypes(inputs)
-
-  # determine all required `CUdeviceptr`
-  let devPtrs = determineDevicePtrs(res, inputs, iTypes)
-
-  # generate device pointers, allocate memory and copy data
-  for x in devPtrs:
-    # `var rGPU: CUdeviceptr`
-    result.add nnkVarSection.newTree(
-      nnkIdentDefs.newTree(
-        x[0],
-        ident"CUdeviceptr",
-        newEmptyNode()
-      )
-    )
-
-    # `check cuMemAlloc(rGPU, csize_t sizeof(r))`
-    result.add(
-      check nnkCall.newTree(
-        ident"cuMemAlloc",
-        x[0],
-        csize_t size(x[1])
-      )
-    )
-    # `check cuMemcpyHtoD(aGPU, a.addr, csize_t sizeof(a))`
-    result.add(
-      check nnkCall.newTree(
-        ident"cuMemcpyHtoD",
-        x[0],
-        maybeAddress x[1],
-        csize_t size(x[1])
-      )
-    )
-
-  # Generate local variables
-  let (decl, vars) = genLocalVars(inputs)
-  result.add decl
-
-  # assemble the parameters
-  let pId = ident"params"
-  let params = genParams(pId, res, vars, iTypes)
-  result.add params
-
-  # launch the kernel
-  result.add quote do:
-    check cuLaunchKernel(
-            `jitFn`,
-            1, 1, 1, # grid(x, y, z)
-            1, 1, 1, # block(x, y, z)
-            sharedMemBytes = 0,
-            CUstream(nil),
-      `pId`[0].unsafeAddr, nil)
-
-  # copy back results
-  let devPtrsRes = determineDevicePtrs(res, nnkBracket.newTree(), @[])
-  for x in devPtrsRes:
-    result.add(
-      check nnkCall.newTree(
-        ident"cuMemcpyDtoH",
-        maybeAddress x[1],
-        x[0],
-        csize_t size(x[1])
-      )
-    )
-
-  # free memory
-  for x in devPtrs:
-    result.add(
-      check nnkCall.newTree(
-        ident"cuMemFree",
-        x[0]
-      )
-    )
-  result = quote do:
-    block:
-      `result`
-
-macro execCuda*(jitFn: CUfunction,
-                res: typed,
-                inputs: typed): untyped =
-  ## Given a CUDA function, execute the kernel. Copies all non trivial data types to
-  ## to the GPU via `cuMemcpyHtoD`. Any argument given as `res` will be copied back
-  ## from the GPU after kernel execution finishes.
-  ##
-  ## IMPORTANT:
-  ## The arguments passed to the CUDA kernel will be in the order in which they are
-  ## given to the macro. This especially means `res` arguments will be passed first.
-  ##
-  ## Example:
-  ## ```nim
-  ## execCuda(fn, res = [r, s], inputs = [a, b, c]) # if all arguments have the same type
-  ## # or
-  ## execCuda(fn, res = (r, s), inputs = (a, b, c)) # if different types
-  ## ```
-  ## will pass the parameters as `[r, s, a, b, c]`.
-  ##
-  ## For more examples see the test case `tests/gpu/t_exec_literals_consts.nim`.
-  ##
-  ## We do not perform any checks on whether the given types are valid as arguments to
-  ## the CUDA target! Also, all arguments given as `res` are expected to be copied.
-  ## To return a value for a simple data type, use a `ptr X` type. However, it is allowed
-  ## to simply pass a `var` symbol as a `res` argument. We automatically copy to the
-  ## the memory location.
-  ##
-  ## We also copy all `res` data to the GPU, so that a return value can also be used
-  ## as an input.
-  ##
-  ## NOTE: This function is mainly intended for convenient execution of a single kernel
-  result = execCudaImpl(jitFn, res, inputs)
-
-macro execCuda*(jitFn: CUfunction,
-                res: typed): untyped =
-  ## Overload of the above for empty `inputs`
-  result = execCudaImpl(jitFn, res, nnkBracket.newTree())
-
 # ############################################################
 #
 #                   Compilation helper
@@ -516,6 +233,12 @@ type
 
   NvidiaAssembler* = ref NvidiaAssemblerObj
 
+  ## We define a distinct version of the `CUfunction` type to differentiate
+  ## producing a kernel via the LLVM backend from the more direct approach
+  ## using NVRTC. This is because the data passing for field elements
+  ## is more complicated on the LLVM side (requires a manual copy).
+  CUfunctionLLVM* = distinct CUfunction
+
 proc `=destroy`*(nv: NvidiaAssemblerObj) =
   ## XXX: Need to also call the finalizer for `asy` in the future!
   # NOTE: In the destructor we don't want to quit on a `check` failure.
@@ -592,7 +315,7 @@ proc initNvAsm*[Name: static Algebra](field: type EC_ShortW_Jac[Fp[Name], G1], w
   result.fd = result.cd.fd
   result.asy.definePrimitives(result.cd)
 
-proc compile*(nv: NvidiaAssembler, kernName: string): CUfunction =
+proc compile*(nv: NvidiaAssembler, kernName: string): CUfunctionLLVM =
   ## Overload of `compile` below.
   ## Call this version if you have manually used the Assembler_LLVM object
   ## to build instructions and have a kernel name you wish to compile.
@@ -617,18 +340,32 @@ proc compile*(nv: NvidiaAssembler, kernName: string): CUfunction =
   check cuModuleLoadData(nv.cuMod, ptx)
   # will be cleaned up when `NvidiaAssembler` goes out of scope
 
-  result = nv.cuMod.getCudaKernel(kernName)
+  result = CUfunctionLLVM(nv.cuMod.getCudaKernel(kernName))
 
-proc compile*(nv: NvidiaAssembler, fn: FieldFnGenerator): CUfunction =
+proc compile*(nv: NvidiaAssembler, fn: FieldFnGenerator): CUfunctionLLVM =
   ## Given a function that generates code for a finite field operation, compile
   ## that function on the given Nvidia target and return a CUDA function.
   # execute the `fn`
   let kernName = nv.asy.fn(nv.fd)
-  result = nv.compile(kernName)
+  result = CUfunctionLLVM(nv.compile(kernName))
 
-proc compile*(nv: NvidiaAssembler, fn: CurveFnGenerator): CUfunction =
+proc compile*(nv: NvidiaAssembler, fn: CurveFnGenerator): CUfunctionLLVM =
   ## Given a function that generates code for an elliptic curve operation, compile
   ## that function on the given Nvidia target and return a CUDA function.
   # execute the `fn`
   let kernName = nv.asy.fn(nv.cd)
-  result = nv.compile(kernName)
+  result = CUfunctionLLVM(nv.compile(kernName))
+
+import ./experimental/cuda_execute_dsl
+macro execCuda*(jitFn: CUfunctionLLVM,
+                res: typed,
+                inputs: typed): untyped =
+  ## See `execCuda` in `constantine/math_compiler/experimental/cuda_execute_dsl.nim`
+  ## for an explanation.
+  ##
+  ## This LLVM overload makes sure we disallow passing simple structs
+  ## via their pointer and instead always copy them (required due to our
+  ## type definitions for finite field elements and elliptic curve points
+  ## on the LLVM target).
+  execCudaImpl(jitFn, newLit 1, newLit 1, res, inputs,
+               passStructByPointer = false)
diff --git a/constantine/math_compiler/experimental/cuda_execute_dsl.nim b/constantine/math_compiler/experimental/cuda_execute_dsl.nim
new file mode 100644
index 000000000..d6618b386
--- /dev/null
+++ b/constantine/math_compiler/experimental/cuda_execute_dsl.nim
@@ -0,0 +1,356 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import constantine/platforms/abis/nvidia_abi
+import std/macros
+from std / strutils import normalize
+
+proc getTypes(n: NimNode): seq[NimNode] =
+  case n.kind
+  of nnkIdent, nnkSym: result.add getTypeInst(n)
+  of nnkLiterals: result.add getTypeInst(n)
+  of nnkBracket, nnkTupleConstr, nnkPar:
+    for el in n:
+      result.add getTypes(el)
+  of nnkCall:
+    result.add getType(n)
+  else:
+    case n.typeKind
+    of ntyPtr: result.add getTypeInst(n)
+    else:
+      error("Arguments to `execCuda` must be given as a bracket, tuple or typed expression. Instead: " & $n.treerepr)
+
+proc requiresCopy(n: NimNode, passStructByPointer: bool): bool =
+  ## Returns `true` if the given type is not a trivial data type, which implies
+  ## it will require copying its value manually.
+  ##
+  ## WARNING: For the moment we determine if something needs to be copied especially
+  ## based on whether it is an object or ref type. That means *DO NOT* nest ref
+  ## types in your objects. They *WILL NOT* be deep copied!
+  ##
+  ## If `passStructByPointer` is `true` we do *not* copy trivial struct types, e.g. a big int
+  ## or finite field element. If it is false, we always copy for those. The distinction
+  ## is needed, because for the CUDA target via LLVM, the array type definitions cause
+  ## `cudaErrorIllegalAddress` if we directly pass the host pointer of the struct.
+  case n.typeKind
+  of ntyBool, ntyChar, ntyInt .. ntyUint64: # range includes all floats
+    result = false
+  of ntyObject, ntyArray:
+    if passStructByPointer:
+      result = false # regular objects can just be copied!
+    else:
+      result = true # struct passing by pointer forbidden
+    ## NOTE: strictly speaking this is not the case of course! If the object
+    ## contains refs, it won't hold!
+  of ntyGenericInst:
+    if passStructByPointer:
+      let impl = n.getTypeImpl()
+      result = impl.kind == nnkRefTy # if a ref, needs to be copied
+    else:
+      result = true # for now assume it needs to be copied
+  of ntyDistinct:
+    let impl = n.getTypeInst()
+    if impl.kind in [nnkIdent, nnkSym] and impl.strVal.normalize == "cudeviceptr":
+      result = false
+    else:
+      result = true
+  else:
+    result = true
+
+proc allowsCopy(n: NimNode, passStructByPointer: bool): bool =
+  ## Returns `true` if the given type is allowed to be copied. That means it is
+  ## either `requiresCopy` or a `var` symbol.
+  result = n.requiresCopy(passStructByPointer) or n.symKind == nskVar
+
+proc getIdent(n: NimNode): NimNode =
+  ## Generate a `GPU` suffixed ident
+  # Note: We want a deterministic name, because we call `getIdent` for the same symbol
+  # in multiple places atm.
+  case n.kind
+  of nnkIdent, nnkSym: result = ident(n.strVal & "GPU")
+  else: result = ident("`" & n.repr & "`GPU")
+
+proc determineDevicePtrs(r, i: NimNode, iTypes: seq[NimNode],
+                         passStructByPointer: bool): seq[(NimNode, NimNode)] =
+  ## Returns the device pointer ident and its associated original symbol.
+  for el in r:
+    if not el.allowsCopy(passStructByPointer):
+      error("The argument for `res`: " & $el.repr & " of type: " & $el.getTypeImpl().treerepr &
+        " does not allow copying. Copying to the address of all result variables is required." &
+        " It is a `" & $el.symKind & "` variable, but needs to be a `var`.")
+    result.add (getIdent(el), el)
+  for idx in 0 ..< i.len:
+    let input = i[idx]
+    let t = iTypes[idx]
+    if t.requiresCopy(passStructByPointer):
+      result.add (getIdent(input), input)
+
+proc assembleParams(r, i: NimNode, iTypes: seq[NimNode], passStructByPointer: bool): seq[NimNode] =
+  ## Returns all parameters. Depending on whether they require copies or
+  ## are `res` parameters, either the input parameter or the `GPU` parameter.
+  for el in r: # for `res` we always copy!
+    result.add getIdent(el)
+  for idx in 0 ..< i.len:
+    let input = i[idx]
+    let t = iTypes[idx]
+    if t.requiresCopy(passStructByPointer):
+      result.add getIdent(input)
+    else:
+      result.add input
+
+# little helper macro constructors
+template check(arg): untyped = nnkCall.newTree(ident"check", arg)
+template size(arg): untyped = nnkCall.newTree(ident"sizeof", arg)
+template address(arg): untyped = nnkCall.newTree(ident"addr", arg)
+template csize_t(arg): untyped = nnkCall.newTree(ident"csize_t", arg)
+template pointer(arg): untyped = nnkCall.newTree(ident"pointer", arg)
+template arrayTyp(num, typ): untyped = nnkBracketExpr.newTree(ident"array", newLit num, typ)
+template lenOf(arg): untyped = nnkCall.newTree(ident"len", arg)
+template mul(x, y): untyped = nnkInfix.newTree(ident"*", x, y)
+
+proc getSizeOf(arg: NimNode): NimNode =
+  ## Returns a call to `sizeof` for the given argument. The argument to `sizeof` must
+  ## be the size of the data we copy. If the argument is a `seq` we take into account
+  ## the number of elements. If the input type is already given as a `ptr T` type, we
+  ## need the size of `T` and not `ptr`.
+  case arg.typeKind
+  of ntyPtr: result = size(arg.getTypeInst()[0])
+  of ntySequence: result = mul(lenOf(arg), size(arg.getTypeInst()[1]))
+  else: result = size(arg)
+
+proc maybeAddress(n: NimNode): NimNode =
+  ## Returns the address of the given node, *IFF* the type is not a
+  ## pointer type already. In case the input is a `seq[T]`, we return `x[0].addr`.
+  case n.typeKind
+  of ntyPtr: result = n
+  of ntySequence: result = address( nnkBracketExpr.newTree(n, newLit 0) )
+  else: result = address(n)
+
+proc genParams(pId, r, i: NimNode, iTypes: seq[NimNode], passStructByPointer: bool): NimNode =
+  ## Generates the parameter `params` variable
+  let ps = assembleParams(r, i, iTypes, passStructByPointer)
+  result = nnkBracket.newTree()
+  for p in ps:
+    result.add pointer(maybeAddress p)
+  result = nnkLetSection.newTree(
+    nnkIdentDefs.newTree(pId, arrayTyp(ps.len, ident"pointer"), result)
+  )
+
+proc genVar(n: NimNode): (NimNode, NimNode) =
+  ## Generates a let `tmp` variable and returns its identifier and
+  ## the let section.
+  result[0] = genSym(nskLet, "tmp")
+  result[1] = nnkLetSection.newTree(
+    nnkIdentDefs.newTree(
+      result[0],
+      getTypeInst(n),
+      n
+    )
+  )
+
+proc genLocalVars(inputs: NimNode): (NimNode, NimNode) =
+  result[0] = newStmtList() # defines local vars
+  result[1] = nnkBracket.newTree() # returns new bracket of vars for parameters
+  for el in inputs:
+    case el.kind
+    of nnkLiterals, nnkConstDef: # define a local with the value of it
+      let (s, v) = genVar(el)
+      result[0].add v
+      result[1].add s
+    of nnkSym:
+      if el.strVal in ["true", "false"]:
+        let (s, v) = genVar(el)
+        result[0].add v
+        result[1].add s
+      else:
+        result[1].add el # keep symbol
+    else:
+      result[1].add el # keep symbol
+
+proc maybeWrap(n: NimNode): NimNode =
+  if n.kind notin {nnkBracket, nnkTupleConstr}:
+    result = nnkBracket.newTree(n)
+  else:
+    result = n
+
+proc endianCheck(): NimNode =
+  result = quote do:
+    static: doAssert cpuEndian == littleEndian, block:
+      # From https://developer.nvidia.com/cuda-downloads?target_os=Linux
+      # Supported architectures for Cuda are:
+      # x86-64, PowerPC 64 little-endian, ARM64 (aarch64)
+      # which are all little-endian at word-level.
+      #
+      # Due to limbs being also stored in little-endian, on little-endian host
+      # the CPU and GPU will have the same binary representation
+      # whether we use 32-bit or 64-bit words, so naive memcpy can be used for parameter passing.
+
+      "Most CPUs (x86-64, ARM) are little-endian, as are Nvidia GPUs, which allows naive copying of parameters.\n" &
+      "Your architecture '" & $hostCPU & "' is big-endian and GPU offloading is unsupported on it."
+
+proc execCudaImpl*(jitFn, numBlocks, threadsPerBlock, res, inputs, sharedMemSize: NimNode,
+                   passStructByPointer: static bool): NimNode =
+  # Maybe wrap individually given arguments in a `[]` bracket, e.g.
+  # `execCuda(res = foo, inputs = bar)`
+  let res = maybeWrap res
+  let inputs = maybeWrap inputs
+
+  result = newStmtList()
+  result.add endianCheck()
+
+  # get the types of the inputs
+  let rTypes = getTypes(res)
+  let iTypes = getTypes(inputs)
+
+  # determine all required `CUdeviceptr`
+  let devPtrs = determineDevicePtrs(res, inputs, iTypes, passStructByPointer)
+
+  # generate device pointers, allocate memory and copy data
+  for x in devPtrs:
+    # `var rGPU: CUdeviceptr`
+    result.add nnkVarSection.newTree(
+      nnkIdentDefs.newTree(
+        x[0],
+        ident"CUdeviceptr",
+        newEmptyNode()
+      )
+    )
+
+    # `check cuMemAlloc(rGPU, csize_t sizeof(r))`
+    result.add(
+      check nnkCall.newTree(
+        ident"cuMemAlloc",
+        x[0],
+        csize_t getSizeOf(x[1])
+      )
+    )
+    # `check cuMemcpyHtoD(aGPU, a.addr, csize_t sizeof(a))`
+    result.add(
+      check nnkCall.newTree(
+        ident"cuMemcpyHtoD",
+        x[0],
+        maybeAddress x[1],
+        csize_t getSizeOf(x[1])
+      )
+    )
+
+  # Generate local variables
+  let (decl, vars) = genLocalVars(inputs)
+  result.add decl
+
+  # assemble the parameters
+  let pId = ident"params"
+  let params = genParams(pId, res, vars, iTypes, passStructByPointer)
+  result.add params
+
+  # launch the kernel
+  result.add quote do:
+    let pAr = if `pId`.len > 0: `pId`[0].unsafeAddr
+              else: nil
+
+    # Create timing events
+    var start, stop: cudaEvent_t
+    check cudaEventCreate(addr start)
+    check cudaEventCreate(addr stop)
+
+    check cudaEventRecord(start, nil)
+    check cuLaunchKernel(
+            CUfunction(`jitFn`),     # dummy conversion on NVRTC, required on LLVM
+            `numBlocks`, 1, 1,       # grid(x, y, z)
+            `threadsPerBlock`, 1, 1, # block(x, y, z)
+            sharedMemBytes = `sharedMemSize`.uint32,
+            CUstream(nil),
+            pAr, nil)
+    check cudaDeviceSynchronize()
+    check cudaEventRecord(stop, nil)
+    check cudaEventSynchronize(stop)
+
+    var elapsedTime: float32
+    check cudaEventElapsedTime(addr elapsedTime, start, stop)
+    echo "[INFO]: Kernel execution took: ", elapsedTime, " ms"
+
+    check cudaEventDestroy(start)
+    check cudaEventDestroy(stop)
+
+
+  # copy back results
+  let devPtrsRes = determineDevicePtrs(res, nnkBracket.newTree(), @[], passStructByPointer)
+  for x in devPtrsRes:
+    result.add(
+      check nnkCall.newTree(
+        ident"cuMemcpyDtoH",
+        maybeAddress x[1],
+        x[0],
+        csize_t getSizeOf(x[1])
+      )
+    )
+
+  # free memory
+  for x in devPtrs:
+    result.add(
+      check nnkCall.newTree(
+        ident"cuMemFree",
+        x[0]
+      )
+    )
+  result = quote do:
+    block:
+      `result`
+
+macro execCuda*(jitFn: CUfunction,
+                res: typed,
+                inputs: typed): untyped =
+  ## Given a CUDA function, execute the kernel. Copies all non trivial data types to
+  ## to the GPU via `cuMemcpyHtoD`. Any argument given as `res` will be copied back
+  ## from the GPU after kernel execution finishes.
+  ##
+  ## IMPORTANT:
+  ## The arguments passed to the CUDA kernel will be in the order in which they are
+  ## given to the macro. This especially means `res` arguments will be passed first.
+  ##
+  ## Example:
+  ## ```nim
+  ## execCuda(fn, res = [r, s], inputs = [a, b, c]) # if all arguments have the same type
+  ## # or
+  ## execCuda(fn, res = (r, s), inputs = (a, b, c)) # if different types
+  ## ```
+  ## will pass the parameters as `[r, s, a, b, c]`.
+  ##
+  ## For more examples see the test case `tests/gpu/t_exec_literals_consts.nim`.
+  ##
+  ## We do not perform any checks on whether the given types are valid as arguments to
+  ## the CUDA target! Also, all arguments given as `res` are expected to be copied.
+  ## To return a value for a simple data type, use a `ptr X` type. However, it is allowed
+  ## to simply pass a `var` symbol as a `res` argument. We automatically copy to the
+  ## the memory location.
+  ##
+  ## We also copy all `res` data to the GPU, so that a return value can also be used
+  ## as an input.
+  ##
+  ## NOTE: This function is mainly intended for convenient execution of a single kernel
+  result = execCudaImpl(jitFn, newLit 1, newLit 1, res, inputs, newLit 0, passStructByPointer = false)
+
+macro execCuda*(jitFn: CUfunction,
+                numBlocks, threadsPerBlock: int,
+                res: typed,
+                inputs: typed): untyped =
+  ## Overload which takes a target number of threads and blocks
+  result = execCudaImpl(jitFn, numBlocks, threadsPerBlock, res, inputs, newLit 0, passStructByPointer = false)
+
+macro execCuda*(jitFn: CUfunction,
+                numBlocks, threadsPerBlock: int,
+                res: typed,
+                inputs: typed,
+                sharedMemSize: typed): untyped =
+  ## Overload which takes a target number of threads and blocks and a shared memory size
+  result = execCudaImpl(jitFn, numBlocks, threadsPerBlock, res, inputs, sharedMemSize, passStructByPointer = false)
+
+macro execCuda*(jitFn: CUfunction,
+                res: typed): untyped =
+  ## Overload of the above for empty `inputs`
+  result = execCudaImpl(jitFn, newLit 1, newLit 1, res, nnkBracket.newTree(), newLit 0, passStructByPointer = false)
diff --git a/constantine/math_compiler/experimental/nim_ast_to_cuda_ast.nim b/constantine/math_compiler/experimental/nim_ast_to_cuda_ast.nim
new file mode 100644
index 000000000..4521e4f68
--- /dev/null
+++ b/constantine/math_compiler/experimental/nim_ast_to_cuda_ast.nim
@@ -0,0 +1,1142 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import std / [macros, strutils, sequtils, options, sugar, tables, strformat]
+
+type
+  GpuNodeKind = enum
+    gpuVoid         # Just an empty statement. Useful to not emit anything
+    gpuProc         # Function definition (both device and global)
+    gpuCall         # Function call
+    gpuTemplateCall # Call to a Nim template
+    gpuIf           # If statement
+    gpuFor          # For loop
+    gpuWhile        # While loop
+    gpuBinOp        # Binary operation
+    gpuVar          # Variable declaration
+    gpuAssign       # Assignment
+    gpuIdent        # Identifier
+    gpuLit          # Literal value
+    gpuArrayLit     # Literal array constructor `[1, 2, 3]`
+    gpuPrefix       # Prefix e.g. `-`
+    gpuBlock        # Block of statements
+    gpuReturn       # Return statement
+    gpuDot          # Member access (a.b)
+    gpuIndex        # Array indexing (a[b])
+    gpuTypeDef      # Type definition
+    gpuObjConstr    # Object (struct) constructor
+    gpuInlineAsm    # Inline assembly (PTX)
+    gpuAddr         # Address of an expression
+    gpuDeref        # Dereferences an expression
+    gpuCast         # Cast expression
+    gpuComment      # Just a comment
+    gpuConstexpr    # A `constexpr`, i.e. compile time constant (Nim `const`)
+
+  GpuTypeKind = enum
+    gtVoid,
+    gtBool, gtUint8, gtUint16, gtInt16, gtUint32, gtInt32, gtUint64, gtInt64, gtFloat32, gtFloat64, gtSize_t, # atomics
+    gtArray,     # Static array `array[N, dtype]` -> `dtype[N]`
+    gtString,
+    gtObject,    # Struct types
+    gtPtr,       # Pointer type, carries inner type
+    gtVoidPtr    # Opaque void pointer
+
+  GpuTypeField = object
+    name: string
+    typ: GpuType
+
+  GpuType = ref object
+    case kind: GpuTypeKind
+    of gtPtr: to: GpuType # points to `to`
+    of gtObject:
+      name: string
+      oFields: seq[GpuTypeField]
+    of gtArray:
+      aTyp: GpuType # the inner type (must be some atomic base type at the moment)
+      aLen: int     # The length of the array. If `aLen == -1` we look at a generic (static) array. Will be given at instantiation time
+    else: discard
+
+  GpuAttribute = enum
+    attDevice = "__device__"
+    attGlobal = "__global__"
+    attForceInline = "__forceinline__"
+
+  GpuVarAttribute = enum
+    atvExtern = "extern"
+    atvShared = "__shared__"
+    atvVolatile = "volatile"
+    atvConstant = "__constant__" # use `{.constant.}` pragma, e.g. `var foo {.constant.}`
+
+  GpuAst = ref object
+    case kind: GpuNodeKind
+    of gpuVoid: discard
+    of gpuProc:
+      pName: string
+      pRetType: GpuType
+      pParams: seq[tuple[name: string, typ: GpuType]]
+      pBody: GpuAst
+      pAttributes: set[GpuAttribute] # order not important, hence set
+    of gpuCall:
+      cName: string
+      cArgs: seq[GpuAst]
+    of gpuTemplateCall:
+      tcName: string
+      tcArgs: seq[GpuAst]  # Arguments for template instantiation
+    of gpuIf:
+      ifCond: GpuAst
+      ifThen: GpuAst
+      ifElse: Option[GpuAst]  # None if no else branch
+    of gpuFor:
+      fVar: string
+      fStart, fEnd: GpuAst
+      fBody: GpuAst
+    of gpuWhile:
+      wCond: GpuAst
+      wBody: GpuAst
+    of gpuBinOp:
+      bOp: string
+      bLeft, bRight: GpuAst
+    of gpuVar:
+      vName: string
+      vType: GpuType
+      vInit: GpuAst
+      vRequiresMemcpy: bool
+      vAttributes: seq[GpuVarAttribute] # order is important, hence seq
+    of gpuAssign:
+      aLeft, aRight: GpuAst
+      aRequiresMemcpy: bool
+    of gpuIdent:
+      iName: string
+    of gpuLit:
+      lValue: string
+      lType: GpuType
+    of gpuConstexpr:
+      cIdent: GpuAst # the identifier
+      cValue: GpuAst # not just a string to support different types easily
+      cType: GpuType
+    of gpuArrayLit:
+      aValues: seq[string]
+      aLitType: GpuType # type of first element
+    of gpuBlock:
+      blockLabel: string # optional name of the block. If any given, will open a `{ }` scope in CUDA
+      statements: seq[GpuAst]
+    of gpuReturn:
+      rValue: GpuAst
+    of gpuDot:
+      dParent: GpuAst
+      dField: GpuAst #string
+    of gpuIndex:
+      iArr: GpuAst
+      iIndex: GpuAst
+    of gpuPrefix:
+      pOp: string
+      pVal: GpuAst
+    of gpuTypeDef:
+      tName: string
+      tFields: seq[GpuTypeField]
+    of gpuObjConstr:
+      ocName: string # type we construct
+      ## XXX: it would be better if we already fill the fields with default values here
+      ocFields: seq[GpuFieldInit] # the fields we initialize
+    of gpuInlineAsm:
+      stmt: string
+    of gpuComment:
+      comment: string
+    of gpuCast:
+      cTo: GpuType # type to cast to
+      cExpr: GpuAst # expression we cast
+    of gpuAddr:
+      aOf: GpuAst
+    of gpuDeref:
+      dOf: GpuAst
+
+  GpuFieldInit = object
+    name: string
+    value: GpuAst
+
+  TemplateInfo = object
+    params: seq[string]
+    body: GpuAst
+
+  GpuContext = object
+    ## XXX: need table for generic invocations. Then when we encounter a type, need to map to
+    ## the specific version
+    ## However, also need to keep every *generic procedure*. In their bodies the types are
+    ## only defined once they are called after all.
+    skipSemicolon: bool # whether we *currently* add semicolons at the end of a block or not
+    templates: Table[string, TemplateInfo]  # Maps template names to their info
+
+template nimonly*(): untyped {.pragma.}
+template cudaName*(s: string): untyped {.pragma.}
+
+
+proc `$`(x: GpuType): string =
+  if x == nil:
+    result = "GpuType(nil)"
+  else:
+    result = $x[]
+
+proc nimToGpuType(n: NimNode): GpuType
+
+proc initGpuType(kind: GpuTypeKind): GpuType =
+  ## If `kind` is `gtPtr` `to` must be the type we point to
+  if kind in [gtObject, gtPtr, gtArray]: raiseAssert "Objects/Pointers/Arrays must be constructed using `initGpuPtr/Object/ArrayType` "
+  result = GpuType(kind: kind)
+
+proc initGpuPtrType(to: GpuType): GpuType =
+  ## If `kind` is `gtPtr` `to` must be the type we point to
+  result = GpuType(kind: gtPtr, to: to)
+
+proc initGpuVoidPtr(): GpuType =
+  result = GpuType(kind: gtVoidPtr)
+
+proc initGpuObjectType(name: string, flds: seq[GpuTypeField]): GpuType =
+  ## If `kind` is `gtPtr` `to` must be the type we point to
+  result = GpuType(kind: gtObject, name: name, oFields: flds)
+
+proc initGpuArrayType(aTyp: NimNode, len: int): GpuType =
+  ## Construct an statically sized array type
+  result = GpuType(kind: gtArray, aTyp: nimToGpuType(aTyp), aLen: len)
+
+proc toGpuTypeKind(t: NimTypeKind): GpuTypeKind =
+  case t
+  #of ntyBool, ntyChar:
+    # , ntyEmpty, ntyAlias, ntyNil, ntyExpr, ntyStmt, ntyTypeDesc, ntyGenericInvocation, ntyGenericBody, ntyGenericInst, ntyGenericParam, ntyDistinct, ntyEnum, ntyOrdinal, ntyArray, ntyObject, ntyTuple, ntySet, ntyRange, ntyPtr, ntyRef, ntyVar, ntySequence, ntyProc,
+  #of ntyPointer, ntyUncheckedArray, ntyOpenArray, ntyString, ntyCString
+  # , ntyForward, ntyInt, ntyInt8,
+  of ntyBool: gtBool
+  of ntyInt16: gtInt16
+  of ntyInt32: gtInt32
+  of ntyInt64: gtInt64
+  of ntyInt:   gtInt64
+  of ntyFloat: gtFloat64
+  of ntyFloat32: gtFloat32
+  of ntyFloat64: gtFloat64
+  #of ntyFloat128: gtFloat128
+  of ntyUInt: gtUint64
+  of ntyUInt8: gtUint8
+  of ntyUInt16: gtUint16
+  of ntyUInt32: gtUint32
+  of ntyUInt64: gtUint64
+  else:
+    raiseAssert "Not supported yet: " & $t
+
+proc unpackGenericInst(t: NimNode): NimNode =
+  let tKind = t.typeKind
+  if tKind == ntyGenericInst:
+    let impl = t.getTypeImpl()
+    case impl.kind
+    of nnkDistinctTy: # just skip the distinct
+      result = impl[0]
+    else:
+      raiseAssert "Unsupport type so far: " & $t.treerepr & " of impl: " & $impl.treerepr
+  else:
+    result = t
+
+proc toGpuTypeKind(t: NimNode): GpuTypeKind =
+  result = t.unpackGenericInst().typeKind.toGpuTypeKind()
+
+proc getInnerPointerType(n: NimNode): GpuType =
+  doAssert n.typeKind in {ntyPtr, ntyPointer, ntyUncheckedArray, ntyVar} or n.kind == nnkPtrTy, "But was: " & $n.treerepr & " of typeKind " & $n.typeKind
+  if n.typeKind in {ntyPointer, ntyUncheckedArray}:
+    let typ = n.getTypeInst()
+    doAssert typ.kind == nnkBracketExpr, "No, was: " & $typ.treerepr
+    doAssert typ[0].kind in {nnkIdent, nnkSym}
+    doAssert typ[0].strVal in ["ptr", "UncheckedArray"]
+    result = nimToGpuType(typ[1])
+  elif n.kind == nnkPtrTy:
+    result = nimToGpuType(n[0])
+  elif n.kind == nnkAddr:
+    let typ = n.getTypeInst()
+    result = getInnerPointerType(typ)
+  elif n.kind == nnkVarTy:
+    # VarTy
+    #   Sym "BigInt"
+    result = nimToGpuType(n[0])
+  else:
+    raiseAssert "Found what: " & $n.treerepr
+
+proc determineArrayLength(n: NimNode): int =
+  case n[1].kind
+  of nnkSym:
+    # likely a constant, try to get its value
+    result = n[1].getImpl.intVal
+  of nnkIdent:
+    let msg = """Found array with length given by identifier: $#!
+You might want to create a typed template taking a typed parameter for this
+constant to force the Nim compiler to bind the symbol.
+""" % n[1].strVal
+    raiseAssert msg
+  else:
+    case n[1].kind
+    of nnkIntLit: result = n[1].intVal
+    else:
+      #doAssert n[1].kind == nnkIntLit, "No is: " & $n.treerepr
+      doAssert n[1].kind == nnkInfix, "No is: " & $n.treerepr
+      doAssert n[1][1].kind == nnkIntLit, "No is: " & $n.treerepr
+      doAssert n[1][1].intVal == 0, "No is: " & $n.treerepr
+      result = n[1][2].intVal + 1
+
+proc getTypeName(n: NimNode): string =
+  ## Returns the name of the type
+  case n.kind
+  of nnkIdent, nnkSym: result = n.strVal
+  of nnkObjConstr:  result = n.getTypeInst.strVal
+  else: raiseAssert "Unexpected node in `getTypeName`: " & $n.treerepr
+
+proc parseTypeFields(node: NimNode): seq[GpuTypeField]
+proc nimToGpuType(n: NimNode): GpuType =
+  ## Maps a Nim type to a type on the GPU
+  case n.kind
+  of nnkIdentDefs: # extract type for let / var based on explicit or implicit type
+    if n[n.len - 2].kind != nnkEmpty: # explicit type
+      result = nimToGpuType(n[n.len - 2])
+    else: # take from last element
+      result = nimToGpuType(n[n.len - 1].getTypeInst())
+  of nnkConstDef:
+    if n[1].kind != nnkEmpty: # has an explicit type
+      result = nimToGpuType(n[1])
+    else:
+      result = nimToGpuType(n[2]) # derive from the RHS literal
+  else:
+    if n.kind == nnkEmpty: return initGpuType(gtVoid)
+    case n.typeKind
+    of ntyBool, ntyInt .. ntyUint64: # includes all float types
+      result = initGpuType(toGpuTypeKind n.typeKind)
+    of ntyPtr, ntyVar:
+      result = initGpuPtrType(getInnerPointerType(n))
+    of ntyPointer:
+      result = initGpuVoidPtr()
+    of ntyUncheckedArray:
+      ## Note: this is just the internal type of the array. It is only a pointer due to
+      ## `ptr UncheckedArray[T]`. We simply remove the `UncheckedArray` part.
+      result = getInnerPointerType(n)
+    of ntyObject:
+      let impl = n.getTypeImpl
+      let flds = impl.parseTypeFields()
+      let typName = getTypeName(n) # might be an object construction
+      result = initGpuObjectType(typName, flds)
+    of ntyArray:
+      # For a generic, static array type, e.g.:
+      if n.kind == nnkSym:
+        return nimToGpuType(getTypeImpl(n))
+      if n.len == 3:
+        # BracketExpr
+        #   Sym "array"
+        #   Ident "N"
+        #   Sym "uint32"
+        doAssert n.len == 3, "Length was not 3, but: " & $n.len & " for node: " & n.treerepr
+        doAssert n[0].strVal == "array"
+        let len = determineArrayLength(n)
+        result = initGpuArrayType(n[2], len)
+      else:
+        # just an array literal
+        # Bracket
+        #   UIntLit 2013265921
+        let len = n.len
+        result = initGpuArrayType(n[0], len)
+    #of ntyCompositeTypeClass:
+    #  echo n.getTypeImpl.treerepr
+    #  error("o")
+    of ntyGenericInvocation:
+      result = initGpuType(gtVoid)
+      error("Generics are not supported in the CUDA DSL so far.")
+    of ntyGenericInst:
+      result = n.unpackGenericInst().nimToGpuType()
+    else: raiseAssert "Type : " & $n.typeKind & " not supported yet: " & $n.treerepr
+
+proc assignOp(op: string, isBoolean: bool): string =
+  ## Returns the correct CUDA operation given the Nim operator.
+  ## This is to replace things like `shl`, `div` or `mod`
+  case op
+  of "div": result = "/"
+  of "mod": result = "%"
+  of "shl": result = "<<"
+  of "shr": result = ">>"
+  of "and": result = if isBoolean: "&&" else: "&" # bitwise OR
+  of "or":  result = if isBoolean: "||" else: "|" # bitwise OR
+  of "xor": result = "^"
+  else: result = op
+
+proc assignPrefixOp(op: string): string =
+  ## Returns the correct CUDA operation given the Nim operator.
+  case op
+  of "not": result = "!"
+  else: result = op
+
+proc parseTypeFields(node: NimNode): seq[GpuTypeField] =
+  doAssert node.kind == nnkObjectTy
+  doAssert node[2].kind == nnkRecList
+  for ch in node[2]:
+    doAssert ch.kind == nnkIdentDefs and ch.len == 3
+    result.add GpuTypeField(name: ch[0].strVal,
+                            typ: nimToGpuType(ch[1]))
+
+template findIdx(col, el): untyped =
+  var res = -1
+  for i, it in col:
+    if it.name == el:
+      res = i
+      break
+  res
+
+proc ensureBlock(ast: GpuAst): GpuAst =
+  ## Ensures the body is a block, e.g. if single statement in a for loop, we want the
+  ## body to be a block regardless.
+  if ast.kind == gpuBlock: ast
+  else: GpuAst(kind: gpuBlock, statements: @[ast])
+
+proc requiresMemcpy(n: NimNode): bool =
+  ## At the moment we only emit a `memcpy` statement for array types
+  result = n.typeKind == ntyArray and n.kind != nnkBracket # need to emit a memcpy
+
+proc getFnName(n: NimNode): string =
+  ## Returns the name for the function. Either the symbol name _or_
+  ## the `{.cudaName.}` pragma argument.
+  # check if the implementation has a pragma
+  if n.kind == nnkSym:
+    # Check if `cudaName` pragma used:
+    # ProcDef
+    #   Sym "syncthreads"
+    #   Empty
+    #   Empty
+    #   FormalParams
+    #     Empty
+    #   Pragma
+    #     ExprColonExpr
+    #       Sym "cudaName"           <- if this exists
+    #       StrLit "__syncthreads"   <- use this name
+    #   Empty
+    #   DiscardStmt
+    #     Empty
+    let impl = n.getImpl
+    if impl.kind in [nnkProcDef, nnkFuncDef]:
+      let pragma = impl.pragma
+      if pragma.kind != nnkEmpty and pragma[0].kind == nnkExprColonExpr:
+        if pragma[0][0].kind in [nnkIdent, nnkSym] and pragma[0][0].strVal == "cudaName":
+          return pragma[0][1].strVal # return early to avoid lots of branches
+  # else we use the str representation (repr for open / closed sym choice nodes)
+  result = n.repr
+
+
+
+proc collectProcAttributes(n: NimNode): set[GpuAttribute] =
+  doAssert n.kind == nnkPragma
+  for pragma in n:
+    doAssert pragma.kind in [nnkIdent, nnkSym], "Unexpected node kind: " & $pragma.treerepr
+    case pragma.strVal
+    of "device": result.incl attDevice
+    of "global": result.incl attGlobal
+    of "forceinline": result.incl attForceInline
+    of "nimonly":
+      # used to fully ignore functions!
+      return
+    else:
+      raiseAssert "Unexpected pragma for procs: " & $pragma.treerepr
+
+proc collectAttributes(n: NimNode): seq[GpuVarAttribute] =
+  ## Collects all pragmas associated with the given variable.
+  ## Takes the `nnkPragma` node of the `nnkIdentDefs` associated with it.
+  # Example AST with multiple pragmas
+  # IdentDefs
+  #   PragmaExpr
+  #     Sym "sharedMem"
+  #     Pragma
+  #       Sym "cuExtern"
+  #       Sym "shared"
+  #   BracketExpr
+  #     Sym "array"
+  #     IntLit 0
+  #     Sym "BigInt"
+  #   Empty
+  doAssert n.kind == nnkPragma
+  for pragma in n:
+    doAssert pragma.kind in [nnkIdent, nnkSym], "Unexpected node kind: " & $pragma.treerepr
+    # NOTE: We don't use `parseEnum`, because on the Nim side some of the attributes
+    # do not match the CUDA string we need to emit, which is what the string value of
+    # the `GpuVarAttribute` enum stores
+    case pragma.strVal
+    of "cuExtern", "extern": result.add atvExtern
+    of "shared": result.add atvShared
+    of "volatile": result.add atvVolatile
+    of "constant": result.add atvConstant
+    else:
+      raiseAssert "Unexpected pragma: " & $pragma.treerepr
+
+proc toGpuAst(ctx: var GpuContext, node: NimNode): GpuAst =
+  ## XXX: things still left to do:
+  ## - support `result` variable? Currently not supported. Maybe we will won't
+
+  #echo node.treerepr
+  case node.kind
+  of nnkEmpty: result = GpuAst(kind: gpuVoid) # nothing to do
+  of nnkStmtList:
+    result = GpuAst(kind: gpuBlock)
+    for el in node:
+      result.statements.add ctx.toGpuAst(el)
+  of nnkBlockStmt:
+    # BlockStmt
+    #   Sym "unrolledIter_i0"  <- ignore the block label for now!
+    #   Call
+    #     Sym "printf"
+    #     StrLit "i = %u\n"
+    #     IntLit 0
+    let blockLabel = if node[0].kind in {nnkSym, nnkIdent}: node[0].strVal
+                     elif node[0].kind == nnkEmpty: ""
+                     else: raiseAssert "Unexpected node in block label field: " & $node.treerepr
+    result = GpuAst(kind: gpuBlock,
+                    blockLabel: blockLabel)
+    for i in 1 ..< node.len: # index 0 is the block label
+      result.statements.add ctx.toGpuAst(node[i])
+  of nnkStmtListExpr: # for statements that return a value.
+    ## XXX: For CUDA just a block?
+    result = GpuAst(kind: gpuBlock)
+    for el in node:
+      if el.kind != nnkEmpty:
+        result.statements.add ctx.toGpuAst(el)
+  of nnkDiscardStmt:
+    # just process the child node if any
+    result = ctx.toGpuAst(node[0])
+
+  of nnkProcDef, nnkFuncDef:
+    result = GpuAst(kind: gpuProc)
+    result.pName = node.name.strVal
+    doAssert node[3].kind == nnkFormalParams
+    result.pRetType = nimToGpuType(node[3][0]) # arg 0 is return type
+    # Process parameters
+    for i in 1 ..< node[3].len:
+      let param = node[3][i]
+      let numParams = param.len - 2 # 3 if one param, one more for each of same type, example:
+      let typIdx = param.len - 2 # second to last is the type
+      # IdentDefs
+      #   Ident "x"
+      #   Ident "y"
+      #   Ident "res"
+      #   PtrTy
+      #     Ident "float32"   # `param.len - 2`
+      #   Empty               # `param.len - 1`
+      let paramType = nimToGpuType(param[typIdx])
+      for i in 0 ..< numParams:
+        result.pParams.add((param[i].strVal, paramType))
+
+    # Process pragmas
+    if node.pragma.kind != nnkEmpty:
+      doAssert node.pragma.len > 0, "Pragma kind non empty, but no pragma?"
+      result.pAttributes = collectProcAttributes(node.pragma)
+      if result.pAttributes.len == 0: # means `nimonly` was applied
+        return GpuAst(kind: gpuVoid)
+
+    result.pBody = ctx.toGpuAst(node.body)
+      .ensureBlock() # single line procs should be a block to generate `;`
+
+  of nnkLetSection, nnkVarSection:
+    # For a section with multiple declarations, create a block
+    result = GpuAst(kind: gpuBlock)
+    for declaration in node:
+      # Each declaration gets converted to a gpuVar
+      var varNode = GpuAst(kind: gpuVar)
+      case declaration[0].kind
+      of nnkIdent, nnkSym:
+        # IdentDefs               # declaration
+        #   Sym "res"             # declaration[0]
+        #   Sym "uint32"
+        #   Empty
+        varNode.vName = declaration[0].strVal
+      of nnkPragmaExpr:
+        # IdentDefs               # declaration
+        #   PragmaExpr            # declaration[0]
+        #     Sym "res"           # declaration[0][0]
+        #     Pragma              # declaration[0][1]
+        #       Ident "volatile"
+        #   Sym "uint32"
+        #   Empty
+        varNode.vName = declaration[0][0].strVal
+        doAssert declaration[0][1].kind == nnkPragma
+        varNode.vAttributes = collectAttributes(declaration[0][1])
+      else: raiseAssert "Unexpected node kind for variable: " & $declaration.treeRepr
+      varNode.vType = nimToGpuType(declaration)
+      ## XXX: handle initialization for array types. Need a memcpy!
+      ## In principle should be straightforward. Turn e.g.
+      ## ```nim
+      ## let someData: array[8, uint32] = foo()
+      ## let x = BigInt(limbs: someData)
+      ## ```
+      ## into
+      ## ```cuda
+      ## unsigned int someData[8] = foo();
+      ## BigInt x = {{}};
+      ## memcpy((&x.limbs), (&someData), sizeof(unsigned int) * 8);
+      ## ```
+      ## Or something along those lines.
+      if declaration.len > 2 and declaration[2].kind != nnkEmpty:  # Has initialization
+        varNode.vInit = ctx.toGpuAst(declaration[2])
+        varNode.vRequiresMemcpy = requiresMemcpy(declaration[2])
+      result.statements.add(varNode)
+
+  of nnkAsgn:
+    result = GpuAst(kind: gpuAssign)
+    result.aLeft = ctx.toGpuAst(node[0])
+    result.aRight = ctx.toGpuAst(node[1])
+    result.aRequiresMemcpy = requiresMemcpy(node[1])
+
+  of nnkIfStmt:
+    result = GpuAst(kind: gpuIf)
+    let branch = node[0]  # First branch
+    result.ifCond = ctx.toGpuAst(branch[0])
+    result.ifThen = ensureBlock ctx.toGpuAst(branch[1])
+    if node.len > 1 and node[^1].kind == nnkElse:
+      result.ifElse = some(ensureBlock ctx.toGpuAst(node[^1][0]))
+
+  of nnkForStmt:
+    result = GpuAst(kind: gpuFor)
+    result.fVar = node[0].strVal
+    # Assuming range expression
+    result.fStart = ctx.toGpuAst(node[1][1])
+    result.fEnd = ctx.toGpuAst(node[1][2])
+    result.fBody = ensureBlock ctx.toGpuAst(node[2])
+  of nnkWhileStmt:
+    result = GpuAst(kind: gpuWhile)
+    result.wCond = ctx.toGpuAst(node[0]) # the condition
+    result.wBody = ensureBlock ctx.toGpuAst(node[1])
+
+  of nnkTemplateDef:
+    ## NOTE: Currently we process templates, but we expect them to be already
+    ## expanded by the Nim compiler. Thus we could in theory expand them manually
+    ## but fortunately we don't need to.
+    let tName = node[0].strVal
+
+    # Extract parameters
+    var tParams = newSeq[string]()
+    for i in 1 ..< node[3].len:
+      let param = node[3][i]
+      tParams.add param[0].strVal
+    # and the body
+    let tBody = ctx.toGpuAst(node.body)
+
+    # Store template in context
+    ctx.templates[tName] = TemplateInfo(
+      params: tParams,
+      body: tBody
+    )
+
+    result = GpuAst(kind: gpuVoid)
+
+  of nnkCall, nnkCommand:
+    # Check if this is a template call
+    let name = getFnName(node[0]) # cannot use `strVal`, might be a symchoice
+    let args = node[1..^1].mapIt(ctx.toGpuAst(it))
+    # Producing a template call something like this (but problematic due to overloads etc)
+    # we could then perform manual replacement of the template in the CUDA generation pass.
+    if false: #  name in ctx.templates: #
+      result = GpuAst(kind: gpuTemplateCall)
+      result.tcName = name
+      result.tcArgs = args
+    else:
+      result = GpuAst(kind: gpuCall)
+      result.cName = name
+      result.cArgs = args
+
+  of nnkInfix:
+    result = GpuAst(kind: gpuBinOp)
+    # if left/right is boolean we need logical AND/OR, otherwise
+    # bitwise
+    let isBoolean = node[1].typeKind == ntyBool
+    result.bOp = assignOp(node[0].repr, isBoolean) # repr so that open sym choice gets correct name
+    result.bLeft = ctx.toGpuAst(node[1])
+    result.bRight = ctx.toGpuAst(node[2])
+
+  of nnkDotExpr:
+    result = GpuAst(kind: gpuDot)
+    result.dParent = ctx.toGpuAst(node[0])
+    result.dField = ctx.toGpuAst(node[1])
+
+  of nnkBracketExpr:
+    result = GpuAst(kind: gpuIndex)
+    result.iArr = ctx.toGpuAst(node[0])
+    result.iIndex = ctx.toGpuAst(node[1])
+
+  of nnkIdent, nnkSym, nnkOpenSymChoice:
+    result = GpuAst(kind: gpuIdent)
+    result.iName = node.repr # for sym choices
+
+  # literal types
+  of nnkIntLit, nnkInt32Lit:
+    result = GpuAst(kind: gpuLit)
+    result.lValue = $node.intVal
+    result.lType = initGpuType(gtInt32)
+  of nnkUInt32Lit:
+    result = GpuAst(kind: gpuLit)
+    result.lValue = $node.intVal
+    result.lType = initGpuType(gtUInt32)
+  of nnkFloat64Lit, nnkFloatLit:
+    result = GpuAst(kind: gpuLit)
+    result.lValue = $node.floatVal & "f"
+    result.lType = initGpuType(gtFloat64)
+  of nnkFloat32Lit:
+    result = GpuAst(kind: gpuLit)
+    result.lValue = $node.floatVal & "f"
+    result.lType = initGpuType(gtFloat32)
+  of nnkRStrLit:
+    result = GpuAst(kind: gpuLit)
+    result.lValue = node.strVal
+    result.lType = initGpuType(gtString)
+  of nnkStrLit:
+    # For regular string literals escape them (but don't prefix/suffix with `"`)
+    result = GpuAst(kind: gpuLit)
+    result.lValue = node.strVal.escape("", "")
+    result.lType = initGpuType(gtString)
+  of nnkNilLit:
+    result = GpuAst(kind: gpuLit)
+    result.lValue = "NULL"
+    result.lType = initGpuVoidPtr()
+
+  of nnkPar:
+    if node.len == 1: # just take body
+      result = ctx.toGpuAst(node[0])
+    else:
+      error("`nnkPar` with more than one argument currently not supported. Got: " & $node.treerepr)
+
+  of nnkReturnStmt:
+    if node[0].kind == nnkAsgn and node[0][0].strVal == "result":
+      # skip the result and just get the RHS
+      result = GpuAst(kind: gpuReturn,
+                      rValue: ctx.toGpuAst(node[0][1]))
+    else:
+      result = GpuAst(kind: gpuReturn,
+                      rValue: ctx.toGpuAst(node[0]))
+
+  of nnkPrefix:
+    result = GpuAst(kind: gpuPrefix,
+                    pVal: ctx.toGpuAst(node[1]))
+    result.pOp = assignPrefixOp(node[0].strVal)
+
+  of nnkTypeSection:
+    result = GpuAst(kind: gpuBlock)
+    for el in node: # walk each type def
+      doAssert el.kind == nnkTypeDef
+      result.statements.add ctx.toGpuAst(el)
+  of nnkTypeDef:
+    result = GpuAst(kind: gpuTypeDef, tName: node[0].strVal)
+    result.tFields = parseTypeFields(node[2])
+  of nnkObjConstr:
+    let typName = getTypeName(node)
+    result = GpuAst(kind: gpuObjConstr, ocName: typName)
+    # get all fields of the type
+    let flds = node[0].getTypeImpl.parseTypeFields() # sym
+    # find all fields that have been defined by the user
+    var ocFields: seq[GpuFieldInit]
+    for i in 1 ..< node.len: # all fields to be init'd
+      doAssert node[i].kind == nnkExprColonExpr
+      ocFields.add GpuFieldInit(name: node[i][0].strVal,
+                                value: ctx.toGpuAst(node[i][1]))
+    # now add fields in order of the type declaration
+    for i in 0 ..< flds.len:
+      let idx = findIdx(ocFields, flds[i].name)
+      if idx >= 0:
+        result.ocFields.add ocFields[idx]
+      else:
+        let dfl = GpuAst(kind: gpuLit, lValue: "DEFAULT", lType: GpuType(kind: gtVoid))
+        result.ocFields.add GpuFieldInit(name: flds[i].name,
+                                         value: dfl)
+
+  of nnkAsmStmt:
+    doAssert node.len == 2
+    doAssert node[0].kind == nnkEmpty
+    result = GpuAst(kind: gpuInlineAsm,
+                    stmt: node[1].strVal)
+
+  of nnkBracket:
+    let aLitTyp = nimToGpuType(node[0])
+    var aValues = newSeq[string]()
+    for el in node:
+      ## XXX: do not use `repr`, e.g. if `1'u32` we'll get the `'u32` suffix
+      aValues.add $el.intVal
+    result = GpuAst(kind: gpuArrayLit,
+                    aValues: aValues,
+                    aLitType: aLitTyp)
+
+  of nnkCommentStmt:
+    result = GpuAst(kind: gpuComment, comment: node.strVal)
+
+  of nnkHiddenStdConv:
+    doAssert node[0].kind == nnkEmpty
+    result = ctx.toGpuAst(node[1])
+  of nnkCast, nnkConv:
+    # also map type conversion, e.g. `let i: int = 5; i.uint32` to a cast
+    result = GpuAst(kind: gpuCast, cTo: nimToGpuType(node[0]), cExpr: ctx.toGpuAst(node[1]))
+
+  of nnkAddr, nnkHiddenAddr:
+    # `HiddenAddr` appears for accesses to `var` passed arguments
+    result = GpuAst(kind: gpuAddr, aOf: ctx.toGpuAst(node[0]))
+
+  of nnkHiddenDeref:
+    case node.typeKind
+    of ntyUncheckedArray:
+      # `getTypeInst(node)` would yield:
+      # BracketExpr
+      #   Sym "UncheckedArray"
+      #   Sym "uint32"
+      # i.e. it is a `ptr UncheckedArray[T]`
+      # In this case we just ignore the deref, because on the CUDA
+      # side it is just a plain pointer array we index into using
+      # `foo[i]`.
+      result = ctx.toGpuAst(node[0])
+    else:
+      # Otherwise we treat it like a regular deref
+      # HiddenDeref
+      #   Sym "x"
+      # With e.g. `getTypeInst(node) = Sym "BigInt"`
+      # and `node.typeKind = ntyObject`
+      # due to a `var` parameter
+      result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0]))
+  of nnkDerefExpr: #, nnkHiddenDeref:
+    result = GpuAst(kind: gpuDeref, dOf: ctx.toGpuAst(node[0]))
+
+  of nnkConstDef:
+    result = GpuAst(kind: gpuConstexpr,
+                    cIdent: ctx.toGpuAst(node[0]),
+                    cValue: ctx.toGpuAst(node[2]),
+                    cType: nimToGpuType(node))
+  of nnkConstSection:
+    result = GpuAst(kind: gpuBlock)
+    for el in node: # walk each type def
+      doAssert el.kind == nnkConstDef
+      result.statements.add ctx.toGpuAst(el)
+
+  else:
+    echo "Unhandled node kind in toGpuAst: ", node.kind
+    raiseAssert "Unhandled node kind in toGpuAst: " & $node.treerepr
+    result = GpuAst(kind: gpuBlock)
+
+proc gpuTypeToString(t: GpuTypeKind): string =
+  case t
+  of gtBool: "bool"
+  of gtUint8: "unsigned char"
+  of gtUint16: "unsigned short"
+  of gtUint32: "unsigned int"
+  of gtUint64: "unsigned long long"
+  of gtInt16: "short"
+  of gtInt32: "int"
+  of gtInt64: "long long"
+  of gtFloat32: "float"
+  of gtFloat64: "double"
+  of gtVoid: "void"
+  of gtSize_t: "size_t"
+  of gtPtr: "*"
+  of gtVoidPtr: "void*"
+  of gtObject: "struct"
+  of gtString: "const char*"
+  else:
+    raiseAssert "Invalid type : " & $t
+
+
+proc gpuTypeToString(t: GpuType, ident: string = "", allowArrayToPtr = false, allowEmptyIdent = false): string
+proc getInnerArrayType(t: GpuType): string =
+  ## Returns the name of the inner most type for a nested array.
+  case t.kind
+  of gtArray:
+    result = getInnerArrayType(t.aTyp)
+  else:
+    result = gpuTypeToString(t)
+
+proc getInnerArrayLengths(t: GpuType): string =
+  ## Returns the lengths of the inner array types for a nested array.
+  case t.kind
+  of gtArray:
+    let inner = getInnerArrayLengths(t.aTyp)
+    result = &"[{$t.aLen}]"
+    if inner.len > 0:
+      result.add &"{inner}"
+  else:
+    result = ""
+
+proc gpuTypeToString(t: GpuType, ident: string = "", allowArrayToPtr = false,
+                     allowEmptyIdent = false,
+                    ): string =
+  ## Given an optional identifier required for array types
+  ##
+  ## XXX: we don't support this at the moment, it occured to me as something that
+  ## could be useful sometimes...
+  ## If `allowArrayToPtr` we allow casting a statically sized array to a pointer
+  var skipIdent = false
+  case t.kind
+  of gtPtr:
+    if t.to.kind == gtArray: # ptr to array type
+      # need to pass `*` for the pointer into the identifier, i.e.
+      # `state: var array[4, BigInt]`
+      # must become
+      # `BigInt (*state)[4]`
+      # so as our ident we pass `theIdent = (*<ident>)` and generate the type for the internal
+      # array type, which yields e.g. `BigInt <theIdent>[4]`.
+      let ptrStar = gpuTypeToString(t.kind)
+      result = gpuTypeToString(t.to, "(" & ptrStar & ident & ")")
+      skipIdent = true
+    else:
+      let typ = gpuTypeToString(t.to, allowEmptyIdent = allowEmptyIdent)
+      let ptrStar = gpuTypeToString(t.kind)
+      result = typ & ptrStar
+  of gtArray:
+    # empty idents happen in e.g. function return types or casts
+    if ident.len == 0 and not allowEmptyIdent: # and not allowArrayToPtr:
+      error("Invalid call, got an array type but don't have an identifier: " & $t)
+    case t.aTyp.kind
+    of gtArray: # nested array
+      let typ = getInnerArrayType(t)        # get inner most type
+      let lengths = getInnerArrayLengths(t) # get lengths as `[X][Y][Z]...`
+      result = typ & " " & ident & lengths
+    else:
+      # NOTE: Nested arrays don't have an inner identifier!
+      if t.aLen == 0: ## XXX: for the moment for 0 length arrays we generate flexible arrays instead
+        result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[]"
+      else:
+        result = gpuTypeToString(t.aTyp, allowEmptyIdent = allowEmptyIdent) & " " & ident & "[" & $t.aLen & "]"
+    skipIdent = true
+  of gtObject: result = t.name
+  else:        result = gpuTypeToString(t.kind)
+
+  if ident.len > 0 and not skipIdent: # still need to add ident
+    result.add " " & ident
+
+proc genFunctionType(typ: GpuType, fn: string, fnArgs: string): string =
+  ## Returns the correct function with its return type
+  if typ.kind == gtPtr and typ.to.kind == gtArray:
+    # crazy stuff. Syntax to return a pointer to a statically sized array:
+    # `Foo (*fnName(fnArgs))[ArrayLen]`
+    # where the return type is actually:
+    # `Foo (*)[ArrayLen]` (which already is hideous)
+    let arrayTyp = typ.to.aTyp
+    let innerTyp = gpuTypeToString(arrayTyp, allowEmptyIdent = true)
+    let innerLen = $typ.to.aLen
+    result = &"{innerTyp} (*{fn}({fnArgs}))[{innerLen}]"
+  else:
+    # normal stuff
+    result = &"{gpuTypeToString(typ, allowEmptyIdent = true)} {fn}({fnArgs})"
+
+proc genCuda(ctx: var GpuContext, ast: GpuAst, indent = 0): string
+
+proc address(a: string): string = "&" & a
+proc address(ctx: var GpuContext, a: GpuAst): string = address(ctx.genCuda(a))
+
+proc size(a: string): string = "sizeof(" & a & ")"
+proc size(ctx: var GpuContext, a: GpuAst): string = size(ctx.genCuda(a))
+proc size(ctx: var GpuContext, a: GpuType): string = size(gpuTypeToString(a, allowEmptyIdent = true))
+
+proc genMemcpy(lhs, rhs, size: string): string =
+  result = &"memcpy({lhs}, {rhs}, {size})"
+
+template withoutSemicolon(ctx: var GpuContext, body: untyped): untyped =
+  ctx.skipSemicolon = true
+  body
+  ctx.skipSemicolon = false
+
+proc genCuda(ctx: var GpuContext, ast: GpuAst, indent = 0): string =
+  let indentStr = "  ".repeat(indent)
+  #echo "At: ", ast.repr, " SKIP SEMICOLON: ", ctx.skipSemicolon
+
+  case ast.kind
+  of gpuVoid: return # nothing to emit
+  of gpuProc:
+    let attrs = collect:
+      for att in ast.pAttributes:
+        $att
+
+    # Parameters
+    var params: seq[string]
+    for (name, typ) in ast.pParams:
+      params.add gpuTypeToString(typ, name, allowEmptyIdent = false)
+    let fnArgs = params.join(", ")
+    let fnSig = genFunctionType(ast.pRetType, ast.pName, fnArgs)
+
+    # extern "C" is needed to avoid name mangling
+    result = indentStr & "extern \"C\" " & attrs.join(" ") & " " &
+             fnSig & "{\n"
+
+    result &= ctx.genCuda(ast.pBody, indent + 1)
+    result &= "\n" & indentStr & "}"
+
+  of gpuBlock:
+    result = ""
+    if ast.blockLabel.len > 0:
+      result.add "\n" & indentStr & "{ // " & ast.blockLabel & "\n"
+    for i, el in ast.statements:
+      result.add ctx.genCuda(el, indent)
+      if el.kind != gpuBlock and not ctx.skipSemicolon: # nested block ⇒ ; already added
+        result.add ";"
+      if i < ast.statements.high:
+        result.add "\n"
+    if ast.blockLabel.len > 0:
+      result.add "\n" & indentStr & "} // " & ast.blockLabel & "\n"
+
+  of gpuVar:
+    result = indentStr & ast.vAttributes.join(" ") & " " & gpuTypeToString(ast.vType, ast.vName)
+    # If there is an initialization, the type might require a memcpy
+    if ast.vInit != nil and not ast.vRequiresMemcpy:
+      result &= " = " & ctx.genCuda(ast.vInit)
+    elif ast.vInit != nil:
+      result.add ";\n"
+      result.add indentStr & genMemcpy(address(ast.vName), ctx.address(ast.vInit),
+                                       size(ast.vName))
+
+  of gpuAssign:
+    if ast.aRequiresMemcpy:
+      result = indentStr & genMemcpy(ctx.address(ast.aLeft), ctx.address(ast.aRight),
+                                     ctx.size(ast.aLeft))
+    else:
+      result = indentStr & ctx.genCuda(ast.aLeft) & " = " & ctx.genCuda(ast.aRight)
+
+  of gpuIf:
+    # skip semicolon in the condition. Otherwise can lead to problematic code
+    ctx.withoutSemicolon: # skip semicolon for if bodies
+      result = indentStr & "if (" & ctx.genCuda(ast.ifCond) & ") {\n"
+    result &= ctx.genCuda(ast.ifThen, indent + 1) & "\n"
+    result &= indentStr & "}"
+    if ast.ifElse.isSome:
+      result &= " else {\n"
+      result &= ctx.genCuda(ast.ifElse.get, indent + 1) & "\n"
+      result &= indentStr & "}"
+
+  of gpuFor:
+    result = indentStr & "for(int " & ast.fVar & " = " &
+             ctx.genCuda(ast.fStart) & "; " &
+             ast.fVar & " < " & ctx.genCuda(ast.fEnd) & "; " &
+             ast.fVar & "++) {\n"
+    result &= ctx.genCuda(ast.fBody, indent + 1) & "\n"
+    result &= indentStr & "}"
+  of gpuWhile:
+    ctx.withoutSemicolon:
+      result = indentStr & "while (" & ctx.genCuda(ast.wCond) & "){\n"
+    result &= ctx.genCuda(ast.wBody, indent + 1) & "\n"
+    result &= indentStr & "}"
+
+  of gpuDot:
+    result = ctx.genCuda(ast.dParent) & "." & ctx.genCuda(ast.dField)
+
+  of gpuIndex:
+    result = ctx.genCuda(ast.iArr) & "[" & ctx.genCuda(ast.iIndex) & "]"
+
+  of gpuCall:
+    result = indentStr & ast.cName & "(" &
+             ast.cArgs.mapIt(ctx.genCuda(it)).join(", ") & ")"
+
+  of gpuTemplateCall:
+    error("Template calls are not supported at the moment. In theory there shouldn't even _be_ any template " &
+      "calls in the expanded body of the `cuda` macro.")
+    when false: # Template replacement would look something like this:
+      let templ = ctx.templates[ast.tcName]
+      let expandedBody = substituteTemplateArgs(
+        templ.body,
+        templ.params,
+        ast.tcArgs
+      )
+      result = ctx.genCuda(expandedBody, indent)
+
+  of gpuBinOp:
+    result = indentStr & "(" & ctx.genCuda(ast.bLeft) & " " &
+             ast.bOp & " " &
+             ctx.genCuda(ast.bRight) & ")"
+
+  of gpuIdent:
+    result = ast.iName
+
+  of gpuLit:
+    if ast.lType.kind == gtString: result = "\"" & ast.lValue & "\""
+    elif ast.lValue == "DEFAULT": result = "{}" # default initialization, `DEFAULT` placeholder
+    else: result = ast.lValue
+
+  of gpuArrayLit:
+    result = "{"
+    for i, el in ast.aValues:
+      result.add "(" & gpuTypeToString(ast.aLitType) & ")" & el
+      if i < ast.aValues.high:
+        result.add ", "
+    result.add "}"
+
+  of gpuReturn:
+    result = indentStr & "return " & ctx.genCuda(ast.rValue)
+
+  of gpuPrefix:
+    result = ast.pOp & ctx.genCuda(ast.pVal)
+
+  of gpuTypeDef:
+    result = "struct " & ast.tName & "{\n"
+    for el in ast.tFields:
+      result.add "  " & gpuTypeToString(el.typ, el.name) & ";\n"
+    result.add "}"
+
+  of gpuObjConstr:
+    result = "{"
+    for i, el in ast.ocFields:
+      result.add ctx.genCuda(el.value)
+      if i < ast.ocFields.len - 1:
+        result.add ", "
+    result.add "}"
+
+  of gpuInlineAsm:
+    result = indentStr & "asm(" & ast.stmt.strip & ");"
+
+  of gpuComment:
+    result = indentStr & "/* " & ast.comment & " */"
+
+  of gpuCast:
+    result = "(" & gpuTypeToString(ast.cTo, allowEmptyIdent = true) & ")" & ctx.genCuda(ast.cExpr)
+
+  of gpuAddr:
+    result = "(&" & ctx.genCuda(ast.aOf) & ")"
+
+  of gpuDeref:
+    result = "(*" & ctx.genCuda(ast.dOf) & ")"
+
+  of gpuConstexpr:
+    if ast.cType.kind == gtArray:
+      result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, ctx.genCuda(ast.cIdent)) & " = " & ctx.genCuda(ast.cValue)
+    else:
+      result = indentStr & "__constant__ " & gpuTypeToString(ast.cType, allowEmptyIdent = true) & " " & ctx.genCuda(ast.cIdent) & " = " & ctx.genCuda(ast.cValue)
+
+  else:
+    echo "Unhandled node kind in genCuda: ", ast.kind
+    raiseAssert "Unhandled node kind in genCuda: " & ast.repr
+    result = ""
+
+macro cuda*(body: typed): string =
+  ## WARNING: The following are *not* supported:
+  ## - UFCS: because this is a pure untyped DSL, there is no way to disambiguate between
+  ##         what is a field access and a function call. Hence we assume any `nnkDotExpr`
+  ##         is actually a field access!
+  ## - most regular Nim features :)
+  var ctx = GpuContext()
+  let gpuAst = ctx.toGpuAst(body)
+  ## NOTE: `header` is currently unused. Not sure yet if we'll ever need it.
+  let header = """
+// #include "foo.h"
+"""
+
+  let body = ctx.genCuda(gpuAst)
+  result = newLit(header & body)
+
+when isMainModule:
+  # Mini example
+  let kernel = cuda:
+    proc square(x: float32): float32 {.device.} =
+      if x < 0.0'f32:
+        result = 0.0'f32
+      else:
+        result = x * x
+
+    proc computeSquares(
+      output: ptr float32,
+      input: ptr float32,
+      n: int32
+    ) {.global.} =
+      let idx: uint32 = blockIdx.x * blockDim.x + threadIdx.x
+      if idx < n:
+        var temp: float32 = 0.0'f32
+        for i in 0..<4:
+          temp += square(input[idx + i * n])
+        output[idx] = temp
+
+  echo kernel
diff --git a/constantine/math_compiler/experimental/nvrtc_field_ops.nim b/constantine/math_compiler/experimental/nvrtc_field_ops.nim
new file mode 100644
index 000000000..b13744cb5
--- /dev/null
+++ b/constantine/math_compiler/experimental/nvrtc_field_ops.nim
@@ -0,0 +1,617 @@
+import std / strformat
+
+import
+  # Internal
+  constantine/named/algebras,
+  constantine/math/io/[io_bigints, io_fields],
+  constantine/math/arithmetic,
+  constantine/platforms/abstractions {.all.},
+  constantine/platforms/abis/nvidia_abi,
+  constantine/math_compiler/experimental/runtime_compile,
+  constantine/serialization/io_limbs,
+  constantine/named/deriv/precompute
+
+import constantine/platforms/abstractions
+export negInvModWord
+
+import std / macros
+macro asm_comment*(msg: typed): untyped =
+  var msgLit = nnkTripleStrLit.newNimNode()
+  msgLit.strVal = "\"// " & msg.strVal & "\""
+  result = nnkAsmStmt.newTree(newEmptyNode(), msgLit)
+
+template bigintToUint32Limbs*(b: typed): untyped =
+  let limbs = b.limbs
+  when CTT_32:
+    var res = default(array[b.limbs.len, uint32])
+    for i in 0 ..< limbs.len:
+      res[i] = limbs[i].uint32
+  else:
+    {.error: "Logic to convert 64 bit limbs to 32 bit limbs at compile time still unfinished.".}
+    # need twice as many limbs to go from 64bit to 32bit
+    ## XXX: Use number of bits required to check if the
+    ## last limbs needs to be dropped
+    var res = default(array[b.limbs.len * 2, uint32])
+    for i in 0 ..< b.limbs.len:
+      res[i*2]     = limbs[i].uint32
+      res[i*2 + 1] = (limbs[i] shr 32).uint32
+  res
+
+template defBigInt*(N: typed): untyped {.dirty.} =
+  # Utility for add with carry operations
+  type
+    BigInt = object
+      limbs: array[N, uint32]
+  template `[]`(x: BigInt, idx: int): untyped = x.limbs[idx]
+  template `[]=`(x: BigInt, idx: int, val: uint32): untyped = x.limbs[idx] = val
+  template `[]`(x: ptr BigInt, idx: int): untyped = x[].limbs[idx]
+  template `[]=`(x: ptr BigInt, idx: int, val: uint32): untyped = x[].limbs[idx] = val
+
+  template len(x: BigInt): int = N
+
+template defPtxHelpers*(): untyped {.dirty.} =
+  ## Note: the below would just be generated from a macro of course, similar to
+  ## `constantine/platforms/llvm/asm_nvidia.nim`.
+
+  ## IMPORTANT NOTE: For the below procs that define inline PTX statements:
+  ## It is very important (in the current implementation) that each of the
+  ## return values is marked `{.volatile.}` so that the NVRTC compiler does not
+  ## eliminate any of the function calls. Despite them being `__forceinline__`,
+  ## it might do such a thing if the return value is not used.
+
+  proc add_cio(a, b: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"addc.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc add_ci(a, b: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"addc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc add_co(a, b: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"add.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc sub_bo(a, b: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"sub.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc sub_bi(a, b: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"subc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc sub_bio(a, b: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"subc.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc slct(a, b: uint32, pred: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+# "slct.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(pred)
+    asm """
+"slct.u32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(pred)
+"""
+    return res
+
+  proc mul_lo(a, b: uint32): uint32 {.device, forceinline.} =
+    ## Returns the lower 32 bit of the uint32 multiplication, i.e.
+    ## behaves as unsigned multiplication modulo 2^32 (matches LLVM `mul`).
+    var res {.volatile.}: uint32
+    asm """
+"mul.lo.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc mul_hi(a, b: uint32): uint32 {.device, forceinline.} =
+    ## Returns the upper 32 bit of the uint32 multiplication
+    var res {.volatile.}: uint32
+    asm """
+"mul.hi.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+
+  # r <- a * b + c
+  proc mulloadd(a, b, c: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"mad.lo.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c)
+"""
+    return res
+
+  proc mulloadd_co(a, b, c: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c)
+"""
+    return res
+
+  proc mulloadd_ci(a, b, c: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"madc.lo.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c)
+"""
+    return res
+
+  proc mulloadd_cio(a, b, c: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c)
+"""
+    return res
+
+  # r <- (a * b) >> 32 + c
+  # r <- (a * b) >> 64 + c
+  proc mulhiadd(a, b, c: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"mad.hi.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c)
+"""
+    return res
+
+  proc mulhiadd_co(a, b, c: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"mad.hi.cc.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c)
+"""
+    return res
+
+  proc mulhiadd_ci(a, b, c: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"madc.hi.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c)
+"""
+    return res
+
+  proc mulhiadd_cio(a, b, c: uint32): uint32 {.device, forceinline.} =
+    var res {.volatile.}: uint32
+    asm """
+"madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(c)
+"""
+    return res
+
+template defCoreFieldOps*(T: typed): untyped {.dirty.} =
+  # Need to get the limbs & spare bits data in a static context
+  template getM0ninv(): untyped = static: T.getModulus().negInvModWord().uint32
+  template spareBits(): untyped = static: (BigInt().limbs.len * WordSize - T.bits())
+
+  ## TODO: avoid the explicit array size here
+  proc toBigInt(limbs: array[1, uint32]): BigInt {.nimonly.} =
+    result.limbs = limbs
+
+  const M = toBigInt(bigintToUint32Limbs(T.getModulus))
+  const MontyOne = toBigInt(bigintToUint32Limbs(T.getMontyOne))
+  const PP1D2 = toBigInt(bigintToUint32Limbs(T.getPrimePlus1div2))
+  const M0NInv = getM0ninv()
+
+  proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} =
+    ## If a >= Modulus: r <- a-M
+    ## else:            r <- a
+    ##
+    ## This is constant-time straightline code.
+    ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
+    ##
+    ## To be used when the final substraction can
+    ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
+    var scratch: BigInt = BigInt()
+
+    # Contains 0x0001 (if overflowed limbs) or 0x0000
+    let overflowedLimbs = add_ci(0'u32, 0'u32)
+
+    # Now substract the modulus, and test a < M with the last borrow
+    scratch[0] = sub_bo(a[0], M[0])
+    staticFor i, 1, N:
+      scratch[i] = sub_bio(a[i], M[i])
+
+    # 1. if `overflowedLimbs`, underflowedModulus >= 0
+    # 2. if a >= M, underflowedModulus >= 0
+    # if underflowedModulus >= 0: a-M else: a
+    # TODO: predicated mov instead?
+    ## TODO: Fix this. `slct` needs a negative value for the else branch
+    let underflowedModulus = sub_bi(overflowedLimbs, 0'u32)
+
+    var r: BigInt = BigInt()
+    staticFor i, 0, N:
+      r[i] = slct(scratch[i], a[i], underflowedModulus.int32)
+    return r
+
+  proc finalSubNoOverflow(a, M: BigInt): BigInt {.device.} =
+    ## If a >= Modulus: r <- a-M
+    ## else:            r <- a
+    ##
+    ## This is constant-time straightline code.
+    ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
+    ##
+    ## To be used when the modulus does not use the full bitwidth of the storing words
+    ## (say using 255 bits for the modulus out of 256 available in words)
+    var scratch: BigInt = BigInt()
+
+    # Now substract the modulus, and test a < M with the last borrow
+    scratch[0] = sub_bo(a[0], M[0])
+    staticFor i, 1, N:
+      scratch[i] = sub_bio(a[i], M[i])
+
+    # If it underflows here, `a` was smaller than the modulus, which is what we want
+    ## TODO: Fix this. `slct` needs a negative value for the else branch
+    let underflowedModulus = sub_bi(0'u32, 0'u32)
+
+    var r: BigInt = BigInt()
+    staticFor i, 0, N:
+      r[i] = slct(scratch[i], a[i], underflowedModulus.int32)
+    return r
+
+  proc modadd(a, b, M: BigInt): BigInt {.device.} =
+    ## Generate an optimized modular addition kernel
+    ## with parameters `a, b, modulus: Limbs -> Limbs`
+    # try to add two bigints
+    var t = BigInt() # temporary
+
+    t[0] = add_co(a[0], b[0])
+    staticFor i, 1, N:
+      t[i] = add_cio(a[i], b[i])
+
+    # can use `when` of course!
+    when spareBits() >= 1: # if spareBits() >= 1: # would also work
+      t = finalSubNoOverflow(t, M)
+    else:
+      t = finalSubMayOverflow(t, M)
+
+    return t
+
+  proc modsub(a, b, M: BigInt): BigInt {.device.} =
+    ## Generate an optimized modular substraction kernel
+    ## with parameters `a, b, modulus: Limbs -> Limbs`
+    var t = BigInt()
+
+    t[0] = sub_bo(a[0], b[0])
+    staticFor i, 1, a.len:
+      t[i] = sub_bio(a[i], b[i])
+
+    let underflowMask = sub_bi(0'u32, 0'u32)
+
+    # If underflow
+    # TODO: predicated mov instead?
+    var maskedM: BigInt = BigInt()
+    staticFor i, 0, N:
+      maskedM[i] = M[i] and underflowMask
+
+    t[0] = add_co(t[0], maskedM[0])
+    staticFor i, 1, a.len-1:
+      t[i] = add_cio(t[i], maskedM[i])
+    when N > 1:
+      t[N-1] = add_ci(t[N-1], maskedM[N-1])
+
+    return t
+
+  proc mtymul_CIOS_sparebit(a, b, M: BigInt, finalReduce: bool): BigInt {.device.} =
+    ## Generate an optimized modular multiplication kernel
+    ## with parameters `a, b, modulus: Limbs -> Limbs`
+    var t: BigInt = BigInt()
+    template m0ninv: untyped = M0NInv
+
+    # Algorithm
+    # -----------------------------------------
+    #
+    # On x86, with a single carry chain and a spare bit:
+    #
+    # for i=0 to N-1
+    #   (A, t[0]) <- a[0] * b[i] + t[0]
+    #    m        <- (t[0] * m0ninv) mod 2ʷ
+    #   (C, _)    <- m * M[0] + t[0]
+    #   for j=1 to N-1
+    #     (A, t[j])   <- a[j] * b[i] + A + t[j]
+    #     (C, t[j-1]) <- m * M[j] + C + t[j]
+    #
+    #   t[N-1] = C + A
+    #
+    # with MULX, ADCX, ADOX dual carry chains
+    #
+    # for i=0 to N-1
+    #   for j=0 to N-1
+    # 		(A,t[j])  := t[j] + a[j]*b[i] + A
+    #   m := t[0]*m0ninv mod W
+    # 	C,_ := t[0] + m*M[0]
+    # 	for j=1 to N-1
+    # 		(C,t[j-1]) := t[j] + m*M[j] + C
+    #   t[N-1] = C + A
+    #
+    # In our case, we only have a single carry flag
+    # but we have a lot of registers
+    # and a multiply-accumulate instruction
+    #
+    # Hence we can use the dual carry chain approach
+    # one chain after the other instead of interleaved like on x86.
+
+    when N > 1:
+      staticFor i, 0, N:
+        # Multiplication
+        # -------------------------------
+        #   for j=0 to N-1
+        # 		(A,t[j])  := t[j] + a[j]*b[i] + A
+        #
+        # for 4 limbs, implicit column-wise carries
+        #
+        # t[0]     = t[0] + (a[0]*b[i]).lo
+        # t[1]     = t[1] + (a[1]*b[i]).lo + (a[0]*b[i]).hi
+        # t[2]     = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi
+        # t[3]     = t[3] + (a[3]*b[i]).lo + (a[2]*b[i]).hi
+        # overflow =                         (a[3]*b[i]).hi
+        #
+        # or
+        #
+        # t[0]     = t[0] + (a[0]*b[i]).lo
+        # t[1]     = t[1] + (a[0]*b[i]).hi + (a[1]*b[i]).lo
+        # t[2]     = t[2] + (a[2]*b[i]).lo + (a[1]*b[i]).hi
+        # t[3]     = t[3] + (a[2]*b[i]).hi + (a[3]*b[i]).lo
+        # overflow =    carry              + (a[3]*b[i]).hi
+        #
+        # Depending if we chain lo/hi or even/odd
+        # The even/odd carry chain is more likely to be optimized via μops-fusion
+        # as it's common to compute the full product. That said:
+        # - it's annoying if the number of limbs is odd with edge conditions.
+        # - GPUs are RISC architectures and unlikely to have clever instruction rescheduling logic
+        let bi = b[i]
+        var A = 0'u32
+
+        if i == 0:
+          staticFor j, 0, N:
+            t[j] = mul_lo(a[j], bi)
+        else:
+          t[0] = mulloadd_co(a[0], bi, t[0])
+          staticFor j, 1, N:
+            t[j] = mulloadd_cio(a[j], bi, t[j])
+          A = add_ci(0'u32, 0'u32)          # assumes N > 1
+        t[1] = mulhiadd_co(a[0], bi, t[1])  # assumes N > 1
+        staticFor j, 2, N:
+          t[j] = mulhiadd_cio(a[j-1], bi, t[j])
+        A = mulhiadd_ci(a[N-1], bi, A)
+        # Reduction
+        # -------------------------------
+        #   m := t[0]*m0ninv mod W
+        #
+        # 	C,_ := t[0] + m*M[0]
+        # 	for j=1 to N-1
+        # 		(C,t[j-1]) := t[j] + m*M[j] + C
+        #   t[N-1] = C + A
+        #
+        # for 4 limbs, implicit column-wise carries
+        #    _  = t[0] + (m*M[0]).lo
+        #  t[0] = t[1] + (m*M[1]).lo + (m*M[0]).hi
+        #  t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
+        #  t[2] = t[3] + (m*M[3]).lo + (m*M[2]).hi
+        #  t[3] = A + carry          + (m*M[3]).hi
+        #
+        # or
+        #
+        #    _  = t[0] + (m*M[0]).lo
+        #  t[0] = t[1] + (m*M[0]).hi + (m*M[1]).lo
+        #  t[1] = t[2] + (m*M[2]).lo + (m*M[1]).hi
+        #  t[2] = t[3] + (m*M[2]).hi + (m*M[3]).lo
+        #  t[3] = A + carry          + (m*M[3]).hi
+
+        let m = mul_lo(t[0], m0ninv)
+        let _ = mulloadd_co(m, M[0], t[0])
+        staticFor j, 1, N:
+          t[j-1] = mulloadd_cio(m, M[j], t[j])
+        t[N-1] = add_ci(A, 0)
+        # assumes N > 1
+        t[0] = mulhiadd_co(m, M[0], t[0])
+        staticFor j, 1, N-1:
+          t[j] = mulhiadd_cio(m, M[j], t[j])
+        t[N-1] = mulhiadd_ci(m, M[N-1], t[N-1])
+    else: # single limb, e.g. BabyBear (N=1)
+      # 1. Compute t = a * b (low and high, emulates lagged code in N limb branch)
+      # 2. Compute m = t * m0ninv mod 2^32
+      # 3. Compute t = (t + m*M) >> 32
+
+      # Step 1: t = a * b
+      let t0 = mul_lo(a[0], b[0]) # lower 32 bit
+      let t1 = mul_hi(a[0], b[0]) # upper 32 bit
+
+      # Step 2: m = t * m0ninv mod 2^32
+      let m = mul_lo(t0, m0ninv)
+
+      # Step 3: t = (t + m*M) >> 32
+      let _ = mulloadd_co(m, M[0], t0) # Low word discarded, but calc for possible carry
+      t[0] = mulhiadd_ci(m, M[0], t1)
+
+    if finalReduce:
+      t = finalSubNoOverflow(t, M)
+
+    return t
+  ##  More general field operations
+
+  proc setZero(a: var BigInt) {.device.} =
+    ## Sets all limbs of the field element to zero in place
+    # Zero all limbs
+    for i in 0 ..< N:
+      a[i] = 0'u32
+
+  proc setOne(a: var BigInt) {.device.} =
+    ## Sets the field element to one in Montgomery form
+    ## For a field element to be valid in Montgomery form,
+    ## we need x · R mod M with R = 2^(WordBitWidth * numWords)
+    template montyOne: untyped = MontyOne # Get the Montgomery form of 1 from static context
+    # Copy the Montgomery form of 1
+    for i in 0 ..< N:
+      a[i] = montyOne[i] # .uint32
+
+  proc add(r: var BigInt, a, b: BigInt) {.device.} =
+    ## Addition of two finite field elements stored in `a` and `b`.
+    ## The result is stored in `r`.
+    r = modadd(a, b, M)
+
+  proc sub(r: var BigInt, a, b: BigInt) {.device.} =
+    ## Subtraction of two finite field elements stored in `a` and `b`.
+    ## The result is stored in `r`.
+    r = modsub(a, b, M)
+
+  proc mul(r: var BigInt, a, b: BigInt) {.device.} =
+    ## Multiplication of two finite field elements stored in `a` and `b`.
+    ## The result is stored in `r`.
+    r = mtymul_CIOS_sparebit(a, b, M, true)
+
+  proc ccopy(a: var BigInt, b: BigInt, condition: bool) {.device.} =
+    ## Conditional copy in CUDA
+    ## If condition is true: b is copied into a
+    ## If condition is false: a is left unmodified
+    ##
+    ## Note: This is constant-time
+    # Use selp instruction for constant-time selection:
+    # if condition then b else a
+    ## XXX: add support for `IfExpr`! Requires though.
+    var cond: int32
+    if condition:
+      cond = 1'i32
+    else:
+      cond = -1'i32 # `slct` checks for `>= 0` as the true branch!
+    for i in 0 ..< N:
+      a[i] = slct(b[i], a[i], cond)
+
+  proc csetZero(r: var BigInt, condition: bool) {.device.} =
+    ## Conditionally set `r` to zero in CUDA
+    ##
+    ## Note: This is constant-time
+    var t = BigInt()
+    t.setZero()
+    r.ccopy(t, condition)
+
+  proc csetOne(r: var BigInt, condition: bool) {.device.} =
+    ## Conditionally set `r` to one in CUDA
+    ##
+    ## Note: This is constant-time
+    template mOne: untyped = MontyOne
+    r.ccopy(mOne, condition)
+
+  proc cadd(r: var BigInt, a: BigInt, condition: bool) {.device.} =
+    ## Conditionally add `a` to `r` in place in CUDA.
+    ##
+    ## Note: This is constant-time
+    var t = BigInt()
+    t.add(r, a)
+    r.ccopy(t, condition)
+
+  proc csub(r: var BigInt, a: BigInt, condition: bool) {.device.} =
+    ## Conditionally subtract `a` from `r` in place in CUDA.
+    ##
+    ## Note: This is constant-time
+    var t = BigInt()
+    t.sub(r, a)
+    r.ccopy(t, condition)
+
+  proc doubleElement(r: var BigInt, a: BigInt) {.device.} =
+    ## Double `a` and store it in `r` in CUDA.
+    ##
+    ## Note: This is constant-time
+    r.add(a, a)
+
+  proc nsqr(r: var BigInt, a: BigInt, count: int) {.device.} =
+    ## Performs `nsqr`, that is multiple squarings of `a` and stores it in `r`
+    ## in CUDA.
+    ##
+    ## Note: This is constant-time
+    ##
+    ## TODO: Add a `skipFinalSub` argument?
+    r = a # copy over a
+    for i in 0 ..< count-1:
+      r = mtymul_CIOS_sparebit(r, r, M, finalReduce = false)
+    # last one with reducing
+    r = mtymul_CIOS_sparebit(r, r, M, finalReduce = true)
+
+  proc isZero(r: var bool, a: BigInt) {.device.} =
+    ## Checks if `a` is zero in CUDA. Result is written to `r`.
+    ##
+    ## Note: This is constant-time
+    #r = true
+    #staticFor i, 0, a.len:
+    #  r = r and a[i] == 0'u32
+    var isZero = a[0]
+    staticFor i, 0, a.len:
+      isZero = isZero or a[i]
+    r = isZero == 0'u32
+
+  proc isOdd(r: var bool, a: BigInt) {.device.} =
+    ## Checks if the Montgomery value of `a` is odd in CUDA. Result is written to `r`.
+    ##
+    ## IMPORTANT: The canonical value may or may not be odd if the Montgomery
+    ## representation is odd (and vice versa!).
+    ##
+    ## Note: This is constant-time
+    # check if least significant byte has first bit set
+    r = (a[0] and 1'u32).bool
+
+  proc neg(r: var BigInt, a: BigInt) {.device.} =
+    ## Computes the negation of `a` and stores it in `r` in CUDA.
+    ##
+    ## Note: This is constant-time
+    # Check if input is zero
+    var isZ: bool = false
+    isZ.isZero(a)
+    # Subtraction `M - a`
+    var t = BigInt()
+    ## XXX: Is it safe to use `modsub` here?
+    t.sub(M, a)
+    # If input zero, we want `r = 0` instead of `r = M`!
+    t.csetZero(isZ)
+    r = t
+
+  proc cneg(r: var BigInt, a: BigInt, condition: bool) {.device.} =
+    ## Conditionally negate `a` and store it in `r` if `condition` is true, otherwise
+    ## copy over `a` into `r` in CUDA.
+    ##
+    ## Note: This is constant-time
+    r.neg(a)
+    r.ccopy(a, not condition)
+
+  proc shiftRight(r: var BigInt, k: uint32) {.device.} =
+    ## Shift `r` right by `k` bits in-nplace in CUDA.
+    ##
+    ## k MUST be less than the base word size (2^31)
+    ##
+    ## Note: This is constant-time
+    let wordBitWidth = sizeof(uint32) * 8
+    let shiftLeft = wordBitWidth.uint32 - k
+
+    # process all but the last word
+    staticFor i, 0, r.len - 1:
+      let current = r[i]
+      let next = r[i + 1]
+
+      let rightPart = current shr k
+      let leftPart = next shl shiftLeft
+      r[i] = rightPart or leftPart
+
+    # handle the last word
+    let lastIdx = r.len - 1
+    r[lastIdx] = r[lastIdx] shr k
+
+  proc div2(r: var BigInt) {.device.} =
+    ## Divide `r` by 2 in-place in CUDA.
+    ##
+    ## Note: This is constant-time
+    # check if the input is odd
+    var isO: bool = false
+    isO.isOdd(r)
+
+    # perform the division using a right shift
+    r.shiftRight(1)
+
+    # if it was odd, add `M+1/2` to go 'half-way around'
+    r.cadd(PP1D2, isO)
+
diff --git a/constantine/math_compiler/experimental/runtime_compile.nim b/constantine/math_compiler/experimental/runtime_compile.nim
new file mode 100644
index 000000000..038c8f920
--- /dev/null
+++ b/constantine/math_compiler/experimental/runtime_compile.nim
@@ -0,0 +1,326 @@
+# Constantine
+# Copyright (c) 2018-2019    Status Research & Development GmbH
+# Copyright (c) 2020-Present Mamy André-Ratsimbazafy
+# Licensed and distributed under either of
+#   * MIT license (license terms in the root directory or at http://opensource.org/licenses/MIT).
+#   * Apache v2 license (license terms in the root directory or at http://www.apache.org/licenses/LICENSE-2.0).
+# at your option. This file may not be copied, modified, or distributed except according to those terms.
+
+import
+  std / [strformat, strutils]
+
+import constantine/platforms/abis/nvidia_abi
+
+import ./nim_ast_to_cuda_ast
+import ./cuda_execute_dsl
+export cuda_execute_dsl
+export nim_ast_to_cuda_ast
+
+## Set to true, if you want some extra output (driver & runtime version for example)
+const DebugCuda {.booldefine.} = true
+
+## Dummy data for the typed nature of the `cuda` macro. These define commonly used
+## CUDA specific names so that they produce valid Nim code in the context of a typed macro.
+template global*() {.pragma.}
+template device*() {.pragma.}
+template forceinline*() {.pragma.}
+
+# If attached to a `var` it will be treated as a
+# `__constant__`! Only useful if you want to define a
+# constant without initializing it (and then use
+# `cudaMemcpyToSymbol` / `copyToSymbol` to initialize it
+# before executing the kernel)
+template constant*() {.pragma.}
+type
+  Dim* = cint ## dummy to have access to math
+  NvBlockIdx* = object
+    x*: Dim
+    y*: Dim
+    z*: Dim
+  NvBlockDim = object
+    x*: Dim
+    y*: Dim
+    z*: Dim
+  NvThreadIdx* = object
+    x*: Dim
+    y*: Dim
+    z*: Dim
+  NvGridDim = object
+    x*: Dim
+    y*: Dim
+    z*: Dim
+
+
+## These are dummy elements to make CUDA block / thread index / dim
+## access possible in the *typed* `cuda` macro. It cannot be `const`,
+## because then the typed code would evaluate the values before we
+## can work with it from the typed macro.
+let blockIdx* = NvBlockIdx()
+let blockDim* = NvBlockDim()
+let gridDim* = NvGridDim()
+let threadIdx* = NvThreadIdx()
+
+## Similar for procs. They don't need any implementation, as they won't ever be actually called.
+proc printf*(fmt: string) {.varargs.} = discard
+proc memcpy*(dst, src: pointer, size: int) = discard
+
+## `cuExtern` is mapped to `extern`, but has a different name, because Nim has its
+## own `extern` pragma (due to requiring an argument it cannot be reused):
+## https://nim-lang.org/docs/manual.html#foreign-function-interface-extern-pragma
+template cuExtern*(): untyped {.pragma.}
+template shared*(): untyped {.pragma.}
+## You would typically use `cuExtern` and `shared` together:
+## `var x {.cuExtern, shared.}: array[N, Foo]`
+## for example to declare a constant array that is filled by the
+## host before kernel execution.
+
+## While you can use `malloc` on device with small sizes, it is usually not
+## recommended to do so.
+proc malloc*(size: csize_t): pointer  = discard
+proc free*(p: pointer) = discard
+proc syncthreads*() {.cudaName: "__syncthreads".} = discard
+
+
+
+type
+  NVRTC* = object
+    numBlocks* = 32 # number of blocks to launch
+    threadsPerBlock* = 128 # number of threads for each block. Total threads: `numBlocks * threadsPerBlock`
+    name*: string # Name of the program (of the generated in memory CUDA file)
+    prog*: nvrtcProgram
+    log*: string # The compilation log
+    ptx*: string # PTX of the program
+    cubin*: pointer
+    cubinSize*: csize_t
+    device*: CUdevice
+    kernel*: CUfunction
+    module*: CUmodule
+    context*: CUcontext
+    moduleLoaded*: bool
+
+proc `=destroy`(nvrtc: NVRTC) =
+  if nvrtc.module.pointer != nil:
+    check cuModuleUnload nvrtc.module
+  if nvrtc.context.pointer != nil:
+    check cuCtxDestroy nvrtc.context
+
+proc initNvrtc*(cuda: string, name = "sample.cu"): NVRTC =
+  ## Initializes an NVRTC object for the given program `cuda`
+  when DebugCuda:
+    var x: cint
+    check cuDriverGetVersion(x.addr)
+    echo "Driver version: ", x
+
+    var rtVer: cint
+    echo cudaRuntimeGetVersion(addr rtVer)
+    echo "Runtime ver: ", rtVer
+
+    var prop: cudaDeviceProp
+    echo cudaGetDeviceProperties(addr prop, 0);
+    echo "Compute capability: ", prop.major, " ", prop.minor
+
+  var
+    context: CUcontext
+    device: CUdevice
+
+  check cuInit(0)
+  check cuDeviceGet(device, 0)
+  check cuCtxCreate(context, 0, device)
+
+  # Create an instance of nvrtcProgram based on the passed code
+  var prog: nvrtcProgram
+  check nvrtcCreateProgram(addr(prog), cstring cuda, cstring name, 0, nil, nil)
+
+  result = NVRTC(prog: prog, name: name,
+                 device: device,
+                 context: context)
+
+
+proc log*(nvrtc: var NVRTC) =
+  ## Retrieve the compilation log.
+  var logSize: csize_t
+  check nvrtcGetProgramLogSize(nvrtc.prog, addr logSize)
+
+  var log = cstring newString(Natural logSize)
+
+  check nvrtcGetProgramLog(nvrtc.prog, log)
+  nvrtc.log = $log # usually empty if no issues found by the compiler
+
+proc compile*(nvrtc: var NVRTC) =
+  # Compile the program with fmad disabled.
+  # Note: Can specify GPU target architecture explicitly with '-arch' flag.
+  const
+    Options = [
+      cstring "--gpu-architecture=compute_61", # or whatever your GPU arch is
+      # "--fmad=false", # and whatever other options for example
+    ]
+
+    NumberOfOptions = cint Options.len
+  let compileResult =  nvrtcCompileProgram(nvrtc.prog, NumberOfOptions,
+                                           cast[cstringArray](addr Options[0]))
+
+  nvrtc.log()
+  ## XXX: only in `DebugCuda`?
+  echo "Compilation log:\n------------------------------"
+  echo nvrtc.log
+  echo "------------------------------"
+  check compileResult
+
+proc getPtx*(nvrtc: var NVRTC) =
+  ## Obtain PTX from the program.
+  var ptxSize: csize_t
+  check nvrtcGetPTXSize(nvrtc.prog, addr ptxSize)
+
+  var ptx = newString(int ptxSize)
+  check nvrtcGetPTX(nvrtc.prog, ptx)
+
+  check nvrtcDestroyProgram(addr nvrtc.prog) # Destroy the program.
+  nvrtc.ptx = ptx
+
+  when DebugCuda:
+    echo "PTX size: ", ptxSize
+    #echo "-------------------- PTX --------------------\n", nvrtc.ptx
+    writeFile("/tmp/kernel.ptx", nvrtc.ptx)
+
+proc load*(nvrtc: var NVRTC) =
+  # After getting the PTX...
+  var error_log = newString(8192)
+  var info_log = newString(8192)
+
+  ## NOTE: if you wish to use the `link` approach, pass `nvrtc.cubin` instead of `PTX`
+  #let status = cuModuleLoadData(addr nvrtc.module, nvrtc.cubin)
+  let status = cuModuleLoadData(nvrtc.module, cstring nvrtc.ptx)
+  if status != CUDA_SUCCESS:
+    var error_str: cstring #const char* error_str;
+    check cuGetErrorString(status, cast[cstringArray](addr error_str));
+    echo "Module load failed: ", error_str
+    echo "JIT Error log: ", error_log
+    echo "JIT Info log: ", info_log
+    quit(1)
+
+  nvrtc.moduleLoaded = true
+
+proc link*(nvrtc: var NVRTC) =
+  ## OPTIONAL STEP. Alternative to passing the PTX to `cuModuleLoadData`.
+  # Create linker
+  var linkState: CUlinkState
+  var linkOptions: array[4, CUjit_option]
+  var linkOptionValues: array[4, pointer]
+  var errorLog = newString(4096)
+  var infoLog = newString(4096)
+  var walltime: float32
+
+  linkOptions[0] = CU_JIT_WALL_TIME
+  linkOptionValues[0] = addr walltime
+  linkOptions[1] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+  linkOptionValues[1] = cast[pointer](4096)
+  linkOptions[2] = CU_JIT_ERROR_LOG_BUFFER
+  linkOptionValues[2] = addr errorLog[0]
+  linkOptions[3] = CU_JIT_INFO_LOG_BUFFER
+  linkOptionValues[3] = addr infoLog[0]
+
+  check cuLinkCreate(3, addr linkOptions[0], addr linkOptionValues[0], addr linkState)
+
+  # Add PTX
+  var res = cuLinkAddData(linkState, CU_JIT_INPUT_PTX,
+                          cast[pointer](cstring nvrtc.ptx),
+                          csize_t nvrtc.ptx.len,
+                          "kernel.ptx", 0, nil, nil)
+
+  var status: CUresult
+  if res != CUDA_SUCCESS:
+    var error_str: cstring
+    #discard cuGetErrorString(res, addr error_str)
+    check cuGetErrorString(status, cast[cstringArray](addr error_str))
+    echo "Link add PTX failed: ", error_str
+    echo "Error log: ", errorLog
+    quit(1)
+
+  # Add the device runtime (provides printf support)
+  ## NOTE: Linking requires yout to pass the path to `libcudadevrt.a` at CT
+  res = cuLinkAddFile(linkState, CU_JIT_INPUT_LIBRARY,
+                      "/usr/local/cuda/targets/x86_64-linux/lib/libcudadevrt.a",  # Adjust path as needed
+                      0, nil, nil)
+  if res != CUDA_SUCCESS:
+    var error_str: cstring
+    check cuGetErrorString(status, cast[cstringArray](addr error_str));
+    echo "Link add device runtime failed: ", error_str
+    echo "Error log: ", errorLog
+    quit(1)
+
+  # Complete linking
+  var cubn: pointer
+  var cubinSize: csize_t
+  res = cuLinkComplete(linkState, cubn.addr, cubinSize.addr)
+  nvrtc.cubinSize = cubinSize
+  if res != CUDA_SUCCESS:
+    var error_str: cstring
+    check cuGetErrorString(status, cast[cstringArray](addr error_str));
+    echo "Link complete failed: ", error_str
+    echo "Error log: ", errorLog
+    quit(1)
+
+  when DebugCuda:
+    echo "[INFO]: Writing CUBIN data to file /tmp/test.cubin"
+    echo "Cubin size: ", cubinSize
+    var f = open("/tmp/test.cubin", fmWrite)
+    discard f.writeBuffer(cubn, cubinSize)
+    f.close()
+
+  # Assign the cubin
+  nvrtc.cubin = cubn
+
+proc copyToSymbol*[T](nvrtc: NVRTC, symbol: string, data: T, offset = 0) =
+  ## Copies `data` to the symbol in the current CUDA kernel.
+  ## There is absolutely type safety involved here. We only check that the amount of
+  ## data you wish to copy to the global matches the size of the global storage.
+  ## This function does help you with automatically copying `seq[T]` for example.
+  ##
+  ## `offset` is an optional offset of the number of bytes at the target we want
+  ## to copy to. Useful to copy only individual elements of a constant array for example.
+  ##
+  ## Say you define in a kernel:
+  ##
+  ## ```nim
+  ## let foo = cuda:
+  ##   var data {.constant.}: array[1024, uint32]
+  ## # ...
+  ## # later in host code after getting the kernel from the `nvrtc` object:
+  ## let data = calcSomeArray1024() # runtime calculation
+  ## copyToSymbols("data", # name of the variable in CUDA code
+  ##               data)
+  ## ```
+  var devPtr: CUdeviceptr
+  var size: csize_t
+  check cuModuleGetGlobal(devPtr, size.addr, nvrtc.module, symbol.cstring)
+  var totSize: int
+  var srcPtr: pointer
+  when T is seq: # copy len * sizeof(element)
+    doAssert data.len > 0, "Input data is empty!"
+    let elSize = sizeof(data[0])
+    totSize = data.len * sizeof(elSize)
+    srcPtr = data[0].addr
+
+  else:
+    # For now just copy by `sizeof`!
+    totSize = sizeof(data)
+    srcPtr = data.addr
+  doAssert totSize.csize_t == size, "Input data size does not match size of global to copy to: " & $totSize & " vs. " & $size
+  check cuMemcpyHtoD(devPtr, srcPtr, csize_t(totSize))
+
+template execute*(nvrtc: var NVRTC, fn: string, res, inputs: typed, sharedMemSize: typed) =
+  ## Load the generated PTX, get the target kernel `fn` and execute it with the `res` and `inputs`
+
+  if not nvrtc.moduleLoaded:
+    nvrtc.load()
+
+  check cuModuleGetFunction(nvrtc.kernel, nvrtc.module, fn)
+
+  # now execute the kernel
+  execCuda(nvrtc.kernel, nvrtc.numBlocks, nvrtc.threadsPerBlock, res, inputs, sharedMemSize)
+
+  # synchronize so that e.g. `printf` statements will be printed before we (possibly) quit
+  check cuCtxSynchronize() #
+
+template execute*(nvrtc: var NVRTC, fn: string, res, inputs: typed) =
+  nvrtc.execute(fn, res, inputs, 0)
diff --git a/constantine/named/config_fields_and_curves.nim b/constantine/named/config_fields_and_curves.nim
index 330d4bf5d..77d9072b4 100644
--- a/constantine/named/config_fields_and_curves.nim
+++ b/constantine/named/config_fields_and_curves.nim
@@ -65,6 +65,11 @@ declareCurves:
     testingCurve: true
     bitwidth: 16
     modulus: "0xFFEF" # 65519 in hex
+
+  curve BabyBear: # BabyBear field used in SNARKs and zkVMs
+    bitwidth: 31
+    modulus: "0x78000001" # `0b1111000000000000000000000000001 = 15·2²⁷ + 1 = 2³¹ - 2²⁷ + 1`
+
   curve Mersenne61:
     testingCurve: true
     bitwidth: 61
diff --git a/constantine/platforms/abis/libpaths.nim b/constantine/platforms/abis/libpaths.nim
new file mode 100644
index 000000000..54ddff076
--- /dev/null
+++ b/constantine/platforms/abis/libpaths.nim
@@ -0,0 +1,166 @@
+# This file is taken from `nimcuda`.
+
+##[This module implements some auto-detection of cuda installation locations,
+   as well as communication with the c compilers about this info.
+
+   If you want to manually overide the autodetection, pass the nim compiler
+   `-d:CudaLib="PATH_TO_CUDA_DYN_LIBS"` and/or
+   `-d:CudaIncludes="PATH_TO_CUDA_HEADERS"`.
+]##
+
+#[The following is a rip of std/distros, slightly modified for compile-time
+  use.
+  The extra specificity compared to normal `defined` tests or `hostOS`
+  is needed because some linux distros install cuda in very different places
+  (im looking at you, arch!)
+]#
+
+
+from std/distros import Distribution
+import std/[os, strutils, macros, macrocache]
+when NimMajor == 2:
+  import std/envvars
+
+
+# we cache the result of the 'cmdRelease'
+# execution for faster platform detections.
+var
+  unameRes {.compileTime.}: string
+  osReleaseIDRes {.compileTime.}: string
+  releaseRes {.compileTime.}: string
+  hostnamectlRes {.compileTime.}: string
+
+template cmdRelease(cmd, cache): untyped =
+  if cache.len == 0:
+    # cache = (when defined(nimscript): gorge(cmd) else: execProcess(cmd))
+    cache = gorge(cmd)
+  cache
+
+template uname(): untyped = cmdRelease("uname -a", unameRes)
+template osReleaseID(): untyped =
+  cmdRelease("cat /etc/os-release | grep ^ID=", osReleaseIDRes)
+template release(): untyped = cmdRelease("lsb_release -d", releaseRes)
+template hostnamectl(): untyped = cmdRelease("hostnamectl", hostnamectlRes)
+
+proc detectOsWithAllCmd(d: Distribution): bool {.compileTime.} =
+  let dd = toLowerAscii($d)
+  result = dd in toLowerAscii(osReleaseID()) or dd in toLowerAscii(release()) or
+            dd in toLowerAscii(uname()) or ("operating system: " & dd) in
+                toLowerAscii(hostnamectl())
+
+proc detectOsImpl(d: Distribution): bool {.compileTime.} =
+  case d
+  of Distribution.Windows: result = defined(windows)
+  of Distribution.Posix: result = defined(posix)
+  of Distribution.MacOSX: result = defined(macosx)
+  of Distribution.Linux: result = defined(linux)
+  of Distribution.BSD: result = defined(bsd)
+  else:
+    when defined(bsd):
+      case d
+      of Distribution.FreeBSD, Distribution.NetBSD, Distribution.OpenBSD:
+        result = $d in uname()
+      else:
+        result = false
+    elif defined(linux):
+      const EasyLinux = when (NimMajor, NimMinor) >= (1, 6):
+          {Distribution.Elementary, Distribution.Ubuntu, Distribution.Debian,
+          Distribution.Fedora, Distribution.OpenMandriva, Distribution.CentOS,
+          Distribution.Alpine, Distribution.Mageia, Distribution.Zorin,
+          Distribution.Void}
+        else:
+          {Distribution.Elementary, Distribution.Ubuntu, Distribution.Debian,
+          Distribution.Fedora, Distribution.OpenMandriva, Distribution.CentOS,
+          Distribution.Alpine, Distribution.Mageia, Distribution.Zorin}
+
+      case d
+      of Distribution.Gentoo:
+        result = ("-" & $d & " ") in uname()
+      of EasyLinux:
+        result = toLowerAscii($d) in osReleaseID()
+      of Distribution.RedHat:
+        result = "rhel" in osReleaseID()
+      of Distribution.ArchLinux:
+        result = "arch" in osReleaseID()
+      # when (NimMajor, NimMinor) >= (1, 6):
+      #   of Distribution.Artix:
+      #     result = "artix" in osReleaseID()
+      of Distribution.NixOS:
+        # Check if this is a Nix build or NixOS environment
+        result = existsEnv("NIX_BUILD_TOP") or
+          existsEnv("__NIXOS_SET_ENVIRONMENT_DONE")
+      of Distribution.OpenSUSE:
+        result = "suse" in toLowerAscii(uname()) or
+          "suse" in toLowerAscii(release())
+      of Distribution.GoboLinux:
+        result = "-Gobo " in uname()
+      of Distribution.Solaris:
+        let uname = toLowerAscii(uname())
+        result = ("sun" in uname) or ("solaris" in uname)
+      of Distribution.Haiku:
+        result = defined(haiku)
+      else:
+        result = detectOsWithAllCmd(d)
+    else:
+      result = false
+
+template detectOs(d: untyped): bool =
+  ## Distro/OS detection. For convenience, the
+  ## required `Distribution.` qualifier is added to the
+  ## enum value.
+  detectOsImpl(Distribution.d)
+
+
+
+# begin actual detection
+when detectOs(Windows):
+  from std/os import getEnv, `/`
+  const
+    CudaPath = getEnv("CUDA_PATH")
+    CudaIncludes* {.strdefine.} = CudaPath / "include"
+    CudaLib* {.strdefine.} = CudaPath / "lib64"
+
+elif detectOs(ArchLinux):
+  from std/os import `/`
+  const
+    CudaPath = "/opt/cuda"
+    CudaIncludes* {.strdefine.} = CudaPath / "include"
+    CudaLib* {.strdefine.} = CudaPath / "lib64"
+
+elif detectOs(Linux):
+  # Generic linux catch-all.
+  # This includes anyone following the cuda installation guide.
+  const
+    CudaPath = "/usr/local/cuda"
+    CudaIncludes* {.strdefine.} = CudaPath / "include"
+    CudaLib* {.strdefine.} = CudaPath / "lib64"
+
+else:
+  # Some wild operating system!
+  const
+    CudaIncludes* {.strdefine.} = "unknown"
+    CudaLib* {.strdefine.} = "unknown"
+
+
+# check for validity
+when not dirExists(CudaIncludes):
+  {.error: "Could not find the cuda source headers! Please specify the " &
+     "location of the cuda includes directory by passing " &
+     "`-d:CudaIncludes=\"YOUR_PATH\"` to the nim compiler.".}
+elif not dirExists(CudaLib):
+  {.error: "Could not find the cuda shared libraries! Please specify the " &
+     "location of the cuda library directory by passing " &
+     "`-d:CudaLib=\"YOUR_PATH\"` to the nim compiler.".}
+
+
+
+macro tellCompilerToUseCuda*(): untyped =
+  ## Tells the compiler and linker to use cuda libraries.
+  # we'll use macrocaching so that we dont unneccessarily emit a million times
+
+  const ToldCompilerCount = CacheCounter"ToldCompilerToUseCudaCount"
+  if ToldCompilerCount.value == 0:
+    result = quote do:
+      {.passC: "-I" & CudaIncludes.}
+      {.passL: "-L" & CudaLib & " -lcuda".}
+    inc ToldCompilerCount
diff --git a/constantine/platforms/abis/nvidia_abi.nim b/constantine/platforms/abis/nvidia_abi.nim
index 02dd8808f..f10c31071 100644
--- a/constantine/platforms/abis/nvidia_abi.nim
+++ b/constantine/platforms/abis/nvidia_abi.nim
@@ -20,9 +20,9 @@ import ./c_abi
 #
 # ############################################################
 
-const libPath = "/opt/cuda/lib64/" # For now, only support Linux
-static: echo "[Constantine] Will search Cuda runtime in $LD_LIBRARY_PATH and " & libPath & "libcuda.so"
-const libCuda = "(libcuda.so|" & libPath & "libcuda.so)"
+import ./libpaths
+tellCompilerToUseCuda()
+const libCuda = "(libcuda.so|libcuda.so)"
 
 # Cuda offers 2 APIs:
 # - cuda.h               the driver API
@@ -483,7 +483,331 @@ type
   CUstream* = distinct pointer
   CUdeviceptr* = distinct pointer
 
-{.push noconv, importc, dynlib: libCuda.}
+  CUlinkState_st = object
+  CUlinkState* = ptr CUlinkState_st
+
+######################################################################
+################################ cuda.h ##############################
+######################################################################
+
+type                          ##
+    ##  Max number of registers that a thread may use.\n
+    ##  Option type: unsigned int\n
+    ##  Applies to: compiler only
+    ##
+  CUjit_option* {.size: sizeof(cint).} = enum
+    CU_JIT_MAX_REGISTERS = 0, ##
+                           ##  IN: Specifies minimum number of threads per block to target compilation
+                           ##  for\n
+                           ##  OUT: Returns the number of threads the compiler actually targeted.
+                           ##  This restricts the resource utilization of the compiler (e.g. max
+                           ##  registers) such that a block with the given number of threads should be
+                           ##  able to launch based on register limitations. Note, this option does not
+                           ##  currently take into account any other resource limitations, such as
+                           ##  shared memory utilization.\n
+                           ##  Cannot be combined with ::CU_JIT_TARGET.\n
+                           ##  Option type: unsigned int\n
+                           ##  Applies to: compiler only
+                           ##
+    CU_JIT_THREADS_PER_BLOCK = 1, ##
+                               ##  Overwrites the option value with the total wall clock time, in
+                               ##  milliseconds, spent in the compiler and linker\n
+                               ##  Option type: float\n
+                               ##  Applies to: compiler and linker
+                               ##
+    CU_JIT_WALL_TIME = 2, ##
+                       ##  Pointer to a buffer in which to print any log messages
+                       ##  that are informational in nature (the buffer size is specified via
+                       ##  option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
+                       ##  Option type: char *\n
+                       ##  Applies to: compiler and linker
+                       ##
+    CU_JIT_INFO_LOG_BUFFER = 3, ##
+                             ##  IN: Log buffer size in bytes.  Log messages will be capped at this size
+                             ##  (including null terminator)\n
+                             ##  OUT: Amount of log buffer filled with messages\n
+                             ##  Option type: unsigned int\n
+                             ##  Applies to: compiler and linker
+                             ##
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES = 4, ##
+                                        ##  Pointer to a buffer in which to print any log messages that
+                                        ##  reflect errors (the buffer size is specified via option
+                                        ##  ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
+                                        ##  Option type: char *\n
+                                        ##  Applies to: compiler and linker
+                                        ##
+    CU_JIT_ERROR_LOG_BUFFER = 5, ##
+                              ##  IN: Log buffer size in bytes.  Log messages will be capped at this size
+                              ##  (including null terminator)\n
+                              ##  OUT: Amount of log buffer filled with messages\n
+                              ##  Option type: unsigned int\n
+                              ##  Applies to: compiler and linker
+                              ##
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES = 6, ##
+                                         ##  Level of optimizations to apply to generated code (0 - 4), with 4
+                                         ##  being the default and highest level of optimizations.\n
+                                         ##  Option type: unsigned int\n
+                                         ##  Applies to: compiler only
+                                         ##
+    CU_JIT_OPTIMIZATION_LEVEL = 7, ##
+                                ##  No option value required. Determines the target based on the current
+                                ##  attached context (default)\n
+                                ##  Option type: No option value needed\n
+                                ##  Applies to: compiler and linker
+                                ##
+    CU_JIT_TARGET_FROM_CUCONTEXT = 8, ##
+                                   ##  Target is chosen based on supplied ::CUjit_target.  Cannot be
+                                   ##  combined with ::CU_JIT_THREADS_PER_BLOCK.\n
+                                   ##  Option type: unsigned int for enumerated type ::CUjit_target\n
+                                   ##  Applies to: compiler and linker
+                                   ##
+    CU_JIT_OPTION_TARGET = 9, ##
+                    ##  Specifies choice of fallback strategy if matching cubin is not found.
+                    ##  Choice is based on supplied ::CUjit_fallback.  This option cannot be
+                    ##  used with cuLink* APIs as the linker requires exact matches.\n
+                    ##  Option type: unsigned int for enumerated type ::CUjit_fallback\n
+                    ##  Applies to: compiler only
+                    ##
+    CU_JIT_FALLBACK_STRATEGY = 10, ##
+                                ##  Specifies whether to create debug information in output (-g)
+                                ##  (0: false, default)\n
+                                ##  Option type: int\n
+                                ##  Applies to: compiler and linker
+                                ##
+    CU_JIT_GENERATE_DEBUG_INFO = 11, ##
+                                  ##  Generate verbose log messages (0: false, default)\n
+                                  ##  Option type: int\n
+                                  ##  Applies to: compiler and linker
+                                  ##
+    CU_JIT_LOG_VERBOSE = 12, ##
+                          ##  Generate line number information (-lineinfo) (0: false, default)\n
+                          ##  Option type: int\n
+                          ##  Applies to: compiler only
+                          ##
+    CU_JIT_GENERATE_LINE_INFO = 13, ##
+                                 ##  Specifies whether to enable caching explicitly (-dlcm) \n
+                                 ##  Choice is based on supplied ::CUjit_cacheMode_enum.\n
+                                 ##  Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
+                                 ##  Applies to: compiler only
+                                 ##
+    CU_JIT_OPTION_CACHE_MODE = 14, ##
+                         ##  \deprecated
+                         ##  This jit option is deprecated and should not be used.
+                         ##
+    CU_JIT_NEW_SM3X_OPT = 15, ##
+                           ##  This jit option is used for internal purpose only.
+                           ##
+    CU_JIT_FAST_COMPILE = 16, ##
+                           ##  Array of device symbol names that will be relocated to the corresponding
+                           ##  host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
+                           ##  Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+                           ##  When loading a device module, driver will relocate all encountered
+                           ##  unresolved symbols to the host addresses.\n
+                           ##  It is only allowed to register symbols that correspond to unresolved
+                           ##  global variables.\n
+                           ##  It is illegal to register the same device symbol at multiple addresses.\n
+                           ##  Option type: const char **\n
+                           ##  Applies to: dynamic linker only
+                           ##
+    CU_JIT_GLOBAL_SYMBOL_NAMES = 17, ##
+                                  ##  Array of host addresses that will be used to relocate corresponding
+                                  ##  device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
+                                  ##  Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+                                  ##  Option type: void **\n
+                                  ##  Applies to: dynamic linker only
+                                  ##
+    CU_JIT_GLOBAL_SYMBOL_ADDRESSES = 18, ##
+                                      ##  Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
+                                      ##  ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
+                                      ##  Option type: unsigned int\n
+                                      ##  Applies to: dynamic linker only
+                                      ##
+    CU_JIT_GLOBAL_SYMBOL_COUNT = 19, ##
+                                  ##  \deprecated
+                                  ##  Enable link-time optimization (-dlto) for device code (Disabled by default).\n
+                                  ##  This option is not supported on 32-bit platforms.\n
+                                  ##  Option type: int\n
+                                  ##  Applies to: compiler and linker
+                                  ##
+                                  ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                                  ##
+    CU_JIT_LTO = 20, ##
+                  ##  \deprecated
+                  ##  Control single-precision denormals (-ftz) support (0: false, default).
+                  ##  1 : flushes denormal values to zero
+                  ##  0 : preserves denormal values
+                  ##  Option type: int\n
+                  ##  Applies to: link-time optimization specified with CU_JIT_LTO
+                  ##
+                  ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                  ##
+    CU_JIT_FTZ = 21, ##
+                  ##  \deprecated
+                  ##  Control single-precision floating-point division and reciprocals
+                  ##  (-prec-div) support (1: true, default).
+                  ##  1 : Enables the IEEE round-to-nearest mode
+                  ##  0 : Enables the fast approximation mode
+                  ##  Option type: int\n
+                  ##  Applies to: link-time optimization specified with CU_JIT_LTO
+                  ##
+                  ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                  ##
+    CU_JIT_PREC_DIV = 22, ##
+                       ##  \deprecated
+                       ##  Control single-precision floating-point square root
+                       ##  (-prec-sqrt) support (1: true, default).
+                       ##  1 : Enables the IEEE round-to-nearest mode
+                       ##  0 : Enables the fast approximation mode
+                       ##  Option type: int\n
+                       ##  Applies to: link-time optimization specified with CU_JIT_LTO
+                       ##
+                       ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                       ##
+    CU_JIT_PREC_SQRT = 23, ##
+                        ##  \deprecated
+                        ##  Enable/Disable the contraction of floating-point multiplies
+                        ##  and adds/subtracts into floating-point multiply-add (-fma)
+                        ##  operations (1: Enable, default; 0: Disable).
+                        ##  Option type: int\n
+                        ##  Applies to: link-time optimization specified with CU_JIT_LTO
+                        ##
+                        ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                        ##
+    CU_JIT_FMA = 24, ##
+                  ##  \deprecated
+                  ##  Array of kernel names that should be preserved at link time while others
+                  ##  can be removed.\n
+                  ##  Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n
+                  ##  Note that kernel names can be mangled by the compiler in which case the
+                  ##  mangled name needs to be specified.\n
+                  ##  Wildcard "*" can be used to represent zero or more characters instead of
+                  ##  specifying the full or mangled name.\n
+                  ##  It is important to note that the wildcard "*" is also added implicitly.
+                  ##  For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and
+                  ##  thus preserve all kernels with those names. This can be avoided by providing
+                  ##  a more specific name like "barfoobaz".\n
+                  ##  Option type: const char **\n
+                  ##  Applies to: dynamic linker only
+                  ##
+                  ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                  ##
+    CU_JIT_REFERENCED_KERNEL_NAMES = 25, ##
+                                      ##  \deprecated
+                                      ##  Number of entries in ::CU_JIT_REFERENCED_KERNEL_NAMES array.\n
+                                      ##  Option type: unsigned int\n
+                                      ##  Applies to: dynamic linker only
+                                      ##
+                                      ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                                      ##
+    CU_JIT_REFERENCED_KERNEL_COUNT = 26, ##
+                                      ##  \deprecated
+                                      ##  Array of variable names (__device__ and/or __constant__) that should be
+                                      ##  preserved at link time while others can be removed.\n
+                                      ##  Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n
+                                      ##  Note that variable names can be mangled by the compiler in which case the
+                                      ##  mangled name needs to be specified.\n
+                                      ##  Wildcard "*" can be used to represent zero or more characters instead of
+                                      ##  specifying the full or mangled name.\n
+                                      ##  It is important to note that the wildcard "*" is also added implicitly.
+                                      ##  For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and
+                                      ##  thus preserve all variables with those names. This can be avoided by providing
+                                      ##  a more specific name like "barfoobaz".\n
+                                      ##  Option type: const char **\n
+                                      ##  Applies to: link-time optimization specified with CU_JIT_LTO
+                                      ##
+                                      ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                                      ##
+    CU_JIT_REFERENCED_VARIABLE_NAMES = 27, ##
+                                        ##  \deprecated
+                                        ##  Number of entries in ::CU_JIT_REFERENCED_VARIABLE_NAMES array.\n
+                                        ##  Option type: unsigned int\n
+                                        ##  Applies to: link-time optimization specified with CU_JIT_LTO
+                                        ##
+                                        ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                                        ##
+    CU_JIT_REFERENCED_VARIABLE_COUNT = 28, ##
+                                        ##  \deprecated
+                                        ##  This option serves as a hint to enable the JIT compiler/linker
+                                        ##  to remove constant (__constant__) and device (__device__) variables
+                                        ##  unreferenced in device code (Disabled by default).\n
+                                        ##  Note that host references to constant and device variables using APIs like
+                                        ##  ::cuModuleGetGlobal() with this option specified may resultNotKeyWord in undefined behavior unless
+                                        ##  the variables are explicitly specified using ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n
+                                        ##  Option type: int\n
+                                        ##  Applies to: link-time optimization specified with CU_JIT_LTO
+                                        ##
+                                        ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                                        ##
+    CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES = 29, ##
+                                               ##  Generate position independent code (0: false)\n
+                                               ##  Option type: int\n
+                                               ##  Applies to: compiler only
+                                               ##
+    CU_JIT_POSITION_INDEPENDENT_CODE = 30, ##
+                                        ##  This option hints to the JIT compiler the minimum number of CTAs from the
+                                        ##  kernel’s grid to be mapped to a SM. This option is ignored when used together
+                                        ##  with ::CU_JIT_MAX_REGISTERS or ::CU_JIT_THREADS_PER_BLOCK.
+                                        ##  Optimizations based on this option need ::CU_JIT_MAX_THREADS_PER_BLOCK to
+                                        ##  be specified as well. For kernels already using PTX directive .minnctapersm,
+                                        ##  this option will be ignored by default. Use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES
+                                        ##  to let this option take precedence over the PTX directive.
+                                        ##  Option type: unsigned int\n
+                                        ##  Applies to: compiler only
+                                        ##
+    CU_JIT_MIN_CTA_PER_SM = 31, ##
+                             ##  Maximum number threads in a thread block, computed as the product of
+                             ##  the maximum extent specifed for each dimension of the block. This limit
+                             ##  is guaranteed not to be exeeded in any invocation of the kernel. Exceeding
+                             ##  the the maximum number of threads results in runtime error or kernel launch
+                             ##  failure. For kernels already using PTX directive .maxntid, this option will
+                             ##  be ignored by default. Use ::CU_JIT_OVERRIDE_DIRECTIVE_VALUES to let this
+                             ##  option take precedence over the PTX directive.
+                             ##  Option type: int\n
+                             ##  Applies to: compiler only
+                             ##
+    CU_JIT_MAX_THREADS_PER_BLOCK = 32, ##
+                                    ##  This option lets the values specified using ::CU_JIT_MAX_REGISTERS,
+                                    ##  ::CU_JIT_THREADS_PER_BLOCK, ::CU_JIT_MAX_THREADS_PER_BLOCK and
+                                    ##  ::CU_JIT_MIN_CTA_PER_SM take precedence over any PTX directives.
+                                    ##  (0: Disable, default; 1: Enable)
+                                    ##  Option type: int\n
+                                    ##  Applies to: compiler only
+                                    ##
+    CU_JIT_OVERRIDE_DIRECTIVE_VALUES = 33, CU_JIT_NUM_OPTIONS
+
+type                          ##
+    ##  Compiled device-class-specific device code\n
+    ##  Applicable options: none
+    ##
+  CUjitInputType* {.size: sizeof(cint).} = enum
+    CU_JIT_INPUT_CUBIN = 0,     ##
+                         ##  PTX source code\n
+                         ##  Applicable options: PTX compiler options
+                         ##
+    CU_JIT_INPUT_PTX = 1, ##
+                       ##  Bundle of multiple cubins and/or PTX of some device code\n
+                       ##  Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+                       ##
+    CU_JIT_INPUT_FATBINARY = 2, ##
+                             ##  Host object with embedded device code\n
+                             ##  Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+                             ##
+    CU_JIT_INPUT_OBJECT = 3, ##
+                          ##  Archive of host objects with embedded device code\n
+                          ##  Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+                          ##
+    CU_JIT_INPUT_LIBRARY = 4, ##
+                           ##  \deprecated
+                           ##  High-level intermediate code for link-time optimization\n
+                           ##  Applicable options: NVVM compiler options, PTX compiler options
+                           ##
+                           ##  Only valid with LTO-IR compiled with toolkits prior to CUDA 12.0
+                           ##
+    CU_JIT_INPUT_NVVM = 5, CU_JIT_NUM_INPUT_TYPES = 6
+
+
+
+{.push noconv, importc, dynlib: libCuda, header: "cuda.h".}
 
 proc cuInit*(flags: uint32): CUresult
 
@@ -495,12 +819,15 @@ proc cuDeviceGetAttribute*(r: var int32, attrib: CUdevice_attribute, dev: CUdevi
 proc cuCtxCreate*(pctx: var CUcontext, flags: uint32, dev: CUdevice): CUresult
 proc cuCtxDestroy*(ctx: CUcontext): CUresult
 proc cuCtxSynchronize*(ctx: CUcontext): CUresult
+proc cuCtxSynchronize*(): CUresult
 
-proc cuModuleLoadData(module: var CUmodule, sourceCode: ptr char): CUresult {.used.}
 proc cuModuleUnload*(module: CUmodule): CUresult
 proc cuModuleGetFunction(kernel: var CUfunction, module: CUmodule, fnName: ptr char): CUresult {.used.}
+proc cuModuleLoadData*(module: var CUmodule; image: pointer): CUresult
+proc cuModuleGetFunction*(hfunc: var CUfunction; hmod: CUmodule; name: cstring): CUresult
+proc cuModuleGetGlobal*(dptr: var CUdeviceptr, bytes: ptr csize_t, hmod: CUmodule, name: cstring): CUresult
 
-proc cuLaunchKernel(
+proc cuLaunchKernel*(
        kernel: CUfunction,
        gridDimX, gridDimY, gridDimZ: uint32,
        blockDimX, blockDimY, blockDimZ: uint32,
@@ -516,4 +843,1026 @@ proc cuMemFree*(devptr: CUdeviceptr): CUresult
 proc cuMemcpyHtoD*(dst: CUdeviceptr, src: pointer, size: csize_t): CUresult
 proc cuMemcpyDtoH*(dst: pointer, src: CUdeviceptr, size: csize_t): CUresult
 
-{.pop.} # {.push noconv, importc, dynlib: "libcuda.so".}
+proc cuDriverGetVersion*(driverVersion: ptr cint): CUresult
+
+proc cuLinkCreate*(numOptions: cuint; options: ptr CUjit_option;
+                   optionValues: ptr pointer; stateOut: ptr CUlinkState): CUresult
+proc cuLinkAddData*(state: CUlinkState; `type`: CUjitInputType; data: pointer;
+                   size: csize_t; name: cstring; numOptions: cuint;
+                   options: ptr CUjit_option; optionValues: ptr pointer): CUresult
+proc cuLinkComplete*(state: CUlinkState; cubinOut: ptr pointer; sizeOut: ptr csize_t): CUresult
+
+
+proc cuGetErrorString*(error: CUresult; pStr: cstringArray): CUresult
+
+proc cuLinkAddFile*(state: CUlinkState; `type`: CUjitInputType; path: cstring;
+                    numOptions: cuint; options: ptr CUjit_option;
+                    optionValues: ptr pointer): CUresult
+
+
+
+
+{.pop.} # {.push noconv, importc, dynlib: "libcuda.so", header: "cuda.h"..}
+
+
+######################################################################
+################################ nvrtc.h #############################
+######################################################################
+
+
+when defined(windows):
+  const
+    libNvrtc = "nvrtc64.dll"
+elif defined(macosx):
+  const
+    libNvrtc = "libnvrtc.dylib"
+else:
+  const
+    libNvrtc = "libnvrtc.so"
+
+
+type
+  nvrtcProgramObj {.noDecl, incompleteStruct.} = object
+  nvrtcProgram* = ptr nvrtcProgramObj
+
+  nvrtcResult* {.size: sizeof(cint).} = enum
+    NVRTC_SUCCESS = 0, NVRTC_ERROR_OUT_OF_MEMORY = 1,
+    NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2, NVRTC_ERROR_INVALID_INPUT = 3,
+    NVRTC_ERROR_INVALID_PROGRAM = 4, NVRTC_ERROR_INVALID_OPTION = 5,
+    NVRTC_ERROR_COMPILATION = 6, NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+    NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+    NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+    NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10, NVRTC_ERROR_INTERNAL_ERROR = 11,
+    NVRTC_ERROR_TIME_FILE_WRITE_FAILED = 12
+
+proc nvrtcCreateProgram*(prog: ptr nvrtcProgram; src: cstring; name: cstring;
+                        numHeaders: cint; headers: cstringArray;
+                        includeNames: cstringArray): nvrtcResult {.discardable, cdecl,
+    importc: "nvrtcCreateProgram", dynlib: libNvrtc.}
+
+proc nvrtcDestroyProgram*(prog: ptr nvrtcProgram): nvrtcResult {.discardable, cdecl,
+    importc: "nvrtcDestroyProgram", dynlib: libNvrtc.}
+
+proc nvrtcCompileProgram*(prog: nvrtcProgram; numOptions: cint; options: cstringArray): nvrtcResult {.discardable,
+    cdecl, importc: "nvrtcCompileProgram", dynlib: libNvrtc.}
+
+proc nvrtcGetPTXSize*(prog: nvrtcProgram; ptxSizeRet: ptr csize_t): nvrtcResult {.discardable,
+    cdecl, importc: "nvrtcGetPTXSize", dynlib: libNvrtc.}
+
+proc nvrtcGetPTX*(prog: nvrtcProgram; ptx: cstring): nvrtcResult {.discardable, cdecl,
+    importc: "nvrtcGetPTX", dynlib: libNvrtc.}
+
+proc nvrtcGetProgramLogSize*(prog: nvrtcProgram; logSizeRet: ptr csize_t): nvrtcResult {.discardable,
+    cdecl, importc: "nvrtcGetProgramLogSize", dynlib: libNvrtc.}
+
+proc nvrtcGetProgramLog*(prog: nvrtcProgram; log: cstring): nvrtcResult {.discardable, cdecl,
+    importc: "nvrtcGetProgramLog", dynlib: libNvrtc.}
+
+
+######################################################################
+################################ libcudart #############################
+######################################################################
+
+when defined(windows):
+  const
+    libCudaRT = "cudart.dll"
+elif defined(macosx):
+  const
+    libCudaRT = "libcudart.dylib"
+else:
+  const
+    libCudaRT = "libcudart.so"
+
+type
+  cudaError* = enum ##
+                 ##  The API call returned with no errors. In the case of query calls, this
+                 ##  also means that the operation being queried is complete (see
+                 ##  ::cudaEventQuery() and ::cudaStreamQuery()).
+                 ##
+    cudaSuccess = 0, ##
+                  ##  This indicates that one or more of the parameters passed to the API call
+                  ##  is not within an acceptable range of values.
+                  ##
+    cudaErrorInvalidValue = 1, ##
+                            ##  The API call failed because it was unable to allocate enough memory or
+                            ##  other resources to perform the requested operation.
+                            ##
+    cudaErrorMemoryAllocation = 2, ##
+                                ##  The API call failed because the CUDA driver and runtime could not be
+                                ##  initialized.
+                                ##
+    cudaErrorInitializationError = 3, ##
+                                   ##  This indicates that a CUDA Runtime API call cannot be executed because
+                                   ##  it is being called during process shut down, at a point in time after
+                                   ##  CUDA driver has been unloaded.
+                                   ##
+    cudaErrorCudartUnloading = 4, ##
+                               ##  This indicates profiler is not initialized for this run. This can
+                               ##  happen when the application is running with external profiling tools
+                               ##  like visual profiler.
+                               ##
+    cudaErrorProfilerDisabled = 5, ##
+                                ##  \deprecated
+                                ##  This error return is deprecated as of CUDA 5.0. It is no longer an error
+                                ##  to attempt to enable/disable the profiling via ::cudaProfilerStart or
+                                ##  ::cudaProfilerStop without initialization.
+                                ##
+    cudaErrorProfilerNotInitialized = 6, ##
+                                      ##  \deprecated
+                                      ##  This error return is deprecated as of CUDA 5.0. It is no longer an error
+                                      ##  to call cudaProfilerStart() when profiling is already enabled.
+                                      ##
+    cudaErrorProfilerAlreadyStarted = 7, ##
+                                      ##  \deprecated
+                                      ##  This error return is deprecated as of CUDA 5.0. It is no longer an error
+                                      ##  to call cudaProfilerStop() when profiling is already disabled.
+                                      ##
+    cudaErrorProfilerAlreadyStopped = 8, ##
+                                      ##  This indicates that a kernel launch is requesting resources that can
+                                      ##  never be satisfied by the current device. Requesting more shared memory
+                                      ##  per block than the device supports will trigger this error, as will
+                                      ##  requesting too many threads or blocks. See ::cudaDeviceProp for more
+                                      ##  device limitations.
+                                      ##
+    cudaErrorInvalidConfiguration = 9, ##
+                                    ##  This indicates that one or more of the pitch-related parameters passed
+                                    ##  to the API call is not within the acceptable range for pitch.
+                                    ##
+    cudaErrorInvalidPitchValue = 12, ##
+                                  ##  This indicates that the symbol name/identifier passed to the API call
+                                  ##  is not a valid name or identifier.
+                                  ##
+    cudaErrorInvalidSymbol = 13, ##
+                              ##  This indicates that at least one host pointer passed to the API call is
+                              ##  not a valid host pointer.
+                              ##  \deprecated
+                              ##  This error return is deprecated as of CUDA 10.1.
+                              ##
+    cudaErrorInvalidHostPointer = 16, ##
+                                   ##  This indicates that at least one device pointer passed to the API call is
+                                   ##  not a valid device pointer.
+                                   ##  \deprecated
+                                   ##  This error return is deprecated as of CUDA 10.1.
+                                   ##
+    cudaErrorInvalidDevicePointer = 17, ##
+                                     ##  This indicates that the texture passed to the API call is not a valid
+                                     ##  texture.
+                                     ##
+    cudaErrorInvalidTexture = 18, ##
+                               ##  This indicates that the texture binding is not valid. This occurs if you
+                               ##  call ::cudaGetTextureAlignmentOffset() with an unbound texture.
+                               ##
+    cudaErrorInvalidTextureBinding = 19, ##
+                                      ##  This indicates that the channel descriptor passed to the API call is not
+                                      ##  valid. This occurs if the format is not one of the formats specified by
+                                      ##  ::cudaChannelFormatKind, or if one of the dimensions is invalid.
+                                      ##
+    cudaErrorInvalidChannelDescriptor = 20, ##
+                                         ##  This indicates that the direction of the copyMem passed to the API call is
+                                         ##  not one of the types specified by ::cudaMemcpyKind.
+                                         ##
+    cudaErrorInvalidMemcpyDirection = 21, ##
+                                       ##  This indicated that the user has taken the address of a constant variable,
+                                       ##  which was forbidden up until the CUDA 3.1 release.
+                                       ##  \deprecated
+                                       ##  This error return is deprecated as of CUDA 3.1. Variables in constant
+                                       ##  memory may now have their address taken by the runtime via
+                                       ##  ::cudaGetSymbolAddress().
+                                       ##
+    cudaErrorAddressOfConstant = 22, ##
+                                  ##  This indicated that a texture fetch was not able to be performed.
+                                  ##  This was previously used for device emulation of texture operations.
+                                  ##  \deprecated
+                                  ##  This error return is deprecated as of CUDA 3.1. Device emulation mode was
+                                  ##  removed with the CUDA 3.1 release.
+                                  ##
+    cudaErrorTextureFetchFailed = 23, ##
+                                   ##  This indicated that a texture was not bound for access.
+                                   ##  This was previously used for device emulation of texture operations.
+                                   ##  \deprecated
+                                   ##  This error return is deprecated as of CUDA 3.1. Device emulation mode was
+                                   ##  removed with the CUDA 3.1 release.
+                                   ##
+    cudaErrorTextureNotBound = 24, ##
+                                ##  This indicated that a synchronization operation had failed.
+                                ##  This was previously used for some device emulation functions.
+                                ##  \deprecated
+                                ##  This error return is deprecated as of CUDA 3.1. Device emulation mode was
+                                ##  removed with the CUDA 3.1 release.
+                                ##
+    cudaErrorSynchronizationError = 25, ##
+                                     ##  This indicates that a non-float texture was being accessed with linear
+                                     ##  filtering. This is not supported by CUDA.
+                                     ##
+    cudaErrorInvalidFilterSetting = 26, ##
+                                     ##  This indicates that an attempt was made to read a non-float texture as a
+                                     ##  normalized float. This is not supported by CUDA.
+                                     ##
+    cudaErrorInvalidNormSetting = 27, ##
+                                   ##  Mixing of device and device emulation code was not allowed.
+                                   ##  \deprecated
+                                   ##  This error return is deprecated as of CUDA 3.1. Device emulation mode was
+                                   ##  removed with the CUDA 3.1 release.
+                                   ##
+    cudaErrorMixedDeviceExecution = 28, ##
+                                     ##  This indicates that the API call is not yet implemented. Production
+                                     ##  releases of CUDA will never return this error.
+                                     ##  \deprecated
+                                     ##  This error return is deprecated as of CUDA 4.1.
+                                     ##
+    cudaErrorNotYetImplemented = 31, ##
+                                  ##  This indicated that an emulated device pointer exceeded the 32-bit address
+                                  ##  range.
+                                  ##  \deprecated
+                                  ##  This error return is deprecated as of CUDA 3.1. Device emulation mode was
+                                  ##  removed with the CUDA 3.1 release.
+                                  ##
+    cudaErrorMemoryValueTooLarge = 32, ##
+                                    ##  This indicates that the CUDA driver that the application has loaded is a
+                                    ##  stub library. Applications that run with the stub rather than a real
+                                    ##  driver loaded will resultNotKeyWord in CUDA API returning this error.
+                                    ##
+    cudaErrorStubLibrary = 34, ##
+                            ##  This indicates that the installed NVIDIA CUDA driver is older than the
+                            ##  CUDA runtime library. This is not a supported configuration. Users should
+                            ##  install an updated NVIDIA display driver to allow the application to run.
+                            ##
+    cudaErrorInsufficientDriver = 35, ##
+                                   ##  This indicates that the API call requires a newer CUDA driver than the one
+                                   ##  currently installed. Users should install an updated NVIDIA CUDA driver
+                                   ##  to allow the API call to succeed.
+                                   ##
+    cudaErrorCallRequiresNewerDriver = 36, ##
+                                        ##  This indicates that the surface passed to the API call is not a valid
+                                        ##  surface.
+                                        ##
+    cudaErrorInvalidSurface = 37, ##
+                               ##  This indicates that multiple global or constant variables (across separate
+                               ##  CUDA source files in the application) share the same string name.
+                               ##
+    cudaErrorDuplicateVariableName = 43, ##
+                                      ##  This indicates that multiple textures (across separate CUDA source
+                                      ##  files in the application) share the same string name.
+                                      ##
+    cudaErrorDuplicateTextureName = 44, ##
+                                     ##  This indicates that multiple surfaces (across separate CUDA source
+                                     ##  files in the application) share the same string name.
+                                     ##
+    cudaErrorDuplicateSurfaceName = 45, ##
+                                     ##  This indicates that all CUDA devices are busy or unavailable at the current
+                                     ##  time. Devices are often busy/unavailable due to use of
+                                     ##  ::cudaComputeModeProhibited, ::cudaComputeModeExclusiveProcess, or when long
+                                     ##  running CUDA kernels have filled up the GPU and are blocking new work
+                                     ##  from starting. They can also be unavailable due to memory constraints
+                                     ##  on a device that already has active CUDA work being performed.
+                                     ##
+    cudaErrorDevicesUnavailable = 46, ##
+                                   ##  This indicates that the current context is not compatible with this
+                                   ##  the CUDA Runtime. This can only occur if you are using CUDA
+                                   ##  Runtime/Driver interoperability and have created an existing Driver
+                                   ##  context using the driver API. The Driver context may be incompatible
+                                   ##  either because the Driver context was created using an older version
+                                   ##  of the API, because the Runtime API call expects a primary driver
+                                   ##  context and the Driver context is not primary, or because the Driver
+                                   ##  context has been destroyed. Please see \ref CUDART_DRIVER "Interactions
+                                   ##  with the CUDA Driver API" for more information.
+                                   ##
+    cudaErrorIncompatibleDriverContext = 49, ##
+                                          ##  The device function being invoked (usually via ::cudaLaunchKernel()) was not
+                                          ##  previously configured via the ::cudaConfigureCall() function.
+                                          ##
+    cudaErrorMissingConfiguration = 52, ##
+                                     ##  This indicated that a previous kernel launch failed. This was previously
+                                     ##  used for device emulation of kernel launches.
+                                     ##  \deprecated
+                                     ##  This error return is deprecated as of CUDA 3.1. Device emulation mode was
+                                     ##  removed with the CUDA 3.1 release.
+                                     ##
+    cudaErrorPriorLaunchFailure = 53, ##
+                                   ##  This error indicates that a device runtime grid launch did not occur
+                                   ##  because the depth of the child grid would exceed the maximum supported
+                                   ##  number of nested grid launches.
+                                   ##
+    cudaErrorLaunchMaxDepthExceeded = 65, ##
+                                       ##  This error indicates that a grid launch did not occur because the kernel
+                                       ##  uses file-scoped textures which are unsupported by the device runtime.
+                                       ##  Kernels launched via the device runtime only support textures created with
+                                       ##  the Texture Object API's.
+                                       ##
+    cudaErrorLaunchFileScopedTex = 66, ##
+                                    ##  This error indicates that a grid launch did not occur because the kernel
+                                    ##  uses file-scoped surfaces which are unsupported by the device runtime.
+                                    ##  Kernels launched via the device runtime only support surfaces created with
+                                    ##  the Surface Object API's.
+                                    ##
+    cudaErrorLaunchFileScopedSurf = 67, ##
+                                     ##  This error indicates that a call to ::cudaDeviceSynchronize made from
+                                     ##  the device runtime failed because the call was made at grid depth greater
+                                     ##  than than either the default (2 levels of grids) or user specified device
+                                     ##  limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on
+                                     ##  launched grids at a greater depth successfully, the maximum nested
+                                     ##  depth at which ::cudaDeviceSynchronize will be called must be specified
+                                     ##  with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit
+                                     ##  api before the host-side launch of a kernel using the device runtime.
+                                     ##  Keep in mind that additional levels of sync depth require the runtime
+                                     ##  to reserve large amounts of device memory that cannot be used for
+                                     ##  user allocations. Note that ::cudaDeviceSynchronize made from device
+                                     ##  runtime is only supported on devices of compute capability < 9.0.
+                                     ##
+    cudaErrorSyncDepthExceeded = 68, ##
+                                  ##  This error indicates that a device runtime grid launch failed because
+                                  ##  the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount.
+                                  ##  For this launch to proceed successfully, ::cudaDeviceSetLimit must be
+                                  ##  called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher
+                                  ##  than the upper bound of outstanding launches that can be issued to the
+                                  ##  device runtime. Keep in mind that raising the limit of pending device
+                                  ##  runtime launches will require the runtime to reserve device memory that
+                                  ##  cannot be used for user allocations.
+                                  ##
+    cudaErrorLaunchPendingCountExceeded = 69, ##
+                                           ##  The requested device function does not exist or is not compiled for the
+                                           ##  proper device architecture.
+                                           ##
+    cudaErrorInvalidDeviceFunction = 98, ##
+                                      ##  This indicates that no CUDA-capable devices were detected by the installed
+                                      ##  CUDA driver.
+                                      ##
+    cudaErrorNoDevice = 100, ##
+                          ##  This indicates that the device ordinal supplied by the user does not
+                          ##  correspond to a valid CUDA device or that the action requested is
+                          ##  invalid for the specified device.
+                          ##
+    cudaErrorInvalidDevice = 101, ##
+                               ##  This indicates that the device doesn't have a valid Grid License.
+                               ##
+    cudaErrorDeviceNotLicensed = 102, ##
+                                   ##  By default, the CUDA runtime may perform a minimal set of self-tests,
+                                   ##  as well as CUDA driver tests, to establish the validity of both.
+                                   ##  Introduced in CUDA 11.2, this error return indicates that at least one
+                                   ##  of these tests has failed and the validity of either the runtime
+                                   ##  or the driver could not be established.
+                                   ##
+    cudaErrorSoftwareValidityNotEstablished = 103, ##
+                                                ##  This indicates an internal startup failure in the CUDA runtime.
+                                                ##
+    cudaErrorStartupFailure = 127, ##
+                                ##  This indicates that the device kernel image is invalid.
+                                ##
+    cudaErrorInvalidKernelImage = 200, ##
+                                    ##  This most frequently indicates that there is no context bound to the
+                                    ##  current thread. This can also be returned if the context passed to an
+                                    ##  API call is not a valid handle (such as a context that has had
+                                    ##  ::cuCtxDestroy() invoked on it). This can also be returned if a user
+                                    ##  mixes different API versions (i.e. 3010 context with 3020 API calls).
+                                    ##  See ::cuCtxGetApiVersion() for more details.
+                                    ##
+    cudaErrorDeviceUninitialized = 201, ##
+                                     ##  This indicates that the buffer object could not be mapped.
+                                     ##
+    cudaErrorMapBufferObjectFailed = 205, ##
+                                       ##  This indicates that the buffer object could not be unmapped.
+                                       ##
+    cudaErrorUnmapBufferObjectFailed = 206, ##
+                                         ##  This indicates that the specified array is currently mapped and thus
+                                         ##  cannot be destroyed.
+                                         ##
+    cudaErrorArrayIsMapped = 207, ##
+                               ##  This indicates that the resource is already mapped.
+                               ##
+    cudaErrorAlreadyMapped = 208, ##
+                               ##  This indicates that there is no kernel image available that is suitable
+                               ##  for the device. This can occur when a user specifies code generation
+                               ##  options for a particular CUDA source file that do not include the
+                               ##  corresponding device configuration.
+                               ##
+    cudaErrorNoKernelImageForDevice = 209, ##
+                                        ##  This indicates that a resource has already been acquired.
+                                        ##
+    cudaErrorAlreadyAcquired = 210, ##
+                                 ##  This indicates that a resource is not mapped.
+                                 ##
+    cudaErrorNotMapped = 211, ##
+                           ##  This indicates that a mapped resource is not available for access as an
+                           ##  array.
+                           ##
+    cudaErrorNotMappedAsArray = 212, ##
+                                  ##  This indicates that a mapped resource is not available for access as a
+                                  ##  pointer.
+                                  ##
+    cudaErrorNotMappedAsPointer = 213, ##
+                                    ##  This indicates that an uncorrectable ECC error was detected during
+                                    ##  execution.
+                                    ##
+    cudaErrorECCUncorrectable = 214, ##
+                                  ##  This indicates that the ::cudaLimit passed to the API call is not
+                                  ##  supported by the active device.
+                                  ##
+    cudaErrorUnsupportedLimit = 215, ##
+                                  ##  This indicates that a call tried to access an exclusive-thread device that
+                                  ##  is already in use by a different thread.
+                                  ##
+    cudaErrorDeviceAlreadyInUse = 216, ##
+                                    ##  This error indicates that P2P access is not supported across the given
+                                    ##  devices.
+                                    ##
+    cudaErrorPeerAccessUnsupported = 217, ##
+                                       ##  A PTX compilation failed. The runtime may fall back to compiling PTX if
+                                       ##  an application does not contain a suitable binary for the current device.
+                                       ##
+    cudaErrorInvalidPtx = 218, ##
+                            ##  This indicates an error with the OpenGL or DirectX context.
+                            ##
+    cudaErrorInvalidGraphicsContext = 219, ##
+                                        ##  This indicates that an uncorrectable NVLink error was detected during the
+                                        ##  execution.
+                                        ##
+    cudaErrorNvlinkUncorrectable = 220, ##
+                                     ##  This indicates that the PTX JIT compiler library was not found. The JIT Compiler
+                                     ##  library is used for PTX compilation. The runtime may fall back to compiling PTX
+                                     ##  if an application does not contain a suitable binary for the current device.
+                                     ##
+    cudaErrorJitCompilerNotFound = 221, ##
+                                     ##  This indicates that the provided PTX was compiled with an unsupported toolchain.
+                                     ##  The most common reason for this, is the PTX was generated by a compiler newer
+                                     ##  than what is supported by the CUDA driver and PTX JIT compiler.
+                                     ##
+    cudaErrorUnsupportedPtxVersion = 222, ##
+                                       ##  This indicates that the JIT compilation was disabled. The JIT compilation compiles
+                                       ##  PTX. The runtime may fall back to compiling PTX if an application does not contain
+                                       ##  a suitable binary for the current device.
+                                       ##
+    cudaErrorJitCompilationDisabled = 223, ##
+                                        ##  This indicates that the provided execution affinity is not supported by the device.
+                                        ##
+    cudaErrorUnsupportedExecAffinity = 224, ##
+                                         ##  This indicates that the code to be compiled by the PTX JIT contains
+                                         ##  unsupported call to cudaDeviceSynchronize.
+                                         ##
+    cudaErrorUnsupportedDevSideSync = 225, ##
+                                        ##  This indicates that the device kernel source is invalid.
+                                        ##
+    cudaErrorInvalidSource = 300, ##
+                               ##  This indicates that the file specified was not found.
+                               ##
+    cudaErrorFileNotFound = 301, ##
+                              ##  This indicates that a link to a shared object failed to resolve.
+                              ##
+    cudaErrorSharedObjectSymbolNotFound = 302, ##
+                                            ##  This indicates that initialization of a shared object failed.
+                                            ##
+    cudaErrorSharedObjectInitFailed = 303, ##
+                                        ##  This error indicates that an OS call failed.
+                                        ##
+    cudaErrorOperatingSystem = 304, ##
+                                 ##  This indicates that a resource handle passed to the API call was not
+                                 ##  valid. Resource handles are opaque types like ::cudaStream_t and
+                                 ##  ::cudaEvent_t.
+                                 ##
+    cudaErrorInvalidResourceHandle = 400, ##
+                                       ##  This indicates that a resource required by the API call is not in a
+                                       ##  valid state to perform the requested operation.
+                                       ##
+    cudaErrorIllegalState = 401, ##
+                              ##  This indicates an attempt was made to introspect an object in a way that
+                              ##  would discard semantically important information. This is either due to
+                              ##  the object using funtionality newer than the API version used to
+                              ##  introspect it or omission of optional return arguments.
+                              ##
+    cudaErrorLossyQuery = 402, ##
+                            ##  This indicates that a named symbol was not found. Examples of symbols
+                            ##  are global/constant variable names, driver function names, texture names,
+                            ##  and surface names.
+                            ##
+    cudaErrorSymbolNotFound = 500, ##
+                                ##  This indicates that asynchronous operations issued previously have not
+                                ##  completed yet. This resultNotKeyWord is not actually an error, but must be indicated
+                                ##  differently than ::cudaSuccess (which indicates completion). Calls that
+                                ##  may return this value include ::cudaEventQuery() and ::cudaStreamQuery().
+                                ##
+    cudaErrorNotReady = 600, ##
+                          ##  The device encountered a load or store instruction on an invalid memory address.
+                          ##  This leaves the process in an inconsistent state and any further CUDA work
+                          ##  will return the same error. To continue using CUDA, the process must be terminated
+                          ##  and relaunched.
+                          ##
+    cudaErrorIllegalAddress = 700, ##
+                                ##  This indicates that a launch did not occur because it did not have
+                                ##  appropriate resources. Although this error is similar to
+                                ##  ::cudaErrorInvalidConfiguration, this error usually indicates that the
+                                ##  user has attempted to pass too many arguments to the device kernel, or the
+                                ##  kernel launch specifies too many threads for the kernel's register count.
+                                ##
+    cudaErrorLaunchOutOfResources = 701, ##
+                                      ##  This indicates that the device kernel took too long to execute. This can
+                                      ##  only occur if timeouts are enabled - see the device property
+                                      ##  \ref
+                                      ## ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
+                                      ##  for more information.
+                                      ##  This leaves the process in an inconsistent state and any further CUDA work
+                                      ##  will return the same error. To continue using CUDA, the process must be terminated
+                                      ##  and relaunched.
+                                      ##
+    cudaErrorLaunchTimeout = 702, ##
+                               ##  This error indicates a kernel launch that uses an incompatible texturing
+                               ##  mode.
+                               ##
+    cudaErrorLaunchIncompatibleTexturing = 703, ##
+                                             ##  This error indicates that a call to ::cudaDeviceEnablePeerAccess() is
+                                             ##  trying to re-enable peer addressing on from a context which has already
+                                             ##  had peer addressing enabled.
+                                             ##
+    cudaErrorPeerAccessAlreadyEnabled = 704, ##
+                                          ##  This error indicates that ::cudaDeviceDisablePeerAccess() is trying to
+                                          ##  disable peer addressing which has not been enabled yet via
+                                          ##  ::cudaDeviceEnablePeerAccess().
+                                          ##
+    cudaErrorPeerAccessNotEnabled = 705, ##
+                                      ##  This indicates that the user has called ::cudaSetValidDevices(),
+                                      ##  ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(),
+                                      ##  ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or
+                                      ##  ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by
+                                      ##  calling non-device management operations (allocating memory and
+                                      ##  launching kernels are examples of non-device management operations).
+                                      ##  This error can also be returned if using runtime/driver
+                                      ##  interoperability and there is an existing ::CUcontext active on the
+                                      ##  host thread.
+                                      ##
+    cudaErrorSetOnActiveProcess = 708, ##
+                                    ##  This error indicates that the context current to the calling thread
+                                    ##  has been destroyed using ::cuCtxDestroy, or is a primary context which
+                                    ##  has not yet been initialized.
+                                    ##
+    cudaErrorContextIsDestroyed = 709, ##
+                                    ##  An assert triggered in device code during kernel execution. The device
+                                    ##  cannot be used again. All existing allocations are invalid. To continue
+                                    ##  using CUDA, the process must be terminated and relaunched.
+                                    ##
+    cudaErrorAssert = 710, ##
+                        ##  This error indicates that the hardware resources required to enable
+                        ##  peer access have been exhausted for one or more of the devices
+                        ##  passed to ::cudaEnablePeerAccess().
+                        ##
+    cudaErrorTooManyPeers = 711, ##
+                              ##  This error indicates that the memory range passed to ::cudaHostRegister()
+                              ##  has already been registered.
+                              ##
+    cudaErrorHostMemoryAlreadyRegistered = 712, ##
+                                             ##  This error indicates that the pointer passed to ::cudaHostUnregister()
+                                             ##  does not correspond to any currently registered memory region.
+                                             ##
+    cudaErrorHostMemoryNotRegistered = 713, ##
+                                         ##  Device encountered an error in the call stack during kernel execution,
+                                         ##  possibly due to stack corruption or exceeding the stack size limit.
+                                         ##  This leaves the process in an inconsistent state and any further CUDA work
+                                         ##  will return the same error. To continue using CUDA, the process must be terminated
+                                         ##  and relaunched.
+                                         ##
+    cudaErrorHardwareStackError = 714, ##
+                                    ##  The device encountered an illegal instruction during kernel execution
+                                    ##  This leaves the process in an inconsistent state and any further CUDA work
+                                    ##  will return the same error. To continue using CUDA, the process must be terminated
+                                    ##  and relaunched.
+                                    ##
+    cudaErrorIllegalInstruction = 715, ##
+                                    ##  The device encountered a load or store instruction
+                                    ##  on a memory address which is not aligned.
+                                    ##  This leaves the process in an inconsistent state and any further CUDA work
+                                    ##  will return the same error. To continue using CUDA, the process must be terminated
+                                    ##  and relaunched.
+                                    ##
+    cudaErrorMisalignedAddress = 716, ##
+                                   ##  While executing a kernel, the device encountered an instruction
+                                   ##  which can only operate on memory locations in certain address spaces
+                                   ##  (global, shared, or local), but was supplied a memory address not
+                                   ##  belonging to an allowed address space.
+                                   ##  This leaves the process in an inconsistent state and any further CUDA work
+                                   ##  will return the same error. To continue using CUDA, the process must be terminated
+                                   ##  and relaunched.
+                                   ##
+    cudaErrorInvalidAddressSpace = 717, ##
+                                     ##  The device encountered an invalid program counter.
+                                     ##  This leaves the process in an inconsistent state and any further CUDA work
+                                     ##  will return the same error. To continue using CUDA, the process must be terminated
+                                     ##  and relaunched.
+                                     ##
+    cudaErrorInvalidPc = 718, ##
+                           ##  An exception occurred on the device while executing a kernel. Common
+                           ##  causes include dereferencing an invalid device pointer and accessing
+                           ##  out of bounds shared memory. Less common cases can be system specific - more
+                           ##  information about these cases can be found in the system specific user guide.
+                           ##  This leaves the process in an inconsistent state and any further CUDA work
+                           ##  will return the same error. To continue using CUDA, the process must be terminated
+                           ##  and relaunched.
+                           ##
+    cudaErrorLaunchFailure = 719, ##
+                               ##  This error indicates that the number of blocks launched per grid for a kernel that was
+                               ##  launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice
+                               ##  exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+                               ##  or
+                               ## ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+                               ##  as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+                               ##
+    cudaErrorCooperativeLaunchTooLarge = 720, ##
+                                           ##  This error indicates the attempted operation is not permitted.
+                                           ##
+    cudaErrorNotPermitted = 800, ##
+                              ##  This error indicates the attempted operation is not supported
+                              ##  on the current system or device.
+                              ##
+    cudaErrorNotSupported = 801, ##
+                              ##  This error indicates that the system is not yet ready to start any CUDA
+                              ##  work.  To continue using CUDA, verify the system configuration is in a
+                              ##  valid state and all required driver daemons are actively running.
+                              ##  More information about this error can be found in the system specific
+                              ##  user guide.
+                              ##
+    cudaErrorSystemNotReady = 802, ##
+                                ##  This error indicates that there is a mismatch between the versions of
+                                ##  the display driver and the CUDA driver. Refer to the compatibility documentation
+                                ##  for supported versions.
+                                ##
+    cudaErrorSystemDriverMismatch = 803, ##
+                                      ##  This error indicates that the system was upgraded to run with forward compatibility
+                                      ##  but the visible hardware detected by CUDA does not support this configuration.
+                                      ##  Refer to the compatibility documentation for the supported hardware matrix or ensure
+                                      ##  that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+                                      ##  environment variable.
+                                      ##
+    cudaErrorCompatNotSupportedOnDevice = 804, ##
+                                            ##  This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+                                            ##
+    cudaErrorMpsConnectionFailed = 805, ##
+                                     ##  This error indicates that the remote procedural call between the MPS server and the MPS client failed.
+                                     ##
+    cudaErrorMpsRpcFailure = 806, ##
+                               ##  This error indicates that the MPS server is not ready to accept new MPS client requests.
+                               ##  This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+                               ##
+    cudaErrorMpsServerNotReady = 807, ##
+                                   ##  This error indicates that the hardware resources required to create MPS client have been exhausted.
+                                   ##
+    cudaErrorMpsMaxClientsReached = 808, ##
+                                      ##  This error indicates the the hardware resources required to device connections have been exhausted.
+                                      ##
+    cudaErrorMpsMaxConnectionsReached = 809, ##
+                                          ##  This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
+                                          ##
+    cudaErrorMpsClientTerminated = 810, ##
+                                     ##  This error indicates, that the program is using CUDA Dynamic Parallelism, but the current configuration, like MPS, does not support it.
+                                     ##
+    cudaErrorCdpNotSupported = 811, ##
+                                 ##  This error indicates, that the program contains an unsupported interaction between different versions of CUDA Dynamic Parallelism.
+                                 ##
+    cudaErrorCdpVersionMismatch = 812, ##
+                                    ##  The operation is not permitted when the stream is capturing.
+                                    ##
+    cudaErrorStreamCaptureUnsupported = 900, ##
+                                          ##  The current capture sequence on the stream has been invalidated due to
+                                          ##  a previous error.
+                                          ##
+    cudaErrorStreamCaptureInvalidated = 901, ##
+                                          ##  The operation would have resulted in a merge of two independent capture
+                                          ##  sequences.
+                                          ##
+    cudaErrorStreamCaptureMerge = 902, ##
+                                    ##  The capture was not initiated in this stream.
+                                    ##
+    cudaErrorStreamCaptureUnmatched = 903, ##
+                                        ##  The capture sequence contains a fork that was not joined to the primary
+                                        ##  stream.
+                                        ##
+    cudaErrorStreamCaptureUnjoined = 904, ##
+                                       ##  A dependency would have been created which crosses the capture sequence
+                                       ##  boundary. Only implicit in-stream ordering dependencies are allowed to
+                                       ##  cross the boundary.
+                                       ##
+    cudaErrorStreamCaptureIsolation = 905, ##
+                                        ##  The operation would have resulted in a disallowed implicit dependency on
+                                        ##  a current capture sequence from cudaStreamLegacy.
+                                        ##
+    cudaErrorStreamCaptureImplicit = 906, ##
+                                       ##  The operation is not permitted on an event which was last recorded in a
+                                       ##  capturing stream.
+                                       ##
+    cudaErrorCapturedEvent = 907, ##
+                               ##  A stream capture sequence not initiated with the ::cudaStreamCaptureModeRelaxed
+                               ##  argument to ::cudaStreamBeginCapture was passed to ::cudaStreamEndCapture in a
+                               ##  different thread.
+                               ##
+    cudaErrorStreamCaptureWrongThread = 908, ##
+                                          ##  This indicates that the wait operation has timed out.
+                                          ##
+    cudaErrorTimeout = 909, ##
+                         ##  This error indicates that the graph update was not performed because it included
+                         ##  changes which violated constraints specific to instantiated graph update.
+                         ##
+    cudaErrorGraphExecUpdateFailure = 910, ##
+                                        ##  This indicates that an async error has occurred in a device outside of CUDA.
+                                        ##  If CUDA was waiting for an external device's signal before consuming shared data,
+                                        ##  the external device signaled an error indicating that the data is not valid for
+                                        ##  consumption. This leaves the process in an inconsistent state and any further CUDA
+                                        ##  work will return the same error. To continue using CUDA, the process must be
+                                        ##  terminated and relaunched.
+                                        ##
+    cudaErrorExternalDevice = 911, ##
+                                ##  This indicates that a kernel launch error has occurred due to cluster
+                                ##  misconfiguration.
+                                ##
+    cudaErrorInvalidClusterSize = 912, ##
+                                    ##  This indicates that an unknown internal error has occurred.
+                                    ##
+    cudaErrorUnknown = 999, ##
+                         ##  Any unhandled CUDA driver error is added to this value and returned via
+                         ##  the runtime. Production releases of CUDA should not return such errors.
+                         ##  \deprecated
+                         ##  This error return is deprecated as of CUDA 4.1.
+                         ##
+    cudaErrorApiFailureBase = 10000
+
+  cudaError_t* = cudaError
+
+  CUstream_st = object
+  cudaStream_t* = ptr CUstream_st
+
+  CUevent_st = object
+  cudaEvent_t* = ptr CUevent_st
+
+
+  CUuuid_st* {.bycopy.} = object
+    ## < CUDA definition of UUID
+    bytes*: array[16, char]
+
+  CUuuid* = CUuuid_st
+  cudaUUID_t* = CUuuid_st
+
+  cudaDeviceProp* {.bycopy.} = object
+    name*: array[256, char]
+    ## < ASCII string identifying device
+    uuid*: cudaUUID_t
+    ## < 16-byte unique identifier
+    luid*: array[8, char]
+    ## < 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms
+    luidDeviceNodeMask*: cuint
+    ## < LUID device node mask. Value is undefined on TCC and non-Windows platforms
+    totalGlobalMem*: csize_t
+    ## < Global memory available on device in bytes
+    sharedMemPerBlock*: csize_t
+    ## < Shared memory available per block in bytes
+    regsPerBlock*: cint
+    ## < 32-bit registers available per block
+    warpSize*: cint
+    ## < Warp size in threads
+    memPitch*: csize_t
+    ## < Maximum pitch in bytes allowed by memory copies
+    maxThreadsPerBlock*: cint
+    ## < Maximum number of threads per block
+    maxThreadsDim*: array[3, cint]
+    ## < Maximum size of each dimension of a block
+    maxGridSize*: array[3, cint]
+    ## < Maximum size of each dimension of a grid
+    clockRate*: cint
+    ## < Deprecated, Clock frequency in kilohertz
+    totalConstMem*: csize_t
+    ## < Constant memory available on device in bytes
+    major*: cint
+    ## < Major compute capability
+    minor*: cint
+    ## < Minor compute capability
+    textureAlignment*: csize_t
+    ## < Alignment requirement for textures
+    texturePitchAlignment*: csize_t
+    ## < Pitch alignment requirement for texture references bound to pitched memory
+    deviceOverlap*: cint
+    ## < Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount.
+    multiProcessorCount*: cint
+    ## < Number of multiprocessors on device
+    kernelExecTimeoutEnabled*: cint
+    ## < Deprecated, Specified whether there is a run time limit on kernels
+    integrated*: cint
+    ## < Device is integrated as opposed to discrete
+    canMapHostMemory*: cint
+    ## < Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer
+    computeMode*: cint
+    ## < Deprecated, Compute mode (See ::cudaComputeMode)
+    maxTexture1D*: cint
+    ## < Maximum 1D texture size
+    maxTexture1DMipmap*: cint
+    ## < Maximum 1D mipmapped texture size
+    maxTexture1DLinear*: cint
+    ## < Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead.
+    maxTexture2D*: array[2, cint]
+    ## < Maximum 2D texture dimensions
+    maxTexture2DMipmap*: array[2, cint]
+    ## < Maximum 2D mipmapped texture dimensions
+    maxTexture2DLinear*: array[3, cint]
+    ## < Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory
+    maxTexture2DGather*: array[2, cint]
+    ## < Maximum 2D texture dimensions if texture gather operations have to be performed
+    maxTexture3D*: array[3, cint]
+    ## < Maximum 3D texture dimensions
+    maxTexture3DAlt*: array[3, cint]
+    ## < Maximum alternate 3D texture dimensions
+    maxTextureCubemap*: cint
+    ## < Maximum Cubemap texture dimensions
+    maxTexture1DLayered*: array[2, cint]
+    ## < Maximum 1D layered texture dimensions
+    maxTexture2DLayered*: array[3, cint]
+    ## < Maximum 2D layered texture dimensions
+    maxTextureCubemapLayered*: array[2, cint]
+    ## < Maximum Cubemap layered texture dimensions
+    maxSurface1D*: cint
+    ## < Maximum 1D surface size
+    maxSurface2D*: array[2, cint]
+    ## < Maximum 2D surface dimensions
+    maxSurface3D*: array[3, cint]
+    ## < Maximum 3D surface dimensions
+    maxSurface1DLayered*: array[2, cint]
+    ## < Maximum 1D layered surface dimensions
+    maxSurface2DLayered*: array[3, cint]
+    ## < Maximum 2D layered surface dimensions
+    maxSurfaceCubemap*: cint
+    ## < Maximum Cubemap surface dimensions
+    maxSurfaceCubemapLayered*: array[2, cint]
+    ## < Maximum Cubemap layered surface dimensions
+    surfaceAlignment*: csize_t
+    ## < Alignment requirements for surfaces
+    concurrentKernels*: cint
+    ## < Device can possibly execute multiple kernels concurrently
+    ECCEnabled*: cint
+    ## < Device has ECC support enabled
+    pciBusID*: cint
+    ## < PCI bus ID of the device
+    pciDeviceID*: cint
+    ## < PCI device ID of the device
+    pciDomainID*: cint
+    ## < PCI domain ID of the device
+    tccDriver*: cint
+    ## < 1 if device is a Tesla device using TCC driver, 0 otherwise
+    asyncEngineCount*: cint
+    ## < Number of asynchronous engines
+    unifiedAddressing*: cint
+    ## < Device shares a unified address space with the host
+    memoryClockRate*: cint
+    ## < Deprecated, Peak memory clock frequency in kilohertz
+    memoryBusWidth*: cint
+    ## < Global memory bus width in bits
+    l2CacheSize*: cint
+    ## < Size of L2 cache in bytes
+    persistingL2CacheMaxSize*: cint
+    ## < Device's maximum l2 persisting lines capacity setting in bytes
+    maxThreadsPerMultiProcessor*: cint
+    ## < Maximum resident threads per multiprocessor
+    streamPrioritiesSupported*: cint
+    ## < Device supports stream priorities
+    globalL1CacheSupported*: cint
+    ## < Device supports caching globals in L1
+    localL1CacheSupported*: cint
+    ## < Device supports caching locals in L1
+    sharedMemPerMultiprocessor*: csize_t
+    ## < Shared memory available per multiprocessor in bytes
+    regsPerMultiprocessor*: cint
+    ## < 32-bit registers available per multiprocessor
+    managedMemory*: cint
+    ## < Device supports allocating managed memory on this system
+    isMultiGpuBoard*: cint
+    ## < Device is on a multi-GPU board
+    multiGpuBoardGroupID*: cint
+    ## < Unique identifier for a group of devices on the same multi-GPU board
+    hostNativeAtomicSupported*: cint
+    ## < Link between the device and the host supports native atomic operations
+    singleToDoublePrecisionPerfRatio*: cint
+    ## < Deprecated, Ratio of single precision performance (in floating-point operations per second) to double precision performance
+    pageableMemoryAccess*: cint
+    ## < Device supports coherently accessing pageable memory without calling cudaHostRegister on it
+    concurrentManagedAccess*: cint
+    ## < Device can coherently access managed memory concurrently with the CPU
+    computePreemptionSupported*: cint
+    ## < Device supports Compute Preemption
+    canUseHostPointerForRegisteredMem*: cint
+    ## < Device can access host registered memory at the same virtual address as the CPU
+    cooperativeLaunch*: cint
+    ## < Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel
+    cooperativeMultiDeviceLaunch*: cint
+    ## < Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated.
+    sharedMemPerBlockOptin*: csize_t
+    ## < Per device maximum shared memory per block usable by special opt in
+    pageableMemoryAccessUsesHostPageTables*: cint
+    ## < Device accesses pageable memory via the host's page tables
+    directManagedMemAccessFromHost*: cint
+    ## < Host can directly access managed memory on the device without migration.
+    maxBlocksPerMultiProcessor*: cint
+    ## < Maximum number of resident blocks per multiprocessor
+    accessPolicyMaxWindowSize*: cint
+    ## < The maximum value of ::cudaAccessPolicyWindow::num_bytes.
+    reservedSharedMemPerBlock*: csize_t
+    ## < Shared memory reserved by CUDA driver per block in bytes
+    hostRegisterSupported*: cint
+    ## < Device supports host memory registration via ::cudaHostRegister.
+    sparseCudaArraySupported*: cint
+    ## < 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays, 0 otherwise
+    hostRegisterReadOnlySupported*: cint
+    ## < Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU
+    timelineSemaphoreInteropSupported*: cint
+    ## < External timeline semaphore interop is supported on the device
+    memoryPoolsSupported*: cint
+    ## < 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, 0 otherwise
+    gpuDirectRDMASupported*: cint
+    ## < 1 if the device supports GPUDirect RDMA APIs, 0 otherwise
+    gpuDirectRDMAFlushWritesOptions*: cuint
+    ## < Bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum
+    gpuDirectRDMAWritesOrdering*: cint
+    ## < See the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values
+    memoryPoolSupportedHandleTypes*: cuint
+    ## < Bitmask of handle types supported with mempool-based IPC
+    deferredMappingCudaArraySupported*: cint
+    ## < 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays
+    ipcEventSupported*: cint
+    ## < Device supports IPC Events.
+    clusterLaunch*: cint
+    ## < Indicates device supports cluster launch
+    unifiedFunctionPointers*: cint
+    ## < Indicates device supports unified pointers
+    reserved2*: array[2, cint]
+    reserved1*: array[1, cint]
+    ## < Reserved for future use
+    reserved*: array[60, cint]
+    ## < Reserved for future use
+
+  ##
+  ##  CUDA memory copy types
+  ##
+  cudaMemcpyKind* = enum
+    cudaMemcpyHostToHost = 0, ## < Host   -> Host
+    cudaMemcpyHostToDevice = 1, ## < Host   -> Device
+    cudaMemcpyDeviceToHost = 2, ## < Device -> Host
+    cudaMemcpyDeviceToDevice = 3, ## < Device -> Device
+    cudaMemcpyDefault = 4     ## < Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing
+
+
+proc cudaRuntimeGetVersion*(runtimeVersion: ptr cint): cudaError_t {.cdecl,
+    importc: "cudaRuntimeGetVersion", dynlib: libCudaRT.}
+
+proc cudaGetDeviceProperties*(prop: ptr cudaDeviceProp; device: cint): cudaError_t {.
+    cdecl, importc: "cudaGetDeviceProperties", dynlib: libCudaRT.}
+
+proc cudaEventCreate*(event: ptr cudaEvent_t): cudaError_t {.cdecl,
+    importc: "cudaEventCreate", dynlib: libCudaRT.}
+
+proc cudaEventRecord*(event: cudaEvent_t; stream: cudaStream_t): cudaError_t {.
+    cdecl, importc: "cudaEventRecord", dynlib: libCudaRT.}
+
+proc cudaEventSynchronize*(event: cudaEvent_t): cudaError_t {.cdecl,
+    importc: "cudaEventSynchronize", dynlib: libCudaRT.}
+
+proc cudaDeviceSynchronize*(): cudaError_t {.cdecl,
+    importc: "cudaDeviceSynchronize", dynlib: libCudaRT.}
+
+proc cudaEventElapsedTime*(ms: ptr cfloat; start: cudaEvent_t; `end`: cudaEvent_t): cudaError_t {.
+    cdecl, importc: "cudaEventElapsedTime", dynlib: libCudaRT.}
+
+proc cudaEventDestroy*(event: cudaEvent_t): cudaError_t {.cdecl,
+      importc: "cudaEventDestroy", dynlib: libCudaRT.}
+
+proc cudaMemcpyToSymbol*(symbol: pointer,
+                         src: pointer,
+                         count, offset: csize_t,
+                         kind: cudaMemcpyKind = cudaMemcpyHostToDevice): cudaError_t {.
+    cdecl, importc: "cudaMemcpyToSymbol", dynlib: libCudaRT.}
+
+proc cudaGetSymbolAddress*(devPtr: ptr CUdeviceptr,
+                           symbol: pointer): cudaError_t {.cdecl,
+    importc: "cudaGetSymbolAddress", dynlib: libCudaRT.}
+
+
+
+######################################################################
+################################ Utilities ###########################
+######################################################################
+
+
+template check*(status: CUresult, quitOnFailure = true) =
+  ## Check the status code of a CUDA operation
+  ## Exit program with error if failure
+
+  let code = status # ensure that the input expression is evaluated once only
+  if code != CUDA_SUCCESS:
+    writeStackTrace()
+    stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
+    if quitOnFailure:
+      quit 1
+
+template check*(a: sink nvrtcResult, quitOnFailure = true) =
+  let code = a
+  if code != NVRTC_SUCCESS:
+    writeStackTrace()
+    stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
+    if quitOnFailure:
+      quit 1
+
+template check*(a: sink cudaError_t, quitOnFailure = true) =
+  let code = a
+  if code != cudaSuccess:
+    writeStackTrace()
+    stderr.write(astToStr(status) & " " & $instantiationInfo() & " exited with error: " & $code & '\n')
+    if quitOnFailure:
+      quit 1
diff --git a/constantine/platforms/static_for.nim b/constantine/platforms/static_for.nim
index c83ba3630..6f9179fdd 100644
--- a/constantine/platforms/static_for.nim
+++ b/constantine/platforms/static_for.nim
@@ -34,6 +34,14 @@ macro staticFor*(idx: untyped{nkIdent}, start, stopEx: static int, body: untyped
       ident("unrolledIter_" & $idx & $i),
       body.replaceNodes(idx, newLit i))
 
+macro staticForStepped*(idx: untyped{nkIdent}, start, stopEx, increment: static int, body: untyped): untyped =
+  ## Version of `staticFor` which takes an increment != 1.
+  result = newStmtList()
+  for i in countup(start, stopEx - increment, increment):
+    result.add nnkBlockStmt.newTree(
+      ident("unrolledIter_" & $idx & $i),
+      body.replaceNodes(idx, newLit i))
+
 macro staticForCountdown*(idx: untyped{nkIdent}, start, stopIncl: static int, body: untyped): untyped =
   result = newStmtList()
   for i in countdown(start, stopIncl):
diff --git a/tests/gpu/t_mul.nim b/tests/gpu/t_mul.nim
index d93538545..9c17362a4 100644
--- a/tests/gpu/t_mul.nim
+++ b/tests/gpu/t_mul.nim
@@ -42,4 +42,4 @@ testName(Fp[BN254_Snarks], 32, a, b)
 # We get incorrect result for modular multiplication with 64-bit limbs due to a fused-multiuply-add with carry bug.
 #
 # - https://gist.github.com/mratsim/a34df1e091925df15c13208df7eda569#file-mul-py
-# - https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
\ No newline at end of file
+# - https://forums.developer.nvidia.com/t/incorrect-result-of-ptx-code/221067
diff --git a/tests/gpu/t_nvrtc_bigint_example.nim b/tests/gpu/t_nvrtc_bigint_example.nim
new file mode 100644
index 000000000..8a457598a
--- /dev/null
+++ b/tests/gpu/t_nvrtc_bigint_example.nim
@@ -0,0 +1,252 @@
+import std / strformat
+
+import
+  # Internal
+  constantine/named/algebras,
+  constantine/math/io/[io_bigints, io_fields],
+  constantine/math/arithmetic,
+  constantine/platforms/abstractions {.all.},
+  constantine/platforms/abis/nvidia_abi,
+  constantine/math_compiler/experimental/runtime_compile,
+  constantine/serialization/io_limbs
+
+type T = Fp[BN254_Snarks]
+const WordSize = 32
+# Example showing warp behavior with different thread counts
+const BigIntExample = cuda:
+  # Utility for add with carry operations
+  type
+    BigInt = object
+      limbs: array[8, uint32]
+  template `[]`(x: BigInt, idx: int): untyped = x.limbs[idx]
+  template `[]=`(x: BigInt, idx: int, val: uint32): untyped = x.limbs[idx] = val
+  template len(x: BigInt): int = 8 # static: BigInt().limbs.len
+
+  # Need to get the limbs & spare bits data in a static context
+  template getFieldModulus(): untyped = static: T.getModulus().limbs
+  template spareBits(): untyped = static: (BigInt().limbs.len * WordSize - T.bits())
+
+  ## Note: the below would just be generated from a macro of course, similar to
+  ## `constantine/platforms/llvm/asm_nvidia.nim`.
+  proc add_cio(a, b: uint32): uint32 {.device, forceinline.} =
+    var res: uint32
+    asm """
+"addc.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc add_ci(a, b: uint32): uint32 {.device, forceinline.} =
+    var res: uint32
+    asm """
+"addc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc add_co(a, b: uint32): uint32 {.device, forceinline.} =
+    var res: uint32
+    asm """
+"add.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc sub_bo(a, b: uint32): uint32 {.device, forceinline.} =
+    var res: uint32
+    asm """
+"sub.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc sub_bi(a, b: uint32): uint32 {.device, forceinline.} =
+    var res: uint32
+    asm """
+"subc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc sub_bio(a, b: uint32): uint32 {.device, forceinline.} =
+    var res: uint32
+    asm """
+"subc.cc.u32 %0, %1, %2;" : "=r"(res) : "r"(a), "r"(b)
+"""
+    return res
+
+  proc slct(a, b: uint32, pred: uint32): uint32 {.device, forceinline.} =
+    var res: uint32
+# "slct.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(pred)
+    asm """
+"slct.u32.s32 %0, %1, %2, %3;" : "=r"(res) : "r"(a), "r"(b), "r"(pred)
+"""
+    return res
+
+  proc finalSubMayOverflow(a, M: BigInt): BigInt {.device.} =
+    ## If a >= Modulus: r <- a-M
+    ## else:            r <- a
+    ##
+    ## This is constant-time straightline code.
+    ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
+    ##
+    ## To be used when the final substraction can
+    ## also overflow the limbs (a 2^256 order of magnitude modulus stored in n words of total max size 2^256)
+    let N = a.len
+    var scratch: BigInt
+
+    # Contains 0x0001 (if overflowed limbs) or 0x0000
+    let overflowedLimbs = add_ci(0'u32, 0'u32)
+
+    # Now substract the modulus, and test a < M with the last borrow
+    scratch[0] = sub_bo(a[0], M[0])
+    for i in 1 ..< N:
+      scratch[i] = sub_bio(a[i], M[i])
+
+    # 1. if `overflowedLimbs`, underflowedModulus >= 0
+    # 2. if a >= M, underflowedModulus >= 0
+    # if underflowedModulus >= 0: a-M else: a
+    # TODO: predicated mov instead?
+    let underflowedModulus = sub_bi(overflowedLimbs, 0'u32)
+
+    var r: BigInt
+    for i in 0 ..< N:
+      r[i] = slct(scratch[i], a[i], underflowedModulus)
+    return r
+
+  proc finalSubNoOverflow(a, M: BigInt): BigInt {.device.} =
+    ## If a >= Modulus: r <- a-M
+    ## else:            r <- a
+    ##
+    ## This is constant-time straightline code.
+    ## Due to warp divergence, the overhead of doing comparison with shortcutting might not be worth it on GPU.
+    ##
+    ## To be used when the modulus does not use the full bitwidth of the storing words
+    ## (say using 255 bits for the modulus out of 256 available in words)
+    let N = a.len
+    var scratch: BigInt
+
+    # Now substract the modulus, and test a < M with the last borrow
+    scratch[0] = sub_bo(a[0], M[0])
+    for i in 1 ..< N:
+      scratch[i] = sub_bio(a[i], M[i])
+
+    # If it underflows here, `a` was smaller than the modulus, which is what we want
+    let underflowedModulus = sub_bi(0'u32, 0'u32)
+
+    var r: BigInt
+    for i in 0 ..< N:
+      r[i] = slct(scratch[i], a[i], underflowedModulus)
+    return r
+
+  proc modadd(a, b, M: BigInt): BigInt {.device.} =
+    # try to add two bigints
+    let N = a.len
+    #var res: BigInt[N]
+    var res: BigInt
+
+    var t: BigInt # temporary
+
+    t[0] = add_co(a[0], b[0])
+    for i in 1 ..< N:
+      t[i] = add_cio(a[i], b[i])
+      printf("element i %d = %d\n", i, t[i])
+
+    # can use `when` of course!
+    when spareBits() >= 1: # if spareBits() >= 1: # would also work
+      t = finalSubNoOverflow(t, M)
+    else:
+      t = finalSubMayOverflow(t, M)
+
+    return t
+
+  #proc bigintTest(output: ptr UncheckedArray[uint32], aIn, bIn: ptr BigInt) {.global.} =
+  proc bigintTest(output: ptr UncheckedArray[uint32], a, b: BigInt) {.global.} =
+    # Get global thread ID for example
+    let tid = blockIdx.x * blockDim.x + threadIdx.x
+    # or warp ID and lane ID
+    let warp_id = threadIdx.x div 32
+    let lane_id = threadIdx.x mod 32
+
+    ## Example: Construct via static array would work:
+    #let b = BigInt(limbs: [1'u32, 2, 3, 4, 5, 6, 7, 8])
+
+    ## If the bigints are passed as ptrs:
+    #let a = aIn[]
+    #let b = bIn[]
+    for i in 0 ..< 8: # print an input
+      printf("b: %d = %u\n", i, b[i])
+
+    let M64 = getFieldModulus() # need a let variable, otherwise modulus does not have an address
+    # Cast the 64bit limbs of field modulus to 32bit limbs to copy
+    var data = cast[ptr UncheckedArray[uint32]](addr M64[0])
+    ## NOTE: you cannot do `BigInt(limbs: data)`. Leads to invalid C/CUDA code. We might turn calls
+    ## like that into memcpy in the future.
+    var M = BigInt()
+    ## Or copy data from a runtime array
+    for i in 0 ..< 8:
+      M[i] = data[i]
+    ## also works of course (in which case cast is not needed)
+    #memcpy(addr M[0], addr M64[0], sizeof(M64))
+
+    for i in 0 ..< 4: # let's print M64 as uint64 values
+      printf("M64: %d = %llu\n", i, M64[i])
+    for i in 0 ..< 8: # let's print m64 as 32 bit data and assigned
+      printf("M: %d = %u\n", i, M[i])
+      printf("M64 as 32: %d = %u\n", i, data[i])
+
+    # Call `modadd` and assign to result
+    let res = modadd(a, b, M)
+    for i in 0 ..< b.len:
+      output[i] = res[i]
+
+proc getBigints(): (Fp[BN254_Snarks], Fp[BN254_Snarks]) =
+  # return some bigint values of a finite field
+  let a = Fp[BN254_Snarks].fromUInt(1'u32)
+  let b = Fp[BN254_Snarks].fromHex("0x2beb0d0d6115007676f30bcc462fe814bf81198848f139621a3e9fa454fe8e6a")
+
+  result = (a, b)
+
+proc main =
+  var nvrtc = initNvrtc(BigIntExample)
+  # echo the generated CUDA code
+  echo BigIntExample
+
+  nvrtc.compile()
+  nvrtc.getPtx()
+
+  var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead)
+  nvrtc.numBlocks = 1
+  nvrtc.threadsPerBlock = 1
+
+  let (a, b) = getBigInts()
+
+  ## If arguments were `prt BigInt`:
+  #nvrtc.execute("bigintTest", (hOut), (addr a, addr b))
+  # for regular `BigInt` arguments
+  nvrtc.execute("bigintTest", (hOut), (a, b))
+
+  ## Compare with expected
+  let exp = a + b
+  # Get expected as array of 8 uint32
+  let expU32: array[8, uint32] = cast[ptr array[8, uint32]](exp.mres.limbs[0].addr)[]
+
+  # both arrays must match
+  doAssert hOut == expU32
+
+  # now compare as field elements
+  # Things to note:
+  # - the `modadd` `hOut` data is in Montgomery representation
+  # - we need to convert `array[8, uint32]` into `array[32, byte]`
+  #   to unmarshal into a `BigInt[254]`
+  # - need to undo Montgomery representation on the `BigInt[254]` before
+  #   constructing the finite field element
+  var res: Fp[BN254_Snarks]
+  var resBI: matchingBigInt(BN254_Snarks)
+  var hOutBytes: array[32, byte]
+  hOutBytes.marshal(hOut, 32, littleEndian) # convert to 32 bytes
+  resBI.unmarshal(hOutBytes, littleEndian)  # convert bytes to BigInt[254]
+  type T = Fp[BN254_Snarks]
+  # undo Montgomery representation
+  resBI.fromMont(resBI, T.getModulus(), T.getNegInvModWord(), T.getSpareBits())
+  res.fromBig(resBI)                        # convert `BigInt[254]` to finite field element
+
+  doAssert bool(res == exp)
+
+when isMainModule:
+  main()
diff --git a/tests/gpu/t_nvrtc_inbuilt_modadd.nim b/tests/gpu/t_nvrtc_inbuilt_modadd.nim
new file mode 100644
index 000000000..bdf8f87bd
--- /dev/null
+++ b/tests/gpu/t_nvrtc_inbuilt_modadd.nim
@@ -0,0 +1,135 @@
+import std / strformat
+
+import
+  # Internal
+  constantine/named/algebras,
+  constantine/math/io/[io_bigints, io_fields],
+  constantine/math/arithmetic,
+  constantine/platforms/abstractions {.all.},
+  constantine/platforms/abis/nvidia_abi,
+  constantine/math_compiler/experimental/runtime_compile,
+  constantine/serialization/io_limbs
+
+import constantine/math_compiler/experimental/nvrtc_field_ops
+
+const N = 8
+type T = Fp[BN254_Snarks]
+const WordSize = 32
+const BigIntExample* = cuda:
+
+  defBigInt(N)
+  defPtxHelpers()
+  defCoreFieldOps(T)
+
+  #proc bigintTest(output: ptr UncheckedArray[uint32], aIn, bIn: ptr BigInt) {.global.} =
+  proc modaddTest(output: ptr UncheckedArray[uint32], a, b: BigInt) {.global.} =
+    let M64 = getFieldModulus() # need a let variable, otherwise modulus does not have an address
+    # Cast the 64bit limbs of field modulus to 32bit limbs to copy
+    var data = cast[ptr UncheckedArray[uint32]](addr M64[0])
+    ## NOTE: you cannot do `BigInt(limbs: data)`. Leads to invalid C/CUDA code. We might turn calls
+    ## like that into memcpy in the future.
+    var M = BigInt()
+    ## Or copy data from a runtime array
+    for i in 0 ..< 8:
+      M[i] = data[i]
+    ## also works of course (in which case cast is not needed)
+    #memcpy(addr M[0], addr M64[0], sizeof(M64))
+
+    # Call `modadd` and assign to result
+    let res = modadd(a, b, M)
+    for i in 0 ..< b.len:
+      output[i] = res[i]
+
+  proc modsubTest(output: ptr UncheckedArray[uint32], a, b: BigInt) {.global.} =
+    let M64 = getFieldModulus() # need a let variable, otherwise modulus does not have an address
+    var M = BigInt()
+    ## TODO: avoid this memcopy
+    memcpy(addr M[0], addr M64[0], sizeof(M64))
+
+    # Call `modadd` and assign to result
+    let res = modsub(a, b, M)
+    for i in 0 ..< b.len:
+      output[i] = res[i]
+
+  proc mtymulTest(output: ptr UncheckedArray[uint32], a, b: BigInt) {.global.} =
+    let M64 = getFieldModulus() # need a let variable, otherwise modulus does not have an address
+    var M = BigInt()
+    ## TODO: avoid this memcopy
+    memcpy(addr M[0], addr M64[0], sizeof(M64))
+
+    # Call `modadd` and assign to result
+    let res = mtymul_CIOS_sparebit(a, b, M, true)
+    #let res = mtymul_CIOS_concise(a, b, M, true)
+    for i in 0 ..< b.len:
+      output[i] = res[i]
+
+
+proc getBigints(): (Fp[BN254_Snarks], Fp[BN254_Snarks]) =
+  # return some bigint values of a finite field
+  let a = Fp[BN254_Snarks].fromUInt(1'u32)
+  let b = Fp[BN254_Snarks].fromHex("0x2beb0d0d6115007676f30bcc462fe814bf81198848f139621a3e9fa454fe8e6a")
+  #let b = Fp[BN254_Snarks].fromUint(1'u32)
+
+  result = (a, b)
+
+template checkOp(kernel, exp, hOut, a, b: untyped): untyped =
+  nvrtc.execute(kernel, (hOut), (a, b))
+
+  ## Compare with expected
+  # Get expected as array of 8 uint32
+  let expU32: array[8, uint32] = cast[ptr array[8, uint32]](exp.mres.limbs[0].addr)[]
+
+  # both arrays must match
+  #doAssert hOut == expU32
+  # now compare as field elements
+  # Things to note:
+  # - the `modadd` `hOut` data is in Montgomery representation
+  # - we need to convert `array[8, uint32]` into `array[32, byte]`
+  #   to unmarshal into a `BigInt[254]`
+  # - need to undo Montgomery representation on the `BigInt[254]` before
+  #   constructing the finite field element
+  var res: Fp[BN254_Snarks]
+  var resBI: matchingBigInt(BN254_Snarks)
+  var hOutBytes: array[32, byte]
+  hOutBytes.marshal(hOut, 32, littleEndian) # convert to 32 bytes
+  resBI.unmarshal(hOutBytes, littleEndian)  # convert bytes to BigInt[254]
+  type T = Fp[BN254_Snarks]
+  # undo Montgomery representation
+  resBI.fromMont(resBI, T.getModulus(), T.getNegInvModWord(), T.getSpareBits())
+  res.fromBig(resBI)                        # convert `BigInt[254]` to finite field element
+
+  echo "Res = ", res.toHex()
+  echo "Exp = ", exp.toHex()
+  doAssert bool(res == exp)
+
+proc main =
+  var nvrtc = initNvrtc(BigIntExample)
+  # echo the generated CUDA code
+  echo BigIntExample
+
+  nvrtc.compile()
+  nvrtc.getPtx()
+
+  var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead)
+  nvrtc.numBlocks = 1
+  nvrtc.threadsPerBlock = 1
+
+  let (a, b) = getBigInts()
+
+  ## If arguments were `prt BigInt`:
+  #nvrtc.execute("bigintTest", (hOut), (addr a, addr b))
+  # for regular `BigInt` arguments
+
+  echo "M0NINV::: ", Fp[BN254_Snarks].getModulus().negInvModWord()
+  echo "M0NINV::: ", Fp[BN254_Snarks].getModulus().negInvModWord().sizeof()
+
+  let exp1 = a + b
+  checkOp("modaddTest", exp1, hOut, a, b)
+  let exp2 = a - b
+  checkOp("modsubTest", exp2, hOut, a, b)
+  let exp3 = a * b
+  checkOp("mtymulTest", exp3, hOut, a, b)
+
+
+when isMainModule:
+  main()
diff --git a/tests/gpu/t_nvrtc_misc_ops.nim b/tests/gpu/t_nvrtc_misc_ops.nim
new file mode 100644
index 000000000..bea3943e0
--- /dev/null
+++ b/tests/gpu/t_nvrtc_misc_ops.nim
@@ -0,0 +1,379 @@
+import std / strformat
+
+import
+  # Internal
+  constantine/named/algebras,
+  constantine/math/io/[io_bigints, io_fields],
+  constantine/math/arithmetic,
+  constantine/math/arithmetic/bigints,
+  constantine/platforms/abstractions {.all.},
+  constantine/platforms/abis/nvidia_abi,
+  constantine/math_compiler/experimental/runtime_compile,
+  constantine/serialization/io_limbs
+
+import constantine/math_compiler/experimental/nvrtc_field_ops
+
+proc toFp[Name: static Algebra](FF: type Fp[Name], ar: array[8, uint32]): Fp[Name] =
+  var resBI: matchingBigInt(BN254_Snarks)
+  var arBytes: array[32, byte]
+  arBytes.marshal(ar, 32, littleEndian) # convert to 32 bytes
+  resBI.unmarshal(arBytes, littleEndian)  # convert bytes to BigInt[254]
+  # undo Montgomery representation
+  resBI.fromMont(resBI, FF.getModulus(), FF.getNegInvModWord(), FF.getSpareBits())
+  result.fromBig(resBI)                        # convert `BigInt[254]` to finite field element
+
+const N = 8
+type T = Fp[BN254_Snarks]
+const WordSize = 32
+const BigIntExample* = cuda:
+
+  defBigInt(N)
+  defPtxHelpers()
+  defCoreFieldOps(T)
+
+  proc testPassBigInt(output: ptr BigInt) {.global.} =
+    # Call `modadd` and assign to result
+    for i in 0 ..< 8:
+      output[i] = i.uint32
+
+  proc testSetZero(output: ptr BigInt) {.global.} =
+    output[].setZero()
+
+  proc testSetOne(output: ptr BigInt) {.global.} =
+    output[].setOne()
+
+  proc testAdd(output: ptr BigInt, a, b: BigInt) {.global.} =
+    output[].add(a, b)
+
+  proc testSub(output: ptr BigInt, a, b: BigInt) {.global.} =
+    output[].sub(a, b)
+
+  proc testMul(output: ptr BigInt, a, b: BigInt) {.global.} =
+    output[].mul(a, b)
+
+  proc testCcopy(output: ptr BigInt, a, b: BigInt, c: bool) {.global.} =
+    output[] = a
+    output[].ccopy(b, c)
+
+  proc testCsetOne(output: ptr BigInt, a: BigInt, c: bool) {.global.} =
+    output[] = a
+    output[].cSetOne(c)
+
+  proc testCsetZero(output: ptr BigInt, a, b: BigInt, c: uint32) {.global.} =
+    output[] = a
+    output[].cSetZero(c.bool)
+
+  proc testCadd(output: ptr BigInt, a, b: BigInt, c: bool) {.global.} =
+    output[] = a
+    output[].cadd(b, c)
+
+  proc testCsub(output: ptr BigInt, a, b: BigInt, c: bool) {.global.} =
+    output[] = a
+    output[].csub(b, c)
+
+  proc testDouble(output: ptr BigInt, a: BigInt) {.global.} =
+    output[].doubleElement(a)
+
+  proc testNsqr(output: ptr BigInt, a: BigInt, count: int) {.global.} =
+    output[].nsqr(a, count)
+
+  proc testIsZero(output: ptr bool, a: BigInt) {.global.} =
+    output[].isZero(a)
+
+  proc testIsOdd(output: ptr bool, a: BigInt) {.global.} =
+    output[].isOdd(a)
+
+  proc testNeg(output: ptr BigInt, a: BigInt) {.global.} =
+    output[].neg(a)
+
+  proc testCneg(output: ptr BigInt, a: BigInt, c: bool) {.global.} =
+    output[].cneg(a, c)
+
+  proc testShiftRight(output: ptr BigInt, a: BigInt, k: uint32) {.global.} =
+    output[] = a
+    output[].shiftRight(k)
+
+  proc testDiv2(output: ptr BigInt, a: BigInt) {.global.} =
+    output[] = a
+    output[].div2()
+
+from std / sequtils import mapIt
+proc main =
+  var nvrtc = initNvrtc(BigIntExample)
+  # echo the generated CUDA code
+  echo BigIntExample
+  writeFile("/tmp/kernel.cu", BigIntExample)
+
+  nvrtc.compile()
+  nvrtc.getPtx()
+
+  var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead)
+  nvrtc.numBlocks = 1
+  nvrtc.threadsPerBlock = 1
+
+  block PassBigInt:
+    hOut.reset()
+    nvrtc.execute("testPassBigInt", (hOut), ())
+    let exp = [0'u32, 1, 2, 3, 4, 5, 6, 7]
+    for i in 0 ..< 8:
+      doAssert exp[i] == hOut[i]
+
+  block SetZero:
+    # now use `setZero` to reset to zero
+    # `hOut` should be zero, but let's change some numbers
+    hOut.reset()
+    hOut[0] = 123
+    hOut[5] = 321
+    nvrtc.execute("testSetZero", (hOut), ())
+    let exp = [0'u32, 0, 0, 0, 0, 0, 0, 0]
+    for i in 0 ..< 8:
+      doAssert exp[i] == hOut[i]
+
+  block SetOne:
+    # and `setOne` to set to Montgomery representation of 1
+    hOut.reset()
+    nvrtc.execute("testSetOne", (hOut), ())
+    let expFp = T.fromUInt(1'u32)
+    doAssert bool(expFp == toFp(T, hOut))
+
+  block Add:
+    # add one and one
+    hOut.reset()
+    let inFp = T.fromUInt(1'u32)
+    nvrtc.execute("testAdd", (hOut), (inFp, inFp)) # inputs 1 and 1
+    let expFp = T.fromUInt(2'u32)
+    doAssert bool(expFp == toFp(T, hOut))
+
+  block Sub:
+    hOut.reset()
+    let inFp = T.fromUInt(1'u32)
+    nvrtc.execute("testSub", (hOut), (inFp, inFp)) # inputs 1 and 1
+    let expFp1 = T.fromUInt(0'u32)
+    doAssert bool(expFp1 == toFp(T, hOut))
+
+    hOut.reset()
+    let inFp1 = T.fromUInt(5'u32)
+    let inFp2 = T.fromUInt(2'u32)
+    nvrtc.execute("testSub", (hOut), (inFp1, inFp2)) # inputs 1 and 1
+    let expFp2 = T.fromUInt(3'u32)
+    doAssert bool(expFp2 == toFp(T, hOut))
+
+  block Mul:
+    # mul 2 and 2
+    hOut.reset()
+    let inFp = T.fromUInt(2'u32)
+    nvrtc.execute("testMul", (hOut), (inFp, inFp)) # inputs 2 and 2
+    let expFp = T.fromUInt(4'u32)
+    doAssert bool(expFp == toFp(T, hOut))
+
+  block Ccopy:
+    # ccopy based on true (false means we do not copy b into a)
+    hOut.reset()
+    let one = T.fromUInt(1'u32)
+    let two = T.fromUInt(2'u32)
+    nvrtc.execute("testCcopy", (hOut), (one, two, false)) # inputs 1 and 2
+    doAssert bool(one == toFp(T, hOut))
+    # and based on false (true means we do copy b into a)
+    nvrtc.execute("testCcopy", (hOut), (one, two, true)) # inputs 1 and 2
+    doAssert bool(two == toFp(T, hOut))
+
+  block CsetZero:
+    hOut.reset()
+    hOut[0] = 123
+    hOut[5] = 321
+    var input: array[8, uint32]
+    input = hOut
+    nvrtc.execute("testCsetZero", (hOut), (input, input, 0'u32))
+    let expF = [123'u32, 0, 0, 0, 0, 321, 0, 0]
+    for i in 0 ..< 8:
+      doAssert expF[i] == hOut[i]
+
+    nvrtc.execute("testCsetZero", (hOut), (input, 1'u32))
+    let expT = [0'u32, 0, 0, 0, 0, 0, 0, 0]
+    for i in 0 ..< 8:
+      doAssert expT[i] == hOut[i]
+
+  block CsetOne:
+    hOut.reset()
+    hOut[0] = 123
+    hOut[5] = 321
+    var input: array[8, uint32]
+    input = hOut
+    nvrtc.execute("testCsetOne", (hOut), (input, false))
+    let expF = [123'u32, 0, 0, 0, 0, 321, 0, 0]
+    for i in 0 ..< 8:
+      doAssert expF[i] == hOut[i]
+
+    nvrtc.execute("testCsetOne", (hOut), (input, true))
+    let expT = T.fromUInt(1'u32)
+    doAssert bool(expT == toFp(T, hOut))
+
+  block Cadd:
+    # add one and one
+    hOut.reset()
+    let inFp = T.fromUInt(1'u32)
+    nvrtc.execute("testCadd", (hOut), (inFp, inFp, false)) # inputs 1 and 1
+    let expF = T.fromUInt(1'u32)
+    doAssert bool(expF == toFp(T, hOut))
+
+    hOut.reset()
+    nvrtc.execute("testCadd", (hOut), (inFp, inFp, true)) # inputs 1 and 1
+    let expT = T.fromUInt(2'u32)
+    doAssert bool(expT == toFp(T, hOut))
+
+  block Csub:
+    hOut.reset()
+    # add one and one
+    let inFp1 = T.fromUInt(5'u32)
+    let inFp2 = T.fromUInt(2'u32)
+    nvrtc.execute("testCsub", (hOut), (inFp1, inFp2, false))
+    let expF = T.fromUInt(5'u32)
+    doAssert bool(expF == toFp(T, hOut))
+
+    hOut.reset()
+    nvrtc.execute("testCsub", (hOut), (inFp1, inFp2, true))
+    let expT = T.fromUInt(3'u32)
+    doAssert bool(expT == toFp(T, hOut))
+
+  block DoubleElement:
+    # add one and one
+    hOut.reset()
+    let inFp = T.fromUInt(6'u32)
+    nvrtc.execute("testDouble", (hOut), (inFp))
+    let exp = T.fromUInt(12'u32)
+    doAssert bool(exp == toFp(T, hOut))
+
+  block Nsqr:
+    # add one and one
+    hOut.reset()
+    let inFp = T.fromUInt(2'u32)
+    nvrtc.execute("testNsqr", (hOut), (inFp, 2))
+    let exp1 = T.fromUInt(16'u32)
+    doAssert bool(exp1 == toFp(T, hOut))
+
+    hOut.reset()
+    nvrtc.execute("testNsqr", (hOut), (inFp, 4))
+    let exp2 = T.fromUInt(65536'u32)
+    doAssert bool(exp2 == toFp(T, hOut))
+
+  block IsZero:
+    hOut.reset()
+    let inFp1 = T.fromUInt(132'u32)
+    var res: bool
+    nvrtc.execute("testIsZero", (res), (inFp1))
+    doAssert res == false
+
+    hOut.reset()
+    let inFp2 = T.fromUInt(0'u32)
+    nvrtc.execute("testIsZero", (res), (inFp2))
+    doAssert res == true
+
+    hOut.reset()
+    var inFp3: array[8, uint32] # zero initialized
+    nvrtc.execute("testIsZero", (res), (inFp3))
+    doAssert res == true
+
+  block IsOdd:
+    hOut.reset()
+    var inp: array[8, uint32]
+    inp[0] = 2 # even
+    var res: bool
+    nvrtc.execute("testIsOdd", (res), (inp))
+    doAssert res == false
+
+    hOut.reset()
+    inp[0] = 0 # even
+    inp[5] = 555
+    nvrtc.execute("testIsOdd", (res), (inp))
+    doAssert res == false
+
+    hOut.reset()
+    inp[0] = 123 # odd
+    nvrtc.execute("testIsOdd", (res), (inp))
+    doAssert res == true
+
+  block Neg:
+    hOut.reset()
+    let inFp1 = T.fromUInt(2'u32)
+    nvrtc.execute("testNeg", (hOut), (inFp1))
+    var exp = inFp1
+    exp.neg()
+    doAssert bool(exp == toFp(T, hOut))
+
+    hOut.reset()
+    let inFp2 = T.fromUInt(123547'u32)
+    nvrtc.execute("testNeg", (hOut), (inFp2))
+    exp = inFp2
+    exp.neg()
+    doAssert bool(exp == toFp(T, hOut))
+
+  block CNeg:
+    hOut.reset()
+    let inFp1 = T.fromUInt(2'u32)
+    nvrtc.execute("testCneg", (hOut), (inFp1, false))
+    doAssert bool(inFp1 == toFp(T, hOut))
+
+    hOut.reset()
+    nvrtc.execute("testCneg", (hOut), (inFp1, true))
+    var exp1 = inFp1
+    exp1.neg()
+    doAssert bool(exp1 == toFp(T, hOut))
+
+    hOut.reset()
+    let inFp2 = T.fromUInt(123547'u32)
+    nvrtc.execute("testCneg", (hOut), (inFp2, false))
+    doAssert bool(inFp2 == toFp(T, hOut))
+
+    hOut.reset()
+    nvrtc.execute("testCneg", (hOut), (inFp2, true))
+    var exp2 = inFp2
+    exp2.neg()
+    doAssert bool(exp2 == toFp(T, hOut))
+
+  block ShiftRight:
+    #let inFp1 = T.fromUInt(8'u32)
+    hOut.reset()
+    var inp: array[8, uint32]
+    inp[0] = 8
+    nvrtc.execute("testShiftRight", (hOut), (inp, 2))
+    var exp: array[8, uint32]
+    exp[0] = 2
+    doAssert exp == hOut
+
+    hOut.reset()
+    nvrtc.execute("testShiftRight", (hOut), (inp, 3))
+    exp[0] = 1
+    doAssert exp == hOut
+
+    hOut.reset()
+    inp[0] = 15
+    nvrtc.execute("testShiftRight", (hOut), (inp, 1))
+    var exp3 = matchingBigInt(T.Name).fromUInt(15'u32)
+    exp3.shiftRight(1)
+    doAssert exp3.limbs.mapIt(it.uint32) == hOut.mapIt(it.uint32)
+
+  block Div2:
+    hOut.reset()
+    let inFp1 = T.fromUInt(8'u32)
+    nvrtc.execute("testDiv2", (hOut), (inFp1))
+    let exp1 = T.fromUInt(4'u32)
+    var expAF = inFp1
+    expAF.div2()
+    doAssert bool(exp1 == toFp(T, hOut))
+
+    hOut.reset()
+    let inFp2 = T.fromUInt(4096'u32)
+    nvrtc.execute("testDiv2", (hOut), (inFp2))
+    let exp2 = T.fromUInt(2048'u32)
+    doAssert bool(exp2 == toFp(T, hOut))
+
+    hOut.reset()
+    let inFp3 = T.fromUInt(15'u32)
+    nvrtc.execute("testDiv2", (hOut), (inFp3))
+    var exp3 = T.fromUInt(15'u32)
+    exp3.div2()
+    doAssert bool(exp3 == toFp(T, hOut))
+
+
+when isMainModule:
+  main()
diff --git a/tests/gpu/t_nvrtc_pass_pointer.nim b/tests/gpu/t_nvrtc_pass_pointer.nim
new file mode 100644
index 000000000..8a0e8dc57
--- /dev/null
+++ b/tests/gpu/t_nvrtc_pass_pointer.nim
@@ -0,0 +1,57 @@
+import std / strformat
+
+import
+  # Internal
+  constantine/named/algebras,
+  constantine/math/io/[io_bigints, io_fields],
+  constantine/math/arithmetic,
+  constantine/platforms/abstractions {.all.},
+  constantine/platforms/abis/nvidia_abi,
+  constantine/math_compiler/experimental/runtime_compile,
+  constantine/serialization/io_limbs
+
+import constantine/math_compiler/experimental/nvrtc_field_ops
+
+const N = 8
+type T = Fp[BN254_Snarks]
+const WordSize = 32
+const BigIntExample* = cuda:
+
+  defBigInt(N)
+  defPtxHelpers()
+  defCoreFieldOps()
+
+  proc testPointer(x: ptr BigInt) {.device.} =
+    ## just write data to the bigint
+    for i in 0 ..< 8:
+      x[][i] = i.uint32
+
+  proc test(output: ptr UncheckedArray[uint32]) {.global.} =
+    # Call `modadd` and assign to result
+
+    var t = BigInt()
+
+    testPointer(t.addr)
+
+    for i in 0 ..< 8:
+      output[i] = t[i]
+
+proc main =
+  var nvrtc = initNvrtc(BigIntExample)
+  # echo the generated CUDA code
+  echo BigIntExample
+
+  nvrtc.compile()
+  nvrtc.getPtx()
+
+  var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead)
+  nvrtc.numBlocks = 1
+  nvrtc.threadsPerBlock = 1
+
+  nvrtc.execute("test", (hOut), ())
+  for i in 0 ..< 8:
+    echo hOut[i]
+
+
+when isMainModule:
+  main()
diff --git a/tests/gpu/t_nvrtc_pass_var.nim b/tests/gpu/t_nvrtc_pass_var.nim
new file mode 100644
index 000000000..a7e0dcfd6
--- /dev/null
+++ b/tests/gpu/t_nvrtc_pass_var.nim
@@ -0,0 +1,57 @@
+import std / strformat
+
+import
+  # Internal
+  constantine/named/algebras,
+  constantine/math/io/[io_bigints, io_fields],
+  constantine/math/arithmetic,
+  constantine/platforms/abstractions {.all.},
+  constantine/platforms/abis/nvidia_abi,
+  constantine/math_compiler/experimental/runtime_compile,
+  constantine/serialization/io_limbs
+
+import constantine/math_compiler/experimental/nvrtc_field_ops
+
+const N = 8
+type T = Fp[BN254_Snarks]
+const WordSize = 32
+const BigIntExample* = cuda:
+
+  defBigInt(N)
+  defPtxHelpers()
+  defCoreFieldOps()
+
+  proc testPointer(x: var BigInt) {.device.} =
+    ## just write data to the bigint
+    for i in 0 ..< 8:
+      x[i] = i.uint32
+
+  proc test(output: ptr UncheckedArray[uint32]) {.global.} =
+    # Call `modadd` and assign to result
+
+    var t = BigInt()
+
+    testPointer(t)
+
+    for i in 0 ..< 8:
+      output[i] = t[i]
+
+proc main =
+  var nvrtc = initNvrtc(BigIntExample)
+  # echo the generated CUDA code
+  echo BigIntExample
+
+  nvrtc.compile()
+  nvrtc.getPtx()
+
+  var hOut: array[8, uint32] # storage for the output limbs (could also be a `Fp[BN254_Snarks]` instead)
+  nvrtc.numBlocks = 1
+  nvrtc.threadsPerBlock = 1
+
+  nvrtc.execute("test", (hOut), ())
+  for i in 0 ..< 8:
+    echo hOut[i]
+
+
+when isMainModule:
+  main()