diff --git a/Project.toml b/Project.toml index b7848d38455..626e5d5f128 100644 --- a/Project.toml +++ b/Project.toml @@ -58,8 +58,10 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" +IntelITT = "c9b2f978-7543-4802-ae44-75068f23ee64" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" +NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" @@ -67,8 +69,10 @@ SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" TrixiAMDGPUExt = "AMDGPU" TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] +TrixiIntelITTExt = "IntelITT" TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" +TrixiNVTXExt = ["NVTX", "CUDA"] TrixiPlotsExt = "Plots" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" @@ -91,6 +95,7 @@ EllipsisNotation = "1.0" FillArrays = "1.9" ForwardDiff = "0.10.38, 1" HDF5 = "0.17" +IntelITT = "0.2" KernelAbstractions = "0.9.38" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" @@ -99,6 +104,7 @@ MPI = "0.20.23" Makie = "0.22, 0.23, 0.24" MuladdMacro = "0.2.4" NLsolve = "4.5.1" +NVTX = "1.0.0" Octavian = "0.3.28" OffsetArrays = "1.13" P4est = "0.4.12" diff --git a/benchmark/CUDA/Project.toml b/benchmark/CUDA/Project.toml index 2e9f130fe6c..1bdbed2467b 100644 --- a/benchmark/CUDA/Project.toml +++ b/benchmark/CUDA/Project.toml @@ -1,6 +1,7 @@ [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" @@ -11,6 +12,7 @@ Trixi = {path = "../.."} [compat] CUDA = "5.8.2" JSON = "1.4.0" +NVTX = "1" 
OrdinaryDiffEqLowStorageRK = "1.12.0" TimerOutputs = "0.5.25" Trixi = "0.16" diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl index b9d02246c9b..89907cb9e9b 100644 --- a/benchmark/CUDA/run.jl +++ b/benchmark/CUDA/run.jl @@ -1,5 +1,6 @@ using Trixi using CUDA +using NVTX # Load to get tracing support for Trixi using TimerOutputs using JSON diff --git a/docs/src/performance.md b/docs/src/performance.md index 3d4ff8fdfce..a147b26bf59 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -288,3 +288,45 @@ requires. It can thus be seen as a proxy for "energy used" and, as an extension, timing result, you need to set the analysis interval such that the `AnalysisCallback` is invoked at least once during the course of the simulation and discard the first PID value. + +## Tracing support for profilers + +Trixi.jl supports tracing profiler integration through [ittapi](https://github.com/intel/ittapi) for Intel VTune and [NVTX](https://github.com/NVIDIA/NVTX) for [NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems). + +!!! note "Extensions" + Tracing support is implemented through extensions and requires trigger packages to be loaded. + +Tracing support is only available for regions that are instrumented with [`@trixi_timeit_ext`](@ref). + +### Using Intel VTune + +We can use Intel VTune to profile CPU code. For more information see the [Julia documentation](https://docs.julialang.org/en/v1/manual/profile/#External-Profiling) and the [IntelITT.jl](https://github.com/JuliaPerf/IntelITT.jl) package. + +!!! note "Trigger package" + ```julia + using IntelITT + ``` + +To get the most out of Intel VTune we recommend passing the environment flag `ENABLE_JITPROFILING=1` to Julia, which will allow you to symbolize JIT compiled call frames. +Otherwise, instead of the Julia function names you will only see anonymous C function calls in the trace. + +!!! 
note "Usage of `juliaup`" +    Sometimes `juliaup` can make it harder for a profiler to attach to the right process. You can use `Base.julia_cmd()` in the REPL to obtain the path to the actual Julia binary you will be running. + + +### NVIDIA Nsight Systems + +We can use NVIDIA Nsight Systems to trace GPU code. + +We recommend reading the CUDA.jl documentation on using [Nsight Systems](https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems). + +!!! note "Trigger package" +    ```julia +    using CUDA +    using NVTX +    ``` + +You can also just use `CUDA.@profile` (see [Integrated Profiler](https://cuda.juliagpu.org/stable/development/profiling/#Integrated-profiler)) to obtain profiler results that include the NVTX ranges. + +#### Known limitation +Nsight Systems can also be used for CPU and in particular MPI codes. The Trixi.jl extension will only be enabled when a GPU backend is being used. diff --git a/ext/TrixiIntelITTExt.jl b/ext/TrixiIntelITTExt.jl new file mode 100644 index 00000000000..51e2f9ec752 --- /dev/null +++ b/ext/TrixiIntelITTExt.jl @@ -0,0 +1,30 @@ +module TrixiIntelITTExt + +# This extension provides tracing profiler integration for Intel VTune via IntelITT.jl.
+ +using Trixi: CPU +import Trixi: profiling_range_active, profiling_range_start, profiling_range_end + +import IntelITT + +const domain = Ref{IntelITT.Domain}() +function __init__() + domain[] = IntelITT.Domain("Trixi") +end + +function profiling_range_active(::Union{Nothing, CPU}) + return IntelITT.isactive() +end + +function profiling_range_start(::Union{Nothing, CPU}, label) + task = IntelITT.Task(domain[], label) + IntelITT.start(task) + return task +end + +function profiling_range_end(::Union{Nothing, CPU}, id) + IntelITT.stop(id) + return nothing +end + +end diff --git a/ext/TrixiNVTXExt.jl b/ext/TrixiNVTXExt.jl new file mode 100644 index 00000000000..8125b31400b --- /dev/null +++ b/ext/TrixiNVTXExt.jl @@ -0,0 +1,27 @@ +module TrixiNVTXExt + +# This extension provides tracing profiler integration for NVIDIA Nsight Systems via NVTX.jl. + +using NVTX +using CUDA: CUDABackend +import Trixi: profiling_range_active, profiling_range_start, profiling_range_end + +# One can also use Nsight Systems and thus NVTX for CPU code + +const domain = NVTX.Domain("Trixi") +const color = 0xff40e0d0 # turquoise + +function profiling_range_active(::CUDABackend) + return NVTX.isactive() +end + +function profiling_range_start(::CUDABackend, label) + return NVTX.range_start(NVTX.init!(domain); message = label, color = color) +end + +function profiling_range_end(::CUDABackend, id) + NVTX.range_end(id) + return nothing +end + +end # module diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 28e4c47d339..12fde9e6396 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -82,6 +82,35 @@ end return ncalls_first end +# TODO: move to KernelAbstractions +""" + profiling_range_active(backend) + +Returns `true` if the given `backend` supports range annotations and a profiler is active, `false` otherwise. 
+""" +function profiling_range_active(backend::Any) + return false +end + +""" + profiling_range_start(backend, label) + +Starts a range annotation for the given `backend` with the specified `label`. +Returns a handle to the started range, which should be passed to `profiling_range_end` to end the range annotation. +""" +function profiling_range_start(backend::Any, label) + return nothing +end + +""" + profiling_range_end(backend, id) + +Ends a range annotation for the given `backend` with the specified `id`. +""" +function profiling_range_end(backend::Any, id) + return nothing +end + """ @trixi_timeit_ext backend timer() "some label" expression @@ -93,10 +122,17 @@ See also [`@trixi_timeit`](@ref). """ macro trixi_timeit_ext(backend, timer_output, label, expr) expr = quote + local active = $profiling_range_active($(esc(backend))) + if active + id = $profiling_range_start($(esc(backend)), $(esc(label))) + end local val = $(esc(expr)) if $(esc(backend)) !== nothing && $(TrixiBase).timeit_debug_enabled() $(KernelAbstractions.synchronize)($(esc(backend))) end + if active + $profiling_range_end($(esc(backend)), id) + end val end return :(@trixi_timeit($(esc(timer_output)), $(esc(label)), $(expr)))