From 353bd83186b78d349a5d4a7f1cd6bc6ad191520c Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 1 Apr 2026 15:12:46 +0200 Subject: [PATCH 1/4] Add support for tracing profilers like Nvidia NSight System and Intel VTune Delay init of domain fixup: formatting add color --- Project.toml | 6 ++++++ benchmark/CUDA/Project.toml | 2 ++ benchmark/CUDA/run.jl | 1 + ext/TrixiIntelITTExt.jl | 28 ++++++++++++++++++++++++++++ ext/TrixiNVTXExt.jl | 25 +++++++++++++++++++++++++ src/auxiliary/auxiliary.jl | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 98 insertions(+) create mode 100644 ext/TrixiIntelITTExt.jl create mode 100644 ext/TrixiNVTXExt.jl diff --git a/Project.toml b/Project.toml index b7848d38455..626e5d5f128 100644 --- a/Project.toml +++ b/Project.toml @@ -58,8 +58,10 @@ AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" Convex = "f65535da-76fb-5f13-bab9-19810c17039a" ECOS = "e2685f51-7e38-5353-a97d-a921fd2c8199" +IntelITT = "c9b2f978-7543-4802-ae44-75068f23ee64" Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a" NLsolve = "2774e3e8-f4cf-5e23-947b-6d7e65073b56" +NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" @@ -67,8 +69,10 @@ SparseConnectivityTracer = "9f842d2f-2579-4b1d-911e-f412cf18a3f5" TrixiAMDGPUExt = "AMDGPU" TrixiCUDAExt = "CUDA" TrixiConvexECOSExt = ["Convex", "ECOS"] +TrixiIntelITTExt = "IntelITT" TrixiMakieExt = "Makie" TrixiNLsolveExt = "NLsolve" +TrixiNVTXExt = ["NVTX", "CUDA"] TrixiPlotsExt = "Plots" TrixiSparseConnectivityTracerExt = "SparseConnectivityTracer" @@ -91,6 +95,7 @@ EllipsisNotation = "1.0" FillArrays = "1.9" ForwardDiff = "0.10.38, 1" HDF5 = "0.17" +IntelITT = "0.2" KernelAbstractions = "0.9.38" LinearAlgebra = "1" LinearMaps = "2.7, 3.0" @@ -99,6 +104,7 @@ MPI = "0.20.23" Makie = "0.22, 0.23, 0.24" MuladdMacro = "0.2.4" NLsolve = "4.5.1" +NVTX = "1.0.0" Octavian = 
"0.3.28" OffsetArrays = "1.13" P4est = "0.4.12" diff --git a/benchmark/CUDA/Project.toml b/benchmark/CUDA/Project.toml index 2e9f130fe6c..1bdbed2467b 100644 --- a/benchmark/CUDA/Project.toml +++ b/benchmark/CUDA/Project.toml @@ -1,6 +1,7 @@ [deps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +NVTX = "5da4648a-3479-48b8-97b9-01cb529c0a1f" OrdinaryDiffEqLowStorageRK = "b0944070-b475-4768-8dec-fb6eb410534d" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" Trixi = "a7f1ee26-1774-49b1-8366-f1abc58fbfcb" @@ -11,6 +12,7 @@ Trixi = {path = "../.."} [compat] CUDA = "5.8.2" JSON = "1.4.0" +NVTX = "1" OrdinaryDiffEqLowStorageRK = "1.12.0" TimerOutputs = "0.5.25" Trixi = "0.16" diff --git a/benchmark/CUDA/run.jl b/benchmark/CUDA/run.jl index b9d02246c9b..89907cb9e9b 100644 --- a/benchmark/CUDA/run.jl +++ b/benchmark/CUDA/run.jl @@ -1,5 +1,6 @@ using Trixi using CUDA +using NVTX # Load to get tracing support for Trixi using TimerOutputs using JSON diff --git a/ext/TrixiIntelITTExt.jl b/ext/TrixiIntelITTExt.jl new file mode 100644 index 00000000000..3ce1158ed6e --- /dev/null +++ b/ext/TrixiIntelITTExt.jl @@ -0,0 +1,28 @@ +module TrixiIntelITTExt + +using Trixi: CPU +import Trixi: trixi_range_active, trixi_range_start, trixi_range_end + +import IntelITT + +const domain = Ref{IntelITT.Domain}() +function __init__() + domain[] = IntelITT.Domain("Trixi") +end + +function trixi_range_active(::Union{Nothing, CPU}) + return IntelITT.isactive() +end + +function trixi_range_start(::Union{Nothing, CPU}, label) + task = IntelITT.Task(domain[], label) + IntelITT.start(task) + return task +end + +function trixi_range_end(::Union{Nothing, CPU}, id) + IntelITT.stop(id) + return nothing +end + +end # module diff --git a/ext/TrixiNVTXExt.jl b/ext/TrixiNVTXExt.jl new file mode 100644 index 00000000000..69c79c7a3d0 --- /dev/null +++ b/ext/TrixiNVTXExt.jl @@ -0,0 +1,25 @@ +module TrixiNVTXExt + +using NVTX +using CUDA: CUDABackend +import Trixi: 
trixi_range_active, trixi_range_start, trixi_range_end + +# One can also use Nsight Systems and thus NVTX for CPU code + +const domain = NVTX.Domain("Trixi") +const color = 0xff40e0d0 # turquoise + +function trixi_range_active(::CUDABackend) + return NVTX.isactive() +end + +function trixi_range_start(::CUDABackend, label) + return NVTX.range_start(NVTX.init!(domain); message = label, color = color) +end + +function trixi_range_end(::CUDABackend, id) + NVTX.range_end(id) + return nothing +end + +end # module diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 28e4c47d339..887b8e10746 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -82,6 +82,35 @@ end return ncalls_first end +# TODO: move to KernelAbstractions +""" + trixi_range_active(backend) + +Returns `true` if the given `backend` supports range annotations and a profiler is active, `false` otherwise. +""" +function trixi_range_active(backend::Any) + return false +end + +""" + trixi_range_start(backend, label) + +Starts a range annotation for the given `backend` with the specified `label`. +Returns a handle to the started range, which should be passed to `trixi_range_end` to end the range annotation. +""" +function trixi_range_start(backend::Any, label) + return nothing +end + +""" + trixi_range_end(backend, id) + +Ends a range annotation for the given `backend` with the specified `id`. +""" +function trixi_range_end(backend::Any, id) + return nothing +end + """ @trixi_timeit_ext backend timer() "some label" expression @@ -93,10 +122,17 @@ See also [`@trixi_timeit`](@ref). 
""" macro trixi_timeit_ext(backend, timer_output, label, expr) expr = quote + local active = $trixi_range_active($(esc(backend))) + if active + id = $trixi_range_start($(esc(backend)), $(esc(label))) + end local val = $(esc(expr)) if $(esc(backend)) !== nothing && $(TrixiBase).timeit_debug_enabled() $(KernelAbstractions.synchronize)($(esc(backend))) end + if active + $trixi_range_end($(esc(backend)), id) + end val end return :(@trixi_timeit($(esc(timer_output)), $(esc(label)), $(expr))) From 3bac91de5b0e5971a14f7125a581a1ec7cd83805 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Thu, 2 Apr 2026 16:35:53 +0200 Subject: [PATCH 2/4] add first documentation draft --- docs/src/performance.md | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/docs/src/performance.md b/docs/src/performance.md index 3d4ff8fdfce..505663735ae 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -288,3 +288,44 @@ requires. It can thus be seen as a proxy for "energy used" and, as an extension, timing result, you need to set the analysis interval such that the `AnalysisCallback` is invoked at least once during the course of the simulation and discard the first PID value. + +## Tracing support for profilers + +Trixi supports tracing profiler integration through [ittapi](https://github.com/intel/ittapi) for Intel VTune and [NVTX](https://github.com/NVIDIA/NVTX) for [NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems). + +!!! note "Extensions" + Tracing support is implemented through extensions and requires trigger packages to be loaded. + +Tracing support is only available for regions that are instrumented with `@trixi_timeit_ext`. + +### Using Intel VTune + +We can use Intel VTune to profile CPU code. For more information see the [Julia documentation](https://docs.julialang.org/en/v1/manual/profile/#External-Profiling) and the [IntelITT.jl](https://github.com/JuliaPerf/IntelITT.jl) package. + +!!! 
note "Trigger package" + ```julia + using IntelITT + ``` + +To get the most out of Intel VTune we recommend passing the environment flag `ENABLE_JITPROFILING=1` to Julia, which will allow you to symbolize JIT compiled call frames. + +!!! note "Usage of `juliaup`" + Sometimes `juliaup` can make it harder for a profiler to attach to the right process. You can use `Base.julia_cmd()` in the REPL to obtain the path to the actual Julia binary you will be running. + + +### NVIDIA Nsight Systems + +We can use NVIDIA Nsight Systems to trace GPU code. + +We recommend reading the CUDA.jl documentation on using [Nsight Systems](https://cuda.juliagpu.org/stable/development/profiling/#NVIDIA-Nsight-Systems). + +!!! note "Trigger package" + ```julia + using CUDA + using NVTX + ``` + +You can also just use `CUDA.@profile` (see [Integrated Profiler](https://cuda.juliagpu.org/stable/development/profiling/#Integrated-profiler)) to obtain profiler results that include the NVTX ranges. + +#### Known limitation +Nsight Systems can also be used for CPU and in particular MPI codes. The Trixi extension will only be enabled when GPU backend is being used. From 58fedb15dd98f967bbe240eba023a7e717027e6f Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Thu, 23 Apr 2026 16:01:30 +0200 Subject: [PATCH 3/4] Apply suggestions from code review Co-authored-by: Michael Schlottke-Lakemper --- docs/src/performance.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/docs/src/performance.md b/docs/src/performance.md index 505663735ae..a147b26bf59 100644 --- a/docs/src/performance.md +++ b/docs/src/performance.md @@ -291,12 +291,12 @@ requires. It can thus be seen as a proxy for "energy used" and, as an extension, ## Tracing support for profilers -Trixi supports tracing profiler integration through [ittapi](https://github.com/intel/ittapi) for Intel VTune and [NVTX](https://github.com/NVIDIA/NVTX) for [NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems). 
+Trixi.jl supports tracing profiler integration through [ittapi](https://github.com/intel/ittapi) for Intel VTune and [NVTX](https://github.com/NVIDIA/NVTX) for [NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems). !!! note "Extensions" Tracing support is implemented through extensions and requires trigger packages to be loaded. -Tracing support is only available for regions that are instrumented with `@trixi_timeit_ext`. +Tracing support is only available for regions that are instrumented with [`@trixi_timeit_ext`](@ref). ### Using Intel VTune @@ -308,6 +308,7 @@ We can use Intel VTune to profile CPU code. For more information see the [Julia ``` To get the most out of Intel VTune we recommend passing the environment flag `ENABLE_JITPROFILING=1` to Julia, which will allow you to symbolize JIT compiled call frames. +Otherwise, instead of the Julia function names you will only see anonymous C function calls in the trace. !!! note "Usage of `juliaup`" Sometimes `juliaup` can make it harder for a profiler to attach to the right process. You can use `Base.julia_cmd()` in the REPL to obtain the path to the actual Julia binary you will be running. @@ -328,4 +329,4 @@ We recommend reading the CUDA.jl documentation on using [Nsight Systems](https:/ You can also just use `CUDA.@profile` (see [Integrated Profiler](https://cuda.juliagpu.org/stable/development/profiling/#Integrated-profiler)) to obtain profiler results that include the NVTX ranges. #### Known limitation -Nsight Systems can also be used for CPU and in particular MPI codes. The Trixi extension will only be enabled when GPU backend is being used. +Nsight Systems can also be used for CPU and in particular MPI codes. The Trixi.jl extension will only be enabled when a GPU backend is being used. 
From e42a6e7963285bb39eab3832c0e6a8e9d06123b1 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Thu, 23 Apr 2026 16:11:59 +0200 Subject: [PATCH 4/4] Address PR review comments - Rename `trixi_range_*` to `profiling_range_*` - Add descriptive comments to TrixiIntelITTExt and TrixiNVTXExt - Fix formatting at EOF in TrixiIntelITTExt and TrixiNVTXExt Co-authored-by: Antigravity Co-authored-by: Gemini 3.1 Pro (High) --- ext/TrixiIntelITTExt.jl | 12 +++++++----- ext/TrixiNVTXExt.jl | 10 ++++++---- src/auxiliary/auxiliary.jl | 20 ++++++++++---------- 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/ext/TrixiIntelITTExt.jl b/ext/TrixiIntelITTExt.jl index 3ce1158ed6e..51e2f9ec752 100644 --- a/ext/TrixiIntelITTExt.jl +++ b/ext/TrixiIntelITTExt.jl @@ -1,7 +1,9 @@ module TrixiIntelITTExt +# This extension provides tracing profiler integration for Intel VTune via IntelITT.jl. + using Trixi: CPU -import Trixi: trixi_range_active, trixi_range_start, trixi_range_end +import Trixi: profiling_range_active, profiling_range_start, profiling_range_end import IntelITT @@ -10,19 +12,19 @@ function __init__() domain[] = IntelITT.Domain("Trixi") end -function trixi_range_active(::Union{Nothing, CPU}) +function profiling_range_active(::Union{Nothing, CPU}) return IntelITT.isactive() end -function trixi_range_start(::Union{Nothing, CPU}, label) +function profiling_range_start(::Union{Nothing, CPU}, label) task = IntelITT.Task(domain[], label) IntelITT.start(task) return task end -function trixi_range_end(::Union{Nothing, CPU}, id) +function profiling_range_end(::Union{Nothing, CPU}, id) IntelITT.stop(id) return nothing end -end # module +end diff --git a/ext/TrixiNVTXExt.jl b/ext/TrixiNVTXExt.jl index 69c79c7a3d0..8125b31400b 100644 --- a/ext/TrixiNVTXExt.jl +++ b/ext/TrixiNVTXExt.jl @@ -1,23 +1,25 @@ module TrixiNVTXExt +# This extension provides tracing profiler integration for NVIDIA Nsight Systems via NVTX.jl. 
+ using NVTX using CUDA: CUDABackend -import Trixi: trixi_range_active, trixi_range_start, trixi_range_end +import Trixi: profiling_range_active, profiling_range_start, profiling_range_end # One can also use Nsight Systems and thus NVTX for CPU code const domain = NVTX.Domain("Trixi") const color = 0xff40e0d0 # turquoise -function trixi_range_active(::CUDABackend) +function profiling_range_active(::CUDABackend) return NVTX.isactive() end -function trixi_range_start(::CUDABackend, label) +function profiling_range_start(::CUDABackend, label) return NVTX.range_start(NVTX.init!(domain); message = label, color = color) end -function trixi_range_end(::CUDABackend, id) +function profiling_range_end(::CUDABackend, id) NVTX.range_end(id) return nothing end diff --git a/src/auxiliary/auxiliary.jl b/src/auxiliary/auxiliary.jl index 887b8e10746..12fde9e6396 100644 --- a/src/auxiliary/auxiliary.jl +++ b/src/auxiliary/auxiliary.jl @@ -84,30 +84,30 @@ end # TODO: move to KernelAbstractions """ - trixi_range_active(backend) + profiling_range_active(backend) Returns `true` if the given `backend` supports range annotations and a profiler is active, `false` otherwise. """ -function trixi_range_active(backend::Any) +function profiling_range_active(backend::Any) return false end """ - trixi_range_start(backend, label) + profiling_range_start(backend, label) Starts a range annotation for the given `backend` with the specified `label`. -Returns a handle to the started range, which should be passed to `trixi_range_end` to end the range annotation. +Returns a handle to the started range, which should be passed to `profiling_range_end` to end the range annotation. """ -function trixi_range_start(backend::Any, label) +function profiling_range_start(backend::Any, label) return nothing end """ - trixi_range_end(backend, id) + profiling_range_end(backend, id) Ends a range annotation for the given `backend` with the specified `id`. 
""" -function trixi_range_end(backend::Any, id) +function profiling_range_end(backend::Any, id) return nothing end @@ -122,16 +122,16 @@ See also [`@trixi_timeit`](@ref). """ macro trixi_timeit_ext(backend, timer_output, label, expr) expr = quote - local active = $trixi_range_active($(esc(backend))) + local active = $profiling_range_active($(esc(backend))) if active - id = $trixi_range_start($(esc(backend)), $(esc(label))) + id = $profiling_range_start($(esc(backend)), $(esc(label))) end local val = $(esc(expr)) if $(esc(backend)) !== nothing && $(TrixiBase).timeit_debug_enabled() $(KernelAbstractions.synchronize)($(esc(backend))) end if active - $trixi_range_end($(esc(backend)), id) + $profiling_range_end($(esc(backend)), id) end val end