From 35e55ddc8a862b676009359edaa80be6497ce506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Fri, 20 Feb 2026 11:55:48 +0100 Subject: [PATCH 01/11] =?UTF-8?q?add=20Diagonal=20overload=20f=C3=BCr=20ma?= =?UTF-8?q?ss=20matrix?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl | 5 +++++ .../test/sparse_algebraic_detection_tests.jl | 10 ++++++++++ 2 files changed, 15 insertions(+) diff --git a/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl b/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl index bee55d5f6b6..668e9745799 100644 --- a/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl +++ b/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl @@ -29,6 +29,11 @@ function find_algebraic_vars_eqs(M::SparseMatrixCSC) return algebraic_vars, algebraic_eqs end +function find_algebraic_vars_eqs(M::LinearAlgebra.Diagonal) + _idxs = map(iszero, LinearAlgebra.diag(M)) + return _idxs, _idxs +end + # Fallback for non-sparse matrices (original behavior) function find_algebraic_vars_eqs(M::AbstractMatrix) algebraic_vars = vec(all(iszero, M, dims = 1)) diff --git a/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl b/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl index 894b87af451..1f5cb9eca01 100644 --- a/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl +++ b/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl @@ -79,4 +79,14 @@ using OrdinaryDiffEqNonlinearSolve: find_algebraic_vars_eqs @test vars == [false, true] @test eqs == [false, true] end + + # Test 4: Test Diagonal case + @testset "Test Diagonal cast" begin + M_diag = Diagonal([1.0, 0.0, 1.0, 1.0, 0.0]) + vars, eqs = find_algebraic_vars_eqs(M_diag) + @test vars == [false, true, false, false, true] + @test eqs == [false, true, false, false, true] + # compare to dense + @test find_algebraic_vars_eqs(M_diag) == find_algebraic_vars_eqs(collect(M_diag)) + end end From 1ae3b732448b4ac01dd2f20fa32170e7fe0e14e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Fri, 20 Feb 2026 19:07:57 +0100 Subject: [PATCH 02/11] fix dae gpu compat --- .../src/derivative_utils.jl | 10 ++++- test/gpu/simple_dae.jl | 40 ++++++++++++------- 2 files changed, 33 insertions(+), 17 deletions(-) diff --git a/lib/OrdinaryDiffEqDifferentiation/src/derivative_utils.jl b/lib/OrdinaryDiffEqDifferentiation/src/derivative_utils.jl index 51608029b91..a976e2e28bf 100644 --- a/lib/OrdinaryDiffEqDifferentiation/src/derivative_utils.jl +++ b/lib/OrdinaryDiffEqDifferentiation/src/derivative_utils.jl @@ -14,7 +14,7 @@ function calc_tderivative!(integrator, cache, dtd1, repeat_step) tf.p = p alg = unwrap_alg(integrator, true) - autodiff_alg = ADTypes.dense_ad(gpu_safe_autodiff(alg_autodiff(alg), u)) + autodiff_alg = gpu_safe_autodiff(ADTypes.dense_ad(alg_autodiff(alg)), u) # Convert t to eltype(dT) if using ForwardDiff, to make FunctionWrappers work t = autodiff_alg isa AutoForwardDiff ? convert(eltype(dT), t) : t @@ -59,7 +59,7 @@ function calc_tderivative(integrator, cache) tf.u = uprev tf.p = p - autodiff_alg = ADTypes.dense_ad(gpu_safe_autodiff(alg_autodiff(alg), u)) + autodiff_alg = gpu_safe_autodiff(ADTypes.dense_ad(alg_autodiff(alg)), u) if alg_autodiff isa AutoFiniteDiff autodiff_alg = SciMLBase.@set autodiff_alg.dir = diffdir(integrator) @@ -403,6 +403,12 @@ function jacobian2W!( else @.. broadcast = false @view(W[idxs]) = muladd(λ, invdtgamma, @view(J[idxs])) end + elseif is_sparse(W) && !ArrayInterface.fast_scalar_indexing(nonzeros(W)) + # Sparse GPU arrays (e.g. CuSparseMatrixCSC/CSR) don't support broadcasting. + # ArrayInterface.fast_scalar_indexing is not specialized for AbstractGPUSparseArray, + # so we detect them by checking if the underlying nonzeros storage is a GPU array. + # we then fall back to allocating matrix arithmetic + copyto!(W, J - invdtgamma * mass_matrix) else @.. broadcast = false W = muladd(-mass_matrix, invdtgamma, J) end diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl index 1b39a42cd1a..c1a346f0108 100644 --- a/test/gpu/simple_dae.jl +++ b/test/gpu/simple_dae.jl @@ -5,6 +5,7 @@ using LinearAlgebra using Adapt using SparseArrays using Test +using CUDSS #= du[1] = -u[1] @@ -38,30 +39,39 @@ tspan = (0.0, 5.0) prob = ODEProblem(odef, u0, tspan, p) sol = solve(prob, Rodas5P()) -# gpu version mass_matrix_d = adapt(CuArray, mass_matrix) - -# TODO: jac_prototype fails -# jac_prototype_d = adapt(CuArray, jac_prototype) -# jac_prototype_d = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype) -jac_prototype_d = nothing +jac_prototype_d_csc = CUDA.CUSPARSE.CuSparseMatrixCSC(jac_prototype) +jac_prototype_d_csr = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype) u0_d = adapt(CuArray, u0) p_d = adapt(CuArray, p) -odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d) -prob_d = ODEProblem(odef_d, u0_d, tspan, p_d) -sol_d = solve(prob_d, Rodas5P()) + +odef_d_dns = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = nothing) +odef_d_csc = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d_csc) +odef_d_csr = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d_csr) + +prob_d_dns = ODEProblem(odef_d_dns, u0_d, tspan, p_d) +prob_d_csc = ODEProblem(odef_d_csc, u0_d, tspan, p_d) +prob_d_csr = ODEProblem(odef_d_csr, u0_d, tspan, p_d) + +sol_d_dns = solve(prob_d_dns, Rodas5P()) # works +sol_d_csc = solve(prob_d_csc, Rodas5P()) # uses Krylov +sol_d_csr = solve(prob_d_csr, Rodas5P()) # uses CUDSS @testset "Test constraints in GPU sol" begin - for t in sol_d.t - u = Vector(sol_d(t)) - @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6) - @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6) + for sol_d in [sol_d_dns, sol_d_csc, sol_d_csr] + for t in sol_d.t + u = Vector(sol_d(t)) + @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6) + @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6) + end end end @testset "Compare GPU to CPU solution" begin - for t in tspan[begin]:0.1:tspan[end] - @test isapprox(Vector(sol_d(t)), sol(t); rtol = 1.0e-4) + for sol_d in [sol_d_dns, sol_d_csc, sol_d_csr] + for t in tspan[begin]:0.1:tspan[end] + @test Vector(sol_d(t)) ≈ sol(t) atol=1e-5 + end end end From f83c90dc7dfee5d1b354e8cc797c598e3960d415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 08:34:34 +0100 Subject: [PATCH 03/11] move find_algebraic_vars_eqs to OrdinayDiffEqCore --- .../ext/OrdinaryDiffEqCoreSparseArraysExt.jl | 26 +++++++++- .../src/OrdinaryDiffEqCore.jl | 2 +- lib/OrdinaryDiffEqCore/src/misc_utils.jl | 24 ++++++++++ .../test/algebraic_vars_detection_tests.jl} | 2 +- lib/OrdinaryDiffEqCore/test/runtests.jl | 1 + .../src/OrdinaryDiffEqNonlinearSolve.jl | 2 +- .../src/initialize_dae.jl | 48 ------------------- .../test/runtests.jl | 1 - 8 files changed, 53 insertions(+), 53 deletions(-) rename lib/{OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl => OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl} (98%) diff --git a/lib/OrdinaryDiffEqCore/ext/OrdinaryDiffEqCoreSparseArraysExt.jl b/lib/OrdinaryDiffEqCore/ext/OrdinaryDiffEqCoreSparseArraysExt.jl index 6b57562971f..6a311b0792b 100644 --- a/lib/OrdinaryDiffEqCore/ext/OrdinaryDiffEqCoreSparseArraysExt.jl +++ b/lib/OrdinaryDiffEqCore/ext/OrdinaryDiffEqCoreSparseArraysExt.jl @@ -1,7 +1,7 @@ module OrdinaryDiffEqCoreSparseArraysExt using SparseArrays: SparseMatrixCSC -import OrdinaryDiffEqCore: _isdiag +import OrdinaryDiffEqCore: _isdiag, find_algebraic_vars_eqs # Efficient O(nnz) isdiag check for sparse matrices. # Standard isdiag is O(n²) which is prohibitively slow for large sparse matrices. @@ -22,4 +22,28 @@ function _isdiag(A::SparseMatrixCSC) return true end +""" + find_algebraic_vars_eqs(M::SparseMatrixCSC) + +O(nnz) detection of algebraic variables (zero columns) and equations (zero rows). +""" +function find_algebraic_vars_eqs(M::SparseMatrixCSC) + n_cols = size(M, 2) + n_rows = size(M, 1) + + algebraic_vars = fill(true, n_cols) + algebraic_eqs = fill(true, n_rows) + + @inbounds for j in 1:n_cols + for idx in M.colptr[j]:(M.colptr[j + 1] - 1) + if !iszero(M.nzval[idx]) + algebraic_vars[j] = false + algebraic_eqs[M.rowval[idx]] = false + end + end + end + + return algebraic_vars, algebraic_eqs +end + end diff --git a/lib/OrdinaryDiffEqCore/src/OrdinaryDiffEqCore.jl b/lib/OrdinaryDiffEqCore/src/OrdinaryDiffEqCore.jl index fbfd8a743b7..d774a540d2d 100644 --- a/lib/OrdinaryDiffEqCore/src/OrdinaryDiffEqCore.jl +++ b/lib/OrdinaryDiffEqCore/src/OrdinaryDiffEqCore.jl @@ -15,7 +15,7 @@ import Logging: @logmsg, LogLevel using MuladdMacro: @muladd -using LinearAlgebra: opnorm, I, UniformScaling, diag, rank, isdiag +using LinearAlgebra: opnorm, I, UniformScaling, diag, rank, isdiag, Diagonal import PrecompileTools diff --git a/lib/OrdinaryDiffEqCore/src/misc_utils.jl b/lib/OrdinaryDiffEqCore/src/misc_utils.jl index a5c672cba95..65a6e56c21b 100644 --- a/lib/OrdinaryDiffEqCore/src/misc_utils.jl +++ b/lib/OrdinaryDiffEqCore/src/misc_utils.jl @@ -131,6 +131,30 @@ end # Sparse specialization is provided in OrdinaryDiffEqCoreSparseArraysExt _isdiag(A::AbstractMatrix) = isdiag(A) +""" + find_algebraic_vars_eqs(M) + +Find algebraic variables (zero columns) and algebraic equations (zero rows) from mass matrix. +Returns `(algebraic_vars, algebraic_eqs)` as boolean arrays (true = algebraic). + +Works on CPU and GPU arrays. Sparse specialization (O(nnz)) is provided in +OrdinaryDiffEqCoreSparseArraysExt. +""" +function find_algebraic_vars_eqs(M::Diagonal) + _idxs = map(iszero, diag(M)) + return _idxs, _idxs +end + +function find_algebraic_vars_eqs(M::AbstractMatrix) + algebraic_vars = vec(all(iszero, M, dims=1)) + algebraic_eqs = vec(all(iszero, M, dims=2)) + return algebraic_vars, algebraic_eqs +end + +function find_algebraic_vars_eqs(M::AbstractSciMLOperator) + return find_algebraic_vars_eqs(convert(AbstractMatrix, M)) +end + isnewton(::Any) = false function _bool_to_ADType(::Val{true}, ::Val{CS}, _) where {CS} diff --git a/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl b/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl similarity index 98% rename from lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl rename to lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl index 1f5cb9eca01..3b3b3caa1af 100644 --- a/lib/OrdinaryDiffEqNonlinearSolve/test/sparse_algebraic_detection_tests.jl +++ b/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl @@ -1,6 +1,6 @@ using Test using SparseArrays -using OrdinaryDiffEqNonlinearSolve: find_algebraic_vars_eqs +using OrdinaryDiffEqCore: find_algebraic_vars_eqs @testset "Sparse Algebraic Detection Performance" begin # Test 1: Correctness - results should match between sparse and dense methods diff --git a/lib/OrdinaryDiffEqCore/test/runtests.jl b/lib/OrdinaryDiffEqCore/test/runtests.jl index 2a9ad4e8615..f5a294c3d20 100644 --- a/lib/OrdinaryDiffEqCore/test/runtests.jl +++ b/lib/OrdinaryDiffEqCore/test/runtests.jl @@ -24,4 +24,5 @@ end # Functional tests if TEST_GROUP == "Core" || TEST_GROUP == "ALL" @time @safetestset "Sparse isdiag Performance" include("sparse_isdiag_tests.jl") + @time @safetestset "Algebraic Vars Detection" include("algebraic_vars_detection_tests.jl") end diff --git a/lib/OrdinaryDiffEqNonlinearSolve/src/OrdinaryDiffEqNonlinearSolve.jl b/lib/OrdinaryDiffEqNonlinearSolve/src/OrdinaryDiffEqNonlinearSolve.jl index 60cd480372f..f72e8189de2 100644 --- a/lib/OrdinaryDiffEqNonlinearSolve/src/OrdinaryDiffEqNonlinearSolve.jl +++ b/lib/OrdinaryDiffEqNonlinearSolve/src/OrdinaryDiffEqNonlinearSolve.jl @@ -50,7 +50,7 @@ using OrdinaryDiffEqCore: resize_nlsolver!, _initialize_dae!, FastConvergence, Convergence, SlowConvergence, VerySlowConvergence, Divergence, NLStatus, MethodType, alg_order, error_constant, - alg_extrapolates, resize_J_W!, has_autodiff + alg_extrapolates, resize_J_W!, has_autodiff, find_algebraic_vars_eqs import OrdinaryDiffEqCore: _initialize_dae!, _default_dae_init!, isnewton, get_W, isfirstcall, isfirststage, diff --git a/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl b/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl index 668e9745799..b2c4ba9cc5e 100644 --- a/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl +++ b/lib/OrdinaryDiffEqNonlinearSolve/src/initialize_dae.jl @@ -1,51 +1,3 @@ -# Efficient algebraic variable/equation detection for sparse mass matrices. -# O(nnz) instead of O(n²) for sparse matrices. -""" - find_algebraic_vars_eqs(M::SparseMatrixCSC) - -Find algebraic variables (zero columns) and algebraic equations (zero rows) from mass matrix. -Returns (algebraic_vars::Vector{Bool}, algebraic_eqs::Vector{Bool}). - -For sparse matrices, uses O(nnz) traversal of CSC structure instead of O(n²) iteration. -""" -function find_algebraic_vars_eqs(M::SparseMatrixCSC) - n_cols = size(M, 2) - n_rows = size(M, 1) - - # Initialize all as algebraic (true = zero column/row) - algebraic_vars = fill(true, n_cols) - algebraic_eqs = fill(true, n_rows) - - # Mark columns/rows with non-zero values as differential (false) - @inbounds for j in 1:n_cols - for idx in M.colptr[j]:(M.colptr[j + 1] - 1) - if !iszero(M.nzval[idx]) - algebraic_vars[j] = false - algebraic_eqs[M.rowval[idx]] = false - end - end - end - - return algebraic_vars, algebraic_eqs -end - -function find_algebraic_vars_eqs(M::LinearAlgebra.Diagonal) - _idxs = map(iszero, LinearAlgebra.diag(M)) - return _idxs, _idxs -end - -# Fallback for non-sparse matrices (original behavior) -function find_algebraic_vars_eqs(M::AbstractMatrix) - algebraic_vars = vec(all(iszero, M, dims = 1)) - algebraic_eqs = vec(all(iszero, M, dims = 2)) - return algebraic_vars, algebraic_eqs -end - -# Handle SciMLOperators (e.g., MatrixOperator) by converting to matrix -function find_algebraic_vars_eqs(M::AbstractSciMLOperator) - return find_algebraic_vars_eqs(convert(AbstractMatrix, M)) -end - # Optimized tolerance checking that avoids allocations @inline function check_dae_tolerance(integrator, err, abstol, t, ::Val{true}) if abstol isa Number diff --git a/lib/OrdinaryDiffEqNonlinearSolve/test/runtests.jl b/lib/OrdinaryDiffEqNonlinearSolve/test/runtests.jl index d2f6725597c..0839c157818 100644 --- a/lib/OrdinaryDiffEqNonlinearSolve/test/runtests.jl +++ b/lib/OrdinaryDiffEqNonlinearSolve/test/runtests.jl @@ -20,7 +20,6 @@ end # Run functional tests if TEST_GROUP ∉ ("QA", "ModelingToolkit") @time @safetestset "Newton Tests" include("newton_tests.jl") - @time @safetestset "Sparse Algebraic Detection" include("sparse_algebraic_detection_tests.jl") @time @safetestset "Sparse DAE Initialization" include("sparse_dae_initialization_tests.jl") @time @safetestset "Linear Nonlinear Solver Tests" include("linear_nonlinear_tests.jl") @time @safetestset "Linear Solver Tests" include("linear_solver_tests.jl") From f8a5ff4d6da045a21421bdd664bf454f7fbe548a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 12:50:48 +0100 Subject: [PATCH 04/11] fix cache build for several solvers --- lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl | 2 +- lib/OrdinaryDiffEqBDF/src/bdf_caches.jl | 2 +- lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl | 3 ++- lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl | 4 ++-- lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl | 3 ++- lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl | 2 +- 6 files changed, 9 insertions(+), 7 deletions(-) diff --git a/lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl b/lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl index e6e9a07d884..e8cb7cf5e1f 100644 --- a/lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl +++ b/lib/OrdinaryDiffEqBDF/src/OrdinaryDiffEqBDF.jl @@ -23,7 +23,7 @@ import OrdinaryDiffEqCore: alg_order, calculate_residuals!, get_fsalfirstlast, generic_solver_docstring, _bool_to_ADType, _process_AD_choice, _ode_interpolant, _ode_interpolant!, has_stiff_interpolation, - _ode_addsteps!, DerivativeOrderNotPossibleError + _ode_addsteps!, DerivativeOrderNotPossibleError, find_algebraic_vars_eqs using OrdinaryDiffEqSDIRK: ImplicitEulerConstantCache, ImplicitEulerCache using TruncatedStacktraces: @truncate_stacktrace diff --git a/lib/OrdinaryDiffEqBDF/src/bdf_caches.jl b/lib/OrdinaryDiffEqBDF/src/bdf_caches.jl index 80b92a6f70c..5d71d3995ea 100644 --- a/lib/OrdinaryDiffEqBDF/src/bdf_caches.jl +++ b/lib/OrdinaryDiffEqBDF/src/bdf_caches.jl @@ -62,7 +62,7 @@ function alg_cache( atmp = similar(u, uEltypeNoUnits) recursivefill!(atmp, false) algebraic_vars = f.mass_matrix === I ? nothing : - [all(iszero, x) for x in eachcol(f.mass_matrix)] + find_algebraic_vars_eqs(f.mass_matrix)[1] eulercache = ImplicitEulerCache( u, uprev, uprev2, fsalfirst, atmp, nlsolver, algebraic_vars, alg.step_limiter! diff --git a/lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl b/lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl index 23913546d00..9b269d923e8 100644 --- a/lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl +++ b/lib/OrdinaryDiffEqRosenbrock/src/OrdinaryDiffEqRosenbrock.jl @@ -13,7 +13,8 @@ import OrdinaryDiffEqCore: alg_order, alg_adaptive_order, isWmethod, isfsal, _un calculate_residuals, has_stiff_interpolation, ODEIntegrator, resize_non_user_cache!, _ode_addsteps!, full_cache, DerivativeOrderNotPossibleError, _bool_to_ADType, - _process_AD_choice, LinearAliasSpecifier, copyat_or_push! + _process_AD_choice, LinearAliasSpecifier, copyat_or_push!, + find_algebraic_vars_eqs using MuladdMacro, FastBroadcast, RecursiveArrayTools import MacroTools: namify using MacroTools: @capture diff --git a/lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl b/lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl index d99fd25496c..7db92ffbb8c 100644 --- a/lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl +++ b/lib/OrdinaryDiffEqRosenbrock/src/rosenbrock_caches.jl @@ -179,7 +179,7 @@ function alg_cache( ) algebraic_vars = f.mass_matrix === I ? nothing : - [all(iszero, x) for x in eachcol(f.mass_matrix)] + find_algebraic_vars_eqs(f.mass_matrix)[1] return Rosenbrock23Cache( u, uprev, k₁, k₂, k₃, du1, du2, f₁, @@ -239,7 +239,7 @@ function alg_cache( ) algebraic_vars = f.mass_matrix === I ? nothing : - [all(iszero, x) for x in eachcol(f.mass_matrix)] + find_algebraic_vars_eqs(f.mass_matrix)[1] return Rosenbrock32Cache( u, uprev, k₁, k₂, k₃, du1, du2, f₁, fsalfirst, fsallast, dT, J, W, diff --git a/lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl b/lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl index 37eb64be2c2..c22a23b8bd3 100644 --- a/lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl +++ b/lib/OrdinaryDiffEqSDIRK/src/OrdinaryDiffEqSDIRK.jl @@ -14,7 +14,8 @@ import OrdinaryDiffEqCore: alg_order, calculate_residuals!, trivial_limiter!, _ode_interpolant!, isesdirk, issplit, ssp_coefficient, get_fsalfirstlast, generic_solver_docstring, - _bool_to_ADType, _process_AD_choice, current_extrapolant! + _bool_to_ADType, _process_AD_choice, current_extrapolant!, + find_algebraic_vars_eqs using TruncatedStacktraces: @truncate_stacktrace using MuladdMacro, MacroTools, FastBroadcast, RecursiveArrayTools using SciMLBase: SplitFunction diff --git a/lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl b/lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl index 978120e3b86..70f63fd4708 100644 --- a/lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl +++ b/lib/OrdinaryDiffEqSDIRK/src/sdirk_caches.jl @@ -35,7 +35,7 @@ function alg_cache( recursivefill!(atmp, false) algebraic_vars = f.mass_matrix === I ? nothing : - [all(iszero, x) for x in eachcol(f.mass_matrix)] + find_algebraic_vars_eqs(f.mass_matrix)[1] return ImplicitEulerCache( u, uprev, uprev2, fsalfirst, atmp, nlsolver, algebraic_vars, alg.step_limiter! From 6f48f856647a6a8ed6fa0158d09b878e5b88cca7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 10:49:27 +0100 Subject: [PATCH 05/11] WIP test script --- test/gpu/simple_dae.jl | 206 ++++++++++++++++++++++++++++++++++------- 1 file changed, 175 insertions(+), 31 deletions(-) diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl index c1a346f0108..0849fc604af 100644 --- a/test/gpu/simple_dae.jl +++ b/test/gpu/simple_dae.jl @@ -1,4 +1,7 @@ using OrdinaryDiffEqRosenbrock +using OrdinaryDiffEqSDIRK +using OrdinaryDiffEqBDF +using OrdinaryDiffEqFIRK using OrdinaryDiffEqNonlinearSolve using CUDA using LinearAlgebra @@ -6,6 +9,7 @@ using Adapt using SparseArrays using Test using CUDSS +using Printf #= du[1] = -u[1] @@ -25,53 +29,193 @@ p = [ -1 1 0 -1 ] -# mass_matrix = [1 0 0 0 -# 0 1 0 0 -# 0 0 0 0 -# 0 0 0 0] mass_matrix = Diagonal([1, 1, 0, 0]) jac_prototype = sparse(map(x -> iszero(x) ? 0.0 : 1.0, p)) u0 = [1.0, 1.0, 0.5, 0.5] # force init -odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype) - tspan = (0.0, 5.0) + +# CPU reference solution +odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype) prob = ODEProblem(odef, u0, tspan, p) sol = solve(prob, Rodas5P()) -mass_matrix_d = adapt(CuArray, mass_matrix) -jac_prototype_d_csc = CUDA.CUSPARSE.CuSparseMatrixCSC(jac_prototype) -jac_prototype_d_csr = CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype) +# GPU data -- we use F64 for higher accuracy for comparision +u0_d = adapt(CuArray{Float64}, u0) +p_d = adapt(CuArray{Float64}, p) -u0_d = adapt(CuArray, u0) -p_d = adapt(CuArray, p) +# dense or spares mass matrix does not work yet! +mass_matrix_d = cu(mass_matrix) + +function test_gpu_dae(jac_prototype_d, solver) + sol_ref = solve(prob, solver) + odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d) + prob_d = ODEProblem(odef_d, u0_d, tspan, p_d) + sol_d = solve(prob_d, solver) + + for t in sol_d.t + u = Vector(sol_d(t)) + @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6) + @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6) + end -odef_d_dns = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = nothing) -odef_d_csc = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d_csc) -odef_d_csr = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d_csr) + max_abserr = 0.0 + max_relerr = 0.0 + for t in tspan[begin]:0.1:tspan[end] + diff = abs.(Vector(sol_d(t)) - sol_ref(t)) + ref = abs.(sol_ref(t)) + max_abserr = max(max_abserr, maximum(diff)) + max_relerr = max(max_relerr, maximum(diff ./ max.(ref, eps()))) + end + cond = (max_abserr < 1e-5 || max_relerr < 1e-5) + cond || println("Solver $sol abstol=$max_abserr retlol=$max_relerr") + @test (max_abserr < 1e-5 || max_relerr < 1e-5) +end -prob_d_dns = ODEProblem(odef_d_dns, u0_d, tspan, p_d) -prob_d_csc = ODEProblem(odef_d_csc, u0_d, tspan, p_d) -prob_d_csr = ODEProblem(odef_d_csr, u0_d, tspan, p_d) +# Jacobian prototype options +jac_prots = [ + "none" => nothing, + "CSC" => CUDA.CUSPARSE.CuSparseMatrixCSC(jac_prototype), + "CSR" => CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype), +] -sol_d_dns = solve(prob_d_dns, Rodas5P()) # works -sol_d_csc = solve(prob_d_csc, Rodas5P()) # uses Krylov -sol_d_csr = solve(prob_d_csr, Rodas5P()) # uses CUDSS +# Solver options +solvers = [ + # Rosenbrock (diagonal mass matrix only) + "Rosenbrock23" => Rosenbrock23(), # ✅ + "Rosenbrock32" => Rosenbrock32(), # 📉 poor fit for this DAE, 💥 CSC catastrophic + # Rosenbrock 2nd order + "ROS2" => ROS2(), # ✅ + "ROS2PR" => ROS2PR(), # ✅ + "ROS2S" => ROS2S(), # ✅ + # Rosenbrock 3rd order + "ROS3" => ROS3(), # ✅ + "ROS3PR" => ROS3PR(), # 📉 poor fit for this DAE + "ROS3PRL" => ROS3PRL(), # ⚠️ CSC accuracy loss + "ROS3PRL2" => ROS3PRL2(), # ⚠️ CSC accuracy loss + "ROS3P" => ROS3P(), # 📉 poor fit for this DAE + "Rodas3" => Rodas3(), # ⚠️ CSC accuracy loss + # "Rodas23W" => Rodas23W(), # 🚫 scalar indexing, requires large cahnges to `calculate_interpoldiff!` + # "Rodas3P" => Rodas3P(), # 🚫 scalar indexing + "Scholz4_7" => Scholz4_7(), # ✅ + # Rosenbrock 4th order + "ROS34PW1a" => ROS34PW1a(), # 📉 poor fit for this DAE + "ROS34PW1b" => ROS34PW1b(), # 📉 poor fit for this DAE + "ROS34PW2" => ROS34PW2(), # ⚠️ CSC accuracy loss + "ROS34PW3" => ROS34PW3(), # ✅ + "ROS34PRw" => ROS34PRw(), # ✅ + "RosShamp4" => RosShamp4(), # ✅ + "Veldd4" => Veldd4(), # ✅ + "Velds4" => Velds4(), # ✅ + "GRK4T" => GRK4T(), # ✅ + "GRK4A" => GRK4A(), # ✅ + "Ros4LStab" => Ros4LStab(), # ✅ + "Rodas4" => Rodas4(), # ✅ + "Rodas42" => Rodas42(), # ✅ + "Rodas4P" => Rodas4P(), # ✅ + "Rodas4P2" => Rodas4P2(), # ✅ + "ROK4a" => ROK4a(), # ✅ + # Rosenbrock 5th order + "Rodas5" => Rodas5(), # ✅ + "Rodas5P" => Rodas5P(), # ✅ + "Rodas5Pe" => Rodas5Pe(), # ✅ + "Rodas5Pr" => Rodas5Pr(), # ✅ + # Rosenbrock 6th order + "Rodas6P" => Rodas6P(), # ✅ + # SDIRK (don't include fixed time step which need explicit dt) + "ImplicitEuler" => ImplicitEuler(), # ✅ + "Trapezoid" => Trapezoid(), # ✅ + "SDIRK2" => SDIRK2(), # ✅ + "Cash4" => Cash4(), # ⚠️ CSC accuracy loss + "Hairer4" => Hairer4(), # ⚠️ CSC accuracy loss + "Hairer42" => Hairer42(), # ⚠️ CSC accuracy loss + # BDF + "ABDF2" => ABDF2(), # ⚠️ CSC accuracy loss (💥 catastrophic) + "QNDF1" => QNDF1(), # ✅ + "QNDF2" => QNDF2(), # ⚠️ CSC accuracy loss + # "QNDF" => QNDF(), # 🔧 DeviceMemory error in LinAlg + "QBDF1" => QBDF1(), # ✅ + "QBDF2" => QBDF2(), # ⚠️ accuracy loss (all jac_prots) + # "QBDF" => QBDF(), # 🔧 DeviceMemory error in LinAlg + # "FBDF" => FBDF(), # 🚫 scalar indexing -> needs extensive work on reinitFBDF! + # FIRK -> all need subtential changes to `perform_step!` for FIRK methods + # "RadauIIA3" => RadauIIA3(), # 🚫 scalar indexing, ComplexF64 sparse unsupported + # "RadauIIA5" => RadauIIA5(), # 🚫 scalar indexing, ComplexF64 sparse unsupported + # "RadauIIA9" => RadauIIA9(), # 🚫 scalar indexing, ComplexF64 sparse unsupported + # "AdaptiveRadau" => AdaptiveRadau(), # 🚫 scalar indexing, ComplexF64 sparse unsupported +] -@testset "Test constraints in GPU sol" begin - for sol_d in [sol_d_dns, sol_d_csc, sol_d_csr] - for t in sol_d.t - u = Vector(sol_d(t)) - @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6) - @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6) - end +function _maxerrs(sol_a, sol_b) + max_abs = 0.0 + max_rel = 0.0 + for t in tspan[begin]:0.1:tspan[end] + a = isa(sol_a(t), CuArray) ? Vector(sol_a(t)) : Vector(sol_a(t)) + b = isa(sol_b(t), CuArray) ? Vector(sol_b(t)) : Vector(sol_b(t)) + diff = abs.(a - b) + ref = abs.(b) + max_abs = max(max_abs, maximum(diff)) + max_rel = max(max_rel, maximum(diff ./ max.(ref, eps()))) end + return max_abs, max_rel end -@testset "Compare GPU to CPU solution" begin - for sol_d in [sol_d_dns, sol_d_csc, sol_d_csr] - for t in tspan[begin]:0.1:tspan[end] - @test Vector(sol_d(t)) ≈ sol(t) atol=1e-5 +function _printval(val; threshold = 1e-3) + s = @sprintf("%.2e", val) + if val > threshold + printstyled(s; color = :red) + else + print(s) + end +end + +function debug_gpu_dae(jac_prototype_d, solver, name) + # CPU: this solver vs Rodas5P reference + sol_cpu = solve(prob, solver) + cpu_abs, cpu_rel = _maxerrs(sol_cpu, sol) + + # GPU solve + odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d) + prob_d = ODEProblem(odef_d, u0_d, tspan, p_d) + sol_d = solve(prob_d, solver) + + # GPU vs CPU same solver + gpu_abs, gpu_rel = _maxerrs(sol_d, sol_cpu) + + # Print row + print(rpad(name, 30)) + _printval(cpu_abs); print(" ") + _printval(cpu_rel); print(" ") + _printval(gpu_abs); print(" ") + _printval(gpu_rel) + println() +end + +using Printf + +println(rpad("", 30), "cpu_abs cpu_rel gpu_abs gpu_rel") +println("-"^72) + +for (sn, sv) in solvers, (jn, jp) in jac_prots + label = "$sn / $jn" + try + debug_gpu_dae(jp, sv, label) + catch e + printstyled(rpad(label, 30), "ERROR: ", sprint(showerror, e; context=:limit=>true), "\n"; color = :yellow) + end +end + +println("-"^72) + +#= Uncomment for actual testing +@testset "End-to-end GPU compat of mass matrix DAE solvers" begin + for (sn, sv) in solvers + @testset "GPU DAE: $sn" begin + for (jn, jp) in jac_prots + @testset "Jacboain prototype: $jn" begin + test_gpu_dae(jp, sv) + end + end end end end +=# From 86dcda05dfa5e4628b4a0d1911dea32ffac88019 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 11:30:31 +0100 Subject: [PATCH 06/11] improve test script --- test/gpu/simple_dae.jl | 329 +++++++++++++++++++++++------------------ 1 file changed, 185 insertions(+), 144 deletions(-) diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl index 0849fc604af..d43c93b2731 100644 --- a/test/gpu/simple_dae.jl +++ b/test/gpu/simple_dae.jl @@ -35,122 +35,159 @@ jac_prototype = sparse(map(x -> iszero(x) ? 0.0 : 1.0, p)) u0 = [1.0, 1.0, 0.5, 0.5] # force init tspan = (0.0, 5.0) -# CPU reference solution +# CPU reference solution (Rodas5P) odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype) prob = ODEProblem(odef, u0, tspan, p) -sol = solve(prob, Rodas5P()) +sol_ref = solve(prob, Rodas5P()) -# GPU data -- we use F64 for higher accuracy for comparision +# GPU data -- we use F64 for higher accuracy for comparison u0_d = adapt(CuArray{Float64}, u0) p_d = adapt(CuArray{Float64}, p) -# dense or spares mass matrix does not work yet! +# dense or sparse mass matrix does not work yet! mass_matrix_d = cu(mass_matrix) -function test_gpu_dae(jac_prototype_d, solver) - sol_ref = solve(prob, solver) - odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d) - prob_d = ODEProblem(odef_d, u0_d, tspan, p_d) - sol_d = solve(prob_d, solver) - - for t in sol_d.t - u = Vector(sol_d(t)) - @test isapprox(u[1] + u[2], u[3]; atol = 1.0e-6) - @test isapprox(-u[1] + u[2], u[4]; atol = 1.0e-6) - end - - max_abserr = 0.0 - max_relerr = 0.0 - for t in tspan[begin]:0.1:tspan[end] - diff = abs.(Vector(sol_d(t)) - sol_ref(t)) - ref = abs.(sol_ref(t)) - max_abserr = max(max_abserr, maximum(diff)) - max_relerr = max(max_relerr, maximum(diff ./ max.(ref, eps()))) - end - cond = (max_abserr < 1e-5 || max_relerr < 1e-5) - cond || println("Solver $sol abstol=$max_abserr retlol=$max_relerr") - @test (max_abserr < 1e-5 || max_relerr < 1e-5) -end - -# Jacobian prototype options +# ── Jacobian prototype options ──────────────────────────────────────────────── jac_prots = [ "none" => nothing, "CSC" => CUDA.CUSPARSE.CuSparseMatrixCSC(jac_prototype), "CSR" => CUDA.CUSPARSE.CuSparseMatrixCSR(jac_prototype), ] -# Solver options +# ── Solver definitions ──────────────────────────────────────────────────────── +# +# Each entry: solver => (; tol overrides...) +# method_tol = (; atol, rtol) – cpu solver vs Rodas5P reference (some methods are poor fits) +# gpu_tol = (; atol, rtol) – gpu vs cpu (none jac_prot) +# csc_tol = (; atol, rtol) – gpu vs cpu (CSC jac_prot) +# csr_tol = (; atol, rtol) – gpu vs cpu (CSR jac_prot) +# Only specify fields that deviate from the defaults. + +const DEFAULT_METHOD_TOL = (; atol = 1e-4, rtol = 1e-4) # generous: we only care about GPU vs CPU match +const DEFAULT_GPU_TOL = (; atol = 1e-4, rtol = 1e-4) + solvers = [ - # Rosenbrock (diagonal mass matrix only) - "Rosenbrock23" => Rosenbrock23(), # ✅ - "Rosenbrock32" => Rosenbrock32(), # 📉 poor fit for this DAE, 💥 CSC catastrophic - # Rosenbrock 2nd order - "ROS2" => ROS2(), # ✅ - "ROS2PR" => ROS2PR(), # ✅ - "ROS2S" => ROS2S(), # ✅ - # Rosenbrock 3rd order - "ROS3" => ROS3(), # ✅ - "ROS3PR" => ROS3PR(), # 📉 poor fit for this DAE - "ROS3PRL" => ROS3PRL(), # ⚠️ CSC accuracy loss - "ROS3PRL2" => ROS3PRL2(), # ⚠️ CSC accuracy loss - "ROS3P" => ROS3P(), # 📉 poor fit for this DAE - "Rodas3" => Rodas3(), # ⚠️ CSC accuracy loss - # "Rodas23W" => Rodas23W(), # 🚫 scalar indexing, requires large cahnges to `calculate_interpoldiff!` - # "Rodas3P" => Rodas3P(), # 🚫 scalar indexing - "Scholz4_7" => Scholz4_7(), # ✅ - # Rosenbrock 4th order - "ROS34PW1a" => ROS34PW1a(), # 📉 poor fit for this DAE - "ROS34PW1b" => ROS34PW1b(), # 📉 poor fit for this DAE - "ROS34PW2" => ROS34PW2(), # ⚠️ CSC accuracy loss - "ROS34PW3" => ROS34PW3(), # ✅ - "ROS34PRw" => ROS34PRw(), # ✅ - "RosShamp4" => RosShamp4(), # ✅ - "Veldd4" => Veldd4(), # ✅ - "Velds4" => Velds4(), # ✅ - "GRK4T" => GRK4T(), # ✅ - "GRK4A" => GRK4A(), # ✅ - "Ros4LStab" => Ros4LStab(), # ✅ - "Rodas4" => Rodas4(), # ✅ - "Rodas42" => Rodas42(), # ✅ - "Rodas4P" => Rodas4P(), # ✅ - "Rodas4P2" => Rodas4P2(), # ✅ - "ROK4a" => ROK4a(), # ✅ - # Rosenbrock 5th order - "Rodas5" => Rodas5(), # ✅ - "Rodas5P" => Rodas5P(), # ✅ - "Rodas5Pe" => Rodas5Pe(), # ✅ - "Rodas5Pr" => Rodas5Pr(), # ✅ - # Rosenbrock 6th order - "Rodas6P" => Rodas6P(), # ✅ - # SDIRK (don't include fixed time step which need explicit dt) - "ImplicitEuler" => ImplicitEuler(), # ✅ - "Trapezoid" => Trapezoid(), # ✅ - "SDIRK2" => SDIRK2(), # ✅ - "Cash4" => Cash4(), # ⚠️ CSC accuracy loss - "Hairer4" => Hairer4(), # ⚠️ CSC accuracy loss - "Hairer42" => Hairer42(), # ⚠️ CSC accuracy loss - # BDF - "ABDF2" => ABDF2(), # ⚠️ CSC accuracy loss (💥 catastrophic) - "QNDF1" => QNDF1(), # ✅ - "QNDF2" => QNDF2(), # ⚠️ CSC accuracy loss - # "QNDF" => QNDF(), # 🔧 DeviceMemory error in LinAlg - "QBDF1" => QBDF1(), # ✅ - "QBDF2" => QBDF2(), # ⚠️ accuracy loss (all jac_prots) - # "QBDF" => QBDF(), # 🔧 DeviceMemory error in LinAlg - # "FBDF" => FBDF(), # 🚫 scalar indexing -> needs extensive work on reinitFBDF! - # FIRK -> all need subtential changes to `perform_step!` for FIRK methods - # "RadauIIA3" => RadauIIA3(), # 🚫 scalar indexing, ComplexF64 sparse unsupported - # "RadauIIA5" => RadauIIA5(), # 🚫 scalar indexing, ComplexF64 sparse unsupported - # "RadauIIA9" => RadauIIA9(), # 🚫 scalar indexing, ComplexF64 sparse unsupported - # "AdaptiveRadau" => AdaptiveRadau(), # 🚫 scalar indexing, ComplexF64 sparse unsupported + # ── Rosenbrock 2nd order ── + Rosenbrock23() => (; + csc_tol = (; atol = 2e-4, rtol = 5e-4), + ), + Rosenbrock32() => (; + csc_tol = (; atol = Inf, rtol = Inf), + ), + ROS2() => (;), + ROS2PR() => (; + csc_tol = (; atol = 3e-4, rtol = 4e-4), + ), + ROS2S() => (; + csc_tol = (; atol = 7e-4, rtol = 1.2e-3), + ), + # ── Rosenbrock 3rd order ── + ROS3() => (;), + ROS3PR() => (;), + ROS3PRL() => (; + csc_tol = (; atol = 2e-3, rtol = 4e-3), + ), + ROS3PRL2() => (; + csc_tol = (; atol = 3e-3, rtol = 4e-3), + ), + ROS3P() => (;), + Rodas3() => (; + csc_tol = (; atol = 2e-3, rtol = 3e-3), + ), + # Rodas23W() # 🚫 scalar indexing, requires large changes to `calculate_interpoldiff!` + # Rodas3P() # 🚫 scalar indexing + Scholz4_7() => (;), + # ── Rosenbrock 4th order ── + ROS34PW1a() => (;), + ROS34PW1b() => (;), + ROS34PW2() => (; + csc_tol = (; atol = 1.2e-3, rtol = 2.2e-3), + ), + ROS34PW3() => (;), + ROS34PRw() => (; + csc_tol = (; atol = 6e-4, rtol = 1e-3), + ), + RosShamp4() => (;), + Veldd4() => (;), + Velds4() => (;), + GRK4T() => (;), + GRK4A() => (;), + Ros4LStab() => (;), + Rodas4() => (;), + Rodas42() => (; + csc_tol = (; atol = 2e-5, rtol = 1.3e-4), + ), + Rodas4P() => (; + csc_tol = (; atol = 9e-6, rtol = 1.3e-4), + ), + Rodas4P2() => (;), + ROK4a() => (;), + # ── Rosenbrock 5th order ── + Rodas5() => (;), + Rodas5P() => (;), + Rodas5Pe() => (;), + Rodas5Pr() => (;), + # ── Rosenbrock 6th order ── + Rodas6P() => (;), + # ── SDIRK (don't include fixed time step which need explicit dt) ── + ImplicitEuler() => (; + gpu_tol = (; atol = 1e-4, rtol = 4e-4), + csc_tol = (; atol = 6e-5, rtol = 3e-4), + ), + Trapezoid() => (;), + SDIRK2() => (; + csc_tol = (; atol = 1.5e-4, rtol = 2.1e-4), + ), + Cash4() => (; + csc_tol = (; atol = 5e-3, rtol = 8e-3), + ), + Hairer4() => (; + csc_tol = (; atol = 8e-3, rtol = 6e-2), + ), + Hairer42() => (; + csc_tol = (; atol = 7e-3, rtol = 5e-2), + ), + # ── BDF ── + ABDF2() => (; + csc_tol = (; atol = Inf, rtol = Inf), # 💥 CSC catastrophic + ), + QNDF1() => (;), + QNDF2() => (; + csc_tol = (; atol = 4e-3, rtol = 5e-3), + ), + # QNDF() # 🔧 DeviceMemory error in LinAlg + QBDF1() => (;), + QBDF2() => (; + gpu_tol = (; atol = 2e-3, rtol = 2e-3), + csc_tol = (; atol = 3e-3, rtol = 6e-3), + csr_tol = (; atol = 2e-3, rtol = 2e-3), + ), + # QBDF() # 🔧 DeviceMemory error in LinAlg + # FBDF() # 🚫 scalar indexing -> needs extensive work on reinitFBDF! + # ── FIRK -> all need substantial changes to `perform_step!` for FIRK methods ── + # RadauIIA3() # 🚫 scalar indexing, ComplexF64 sparse unsupported + # RadauIIA5() # 🚫 scalar indexing, ComplexF64 sparse unsupported + # RadauIIA9() # 🚫 scalar indexing, ComplexF64 sparse unsupported + # AdaptiveRadau() # 🚫 scalar indexing, ComplexF64 sparse unsupported ] -function _maxerrs(sol_a, sol_b) +# ── Helpers ─────────────────────────────────────────────────────────────────── + +function _get_tol(overrides, jac_name) + _tol_keys = Dict("none" => :gpu_tol, "CSC" => :csc_tol, "CSR" => :csr_tol) + key = _tol_keys[jac_name] + hasproperty(overrides, key) && return getproperty(overrides, key) + # gpu_tol acts as default for all GPU combos + hasproperty(overrides, :gpu_tol) && return overrides.gpu_tol + return DEFAULT_GPU_TOL +end + +function maxerrs(sol_a, sol_b) max_abs = 0.0 max_rel = 0.0 for t in tspan[begin]:0.1:tspan[end] - a = isa(sol_a(t), CuArray) ? Vector(sol_a(t)) : Vector(sol_a(t)) - b = isa(sol_b(t), CuArray) ? Vector(sol_b(t)) : Vector(sol_b(t)) + a = Vector(sol_a(t)) + b = Vector(sol_b(t)) diff = abs.(a - b) ref = abs.(b) max_abs = max(max_abs, maximum(diff)) @@ -159,63 +196,67 @@ function _maxerrs(sol_a, sol_b) return max_abs, max_rel end -function _printval(val; threshold = 1e-3) - s = @sprintf("%.2e", val) - if val > threshold - printstyled(s; color = :red) - else - print(s) - end -end +function run_dae_tests() + results = Any[] -function debug_gpu_dae(jac_prototype_d, solver, name) - # CPU: this solver vs Rodas5P reference - sol_cpu = solve(prob, solver) - cpu_abs, cpu_rel = _maxerrs(sol_cpu, sol) - - # GPU solve - odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jac_prototype_d) - prob_d = ODEProblem(odef_d, u0_d, tspan, p_d) - sol_d = solve(prob_d, solver) - - # GPU vs CPU same solver - gpu_abs, gpu_rel = _maxerrs(sol_d, sol_cpu) - - # Print row - print(rpad(name, 30)) - _printval(cpu_abs); print(" ") - _printval(cpu_rel); print(" ") - _printval(gpu_abs); print(" ") - _printval(gpu_rel) - println() -end + for (sv, overrides) in solvers, (jn, jp) in jac_prots + sn = string(nameof(typeof(sv))) + (; atol, rtol) = _get_tol(overrides, jn) -using Printf + # CPU: this solver vs reference + sol_cpu = solve(prob, sv) + cpu_abs, cpu_rel = maxerrs(sol_cpu, sol_ref) -println(rpad("", 30), "cpu_abs cpu_rel gpu_abs gpu_rel") -println("-"^72) + # GPU solve + odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jp) + prob_d = ODEProblem(odef_d, u0_d, tspan, p_d) + sol_d = solve(prob_d, sv) -for (sn, sv) in solvers, (jn, jp) in jac_prots - label = "$sn / $jn" - try - debug_gpu_dae(jp, sv, label) - catch e - printstyled(rpad(label, 30), "ERROR: ", sprint(showerror, e; context=:limit=>true), "\n"; color = :yellow) + # GPU vs CPU (same solver) + gpu_abs, gpu_rel = maxerrs(sol_d, sol_cpu) + + passed = (gpu_abs < atol) && (gpu_rel < rtol) + push!(results, (; solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel, + tol_abs = atol, tol_rel = rtol, passed, error = "")) + @test passed end + + return results end -println("-"^72) +function show_results(results) + function _fmt(val; threshold = 1e-3) + isnan(val) && return " --- " + s = @sprintf("%.2e", val) + if val > threshold + return "\e[31m$s\e[0m" + end + return s + end + + println(rpad("Solver / Jac", 30), "cpu_abs cpu_rel gpu_abs gpu_rel status") + println("-"^85) -#= Uncomment for actual testing -@testset "End-to-end GPU compat of mass matrix DAE solvers" begin - for (sn, sv) in solvers - @testset "GPU DAE: $sn" begin - for (jn, jp) in jac_prots - @testset "Jacboain prototype: $jn" begin - test_gpu_dae(jp, sv) - end + for r in results + label = rpad("$(r.solver) / $(r.jac)", 30) + if r.error != "" + printstyled(label, "ERROR: ", r.error, "\n"; color = :yellow) + else + print(label) + print(_fmt(r.cpu_abs), " ", _fmt(r.cpu_rel), " ") + print(_fmt(r.gpu_abs), " ", _fmt(r.gpu_rel), " ") + if r.passed + printstyled("✓\n"; color = :green) + else + printstyled("✗\n"; color = :red) end end end + println("-"^85) end -=# + +@testset "GPU DAE solver compatibility" begin + results = run_dae_tests() +end + +show_results(results) From f00915691c51963e65d3bf3de12223e968973e2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 12:21:36 +0100 Subject: [PATCH 07/11] improve test script --- test/gpu/simple_dae.jl | 138 +++++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 67 deletions(-) diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl index d43c93b2731..83b13c734f0 100644 --- a/test/gpu/simple_dae.jl +++ b/test/gpu/simple_dae.jl @@ -10,6 +10,8 @@ using SparseArrays using Test using CUDSS using Printf +using OrdinaryDiffEqNonlinearSolve.LinearSolve: KrylovJL_GMRES + #= du[1] = -u[1] @@ -39,6 +41,7 @@ tspan = (0.0, 5.0) odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype) prob = ODEProblem(odef, u0, tspan, p) sol_ref = solve(prob, Rodas5P()) +sol_ref_krylov = solve(prob, Rodas5P(linsolve=KrylovJL_GMRES())) # GPU data -- we use F64 for higher accuracy for comparison u0_d = adapt(CuArray{Float64}, u0) @@ -68,107 +71,102 @@ const DEFAULT_GPU_TOL = (; atol = 1e-4, rtol = 1e-4) solvers = [ # ── Rosenbrock 2nd order ── - Rosenbrock23() => (; + Rosenbrock23 => (; csc_tol = (; atol = 2e-4, rtol = 5e-4), ), - Rosenbrock32() => (; + Rosenbrock32 => (; csc_tol = (; atol = Inf, rtol = Inf), ), - ROS2() => (;), - ROS2PR() => (; + ROS2 => (;), + ROS2PR => (; csc_tol = (; atol = 3e-4, rtol = 4e-4), ), - ROS2S() => (; + ROS2S => (; csc_tol = (; atol = 7e-4, rtol = 1.2e-3), ), # ── Rosenbrock 3rd order ── - ROS3() => (;), - ROS3PR() => (;), - ROS3PRL() => (; + ROS3 => (;), + ROS3PR => (;), + ROS3PRL => (; csc_tol = (; atol = 2e-3, rtol = 4e-3), ), - ROS3PRL2() => (; + ROS3PRL2 => (; csc_tol = (; atol = 3e-3, rtol = 4e-3), ), - ROS3P() => (;), - Rodas3() => (; + ROS3P => (;), + Rodas3 => (; csc_tol = (; atol = 2e-3, rtol = 3e-3), ), - # Rodas23W() # 🚫 scalar indexing, requires large changes to `calculate_interpoldiff!` - # Rodas3P() # 🚫 scalar indexing - Scholz4_7() => (;), + # Rodas23W() # scalar indexing, requires large changes to `calculate_interpoldiff!` + # Rodas3P() # scalar indexing + Scholz4_7 => (;), # ── Rosenbrock 4th order ── - ROS34PW1a() => (;), - ROS34PW1b() => (;), - ROS34PW2() => (; + ROS34PW1a => (;), + ROS34PW1b => (;), + ROS34PW2 => (; csc_tol = (; atol = 1.2e-3, rtol = 2.2e-3), ), - ROS34PW3() => (;), - ROS34PRw() => (; + ROS34PW3 => (;), + ROS34PRw => (; csc_tol = (; atol = 6e-4, rtol = 1e-3), ), - RosShamp4() => (;), - Veldd4() => (;), - Velds4() => (;), - GRK4T() => (;), - GRK4A() => (;), - Ros4LStab() => (;), - Rodas4() => (;), - Rodas42() => (; - csc_tol = (; atol = 2e-5, rtol = 1.3e-4), - ), - Rodas4P() => (; - csc_tol = (; atol = 9e-6, rtol = 1.3e-4), - ), - Rodas4P2() => (;), - ROK4a() => (;), + RosShamp4 => (;), + Veldd4 => (;), + Velds4 => (;), + GRK4T => (;), + GRK4A => (;), + Ros4LStab => (;), + Rodas4 => (;), + Rodas42 => (;), + Rodas4P => (;), + Rodas4P2 => (;), + ROK4a => (;), # ── Rosenbrock 5th order ── - Rodas5() => (;), - Rodas5P() => (;), - Rodas5Pe() => (;), - Rodas5Pr() => (;), + Rodas5 => (;), + Rodas5P => (;), + Rodas5Pe => (;), + Rodas5Pr => (;), # ── Rosenbrock 6th order ── - Rodas6P() => (;), + Rodas6P => (;), # ── SDIRK (don't include fixed time step which need explicit dt) ── - ImplicitEuler() => (; - gpu_tol = (; atol = 1e-4, rtol = 4e-4), - csc_tol = (; atol = 6e-5, rtol = 3e-4), + ImplicitEuler => (;), + Trapezoid => (; + csc_tol = (; atol = 8e-3, rtol = 2.5e-2), ), - Trapezoid() => (;), - SDIRK2() => (; - csc_tol = (; atol = 1.5e-4, rtol = 2.1e-4), + SDIRK2 => (; + csc_tol = (; atol = 1.5e-4, rtol = 3.5e-4), ), - Cash4() => (; + Cash4 => (; csc_tol = (; atol = 5e-3, rtol = 8e-3), ), - Hairer4() => (; + Hairer4 => (; csc_tol = (; atol = 8e-3, rtol = 6e-2), ), - Hairer42() => (; + Hairer42 => (; csc_tol = (; atol = 7e-3, rtol = 5e-2), ), # ── BDF ── - ABDF2() => (; - csc_tol = (; atol = Inf, rtol = Inf), # 💥 CSC catastrophic + ABDF2 => (; + csc_tol = (; atol = 2e-4, rtol = 7e-4), ), - QNDF1() => (;), - QNDF2() => (; + QNDF1 => (;), + QNDF2 => (; csc_tol = (; atol = 4e-3, rtol = 5e-3), ), # QNDF() # 🔧 DeviceMemory error in LinAlg - QBDF1() => (;), - QBDF2() => (; + QBDF1 => (;), + QBDF2 => (; gpu_tol = (; atol = 2e-3, rtol = 2e-3), - csc_tol = (; atol = 3e-3, rtol = 6e-3), + csc_tol = (; atol = 4e-3, rtol = 7e-3), csr_tol = (; atol = 2e-3, rtol = 2e-3), ), - # QBDF() # 🔧 DeviceMemory error in LinAlg - # FBDF() # 🚫 scalar indexing -> needs extensive work on reinitFBDF! + # QBDF() # DeviceMemory error in LinAlg + # FBDF() # scalar indexing -> needs extensive work on reinitFBDF! # ── FIRK -> all need substantial changes to `perform_step!` for FIRK methods ── - # RadauIIA3() # 🚫 scalar indexing, ComplexF64 sparse unsupported - # RadauIIA5() # 🚫 scalar indexing, ComplexF64 sparse unsupported - # RadauIIA9() # 🚫 scalar indexing, ComplexF64 sparse unsupported - # AdaptiveRadau() # 🚫 scalar indexing, ComplexF64 sparse unsupported + # RadauIIA3() # scalar indexing, ComplexF64 sparse unsupported + # RadauIIA5() # scalar indexing, ComplexF64 sparse unsupported + # RadauIIA9() # scalar indexing, ComplexF64 sparse unsupported + # AdaptiveRadau() # scalar indexing, ComplexF64 sparse unsupported ] # ── Helpers ─────────────────────────────────────────────────────────────────── @@ -200,22 +198,28 @@ function run_dae_tests() results = Any[] for (sv, overrides) in solvers, (jn, jp) in jac_prots - sn = string(nameof(typeof(sv))) + println("Test $sv with prototype $jn") + sn = string(sv) (; atol, rtol) = _get_tol(overrides, jn) + # CSC will allways fall back to krylov so the ref solution should do to + krylov = (jn == "CSC") + _sol_ref = krylov ? sol_ref_krylov : sol_ref + # CPU: this solver vs reference - sol_cpu = solve(prob, sv) - cpu_abs, cpu_rel = maxerrs(sol_cpu, sol_ref) + cpu_alg = krylov ? sv(linsolve=KrylovJL_GMRES()) : sv() + sol_cpu = solve(prob, cpu_alg) + cpu_abs, cpu_rel = maxerrs(sol_cpu, _sol_ref) # GPU solve odef_d = ODEFunction(dae!, mass_matrix = mass_matrix_d, jac_prototype = jp) prob_d = ODEProblem(odef_d, u0_d, tspan, p_d) - sol_d = solve(prob_d, sv) + sol_d = solve(prob_d, sv()) # GPU vs CPU (same solver) gpu_abs, gpu_rel = maxerrs(sol_d, sol_cpu) - passed = (gpu_abs < atol) && (gpu_rel < rtol) + passed = (gpu_abs < atol) || (gpu_rel < rtol) push!(results, (; solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel, tol_abs = atol, tol_rel = rtol, passed, error = "")) @test passed @@ -256,7 +260,7 @@ function show_results(results) end @testset "GPU DAE solver compatibility" begin + global results results = run_dae_tests() end - show_results(results) From fcf7d911de7e4de597cfe6ce2740f177b9b691ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 12:42:47 +0100 Subject: [PATCH 08/11] improve test script --- test/gpu/simple_dae.jl | 62 ++++++++++++++++++++++++++++++------------ 1 file changed, 44 insertions(+), 18 deletions(-) diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl index 83b13c734f0..adf2c54d8e6 100644 --- a/test/gpu/simple_dae.jl +++ b/test/gpu/simple_dae.jl @@ -59,14 +59,15 @@ jac_prots = [ # ── Solver definitions ──────────────────────────────────────────────────────── # -# Each entry: solver => (; tol overrides...) -# method_tol = (; atol, rtol) – cpu solver vs Rodas5P reference (some methods are poor fits) -# gpu_tol = (; atol, rtol) – gpu vs cpu (none jac_prot) -# csc_tol = (; atol, rtol) – gpu vs cpu (CSC jac_prot) -# csr_tol = (; atol, rtol) – gpu vs cpu (CSR jac_prot) +# Each entry: SolverType => (; tol overrides...) +# method_tol = (; atol, rtol) – cpu solver vs Rodas5P ref (all jac); some methods are poor fits for DAEs +# method_csc_tol = (; atol, rtol) – cpu solver vs Rodas5P ref (CSC/Krylov path only) +# gpu_tol = (; atol, rtol) – gpu vs cpu (none/CSR fallback) +# csc_tol = (; atol, rtol) – gpu vs cpu (CSC jac_prot) +# csr_tol = (; atol, rtol) – gpu vs cpu (CSR jac_prot) # Only specify fields that deviate from the defaults. -const DEFAULT_METHOD_TOL = (; atol = 1e-4, rtol = 1e-4) # generous: we only care about GPU vs CPU match +const DEFAULT_METHOD_TOL = (; atol = 1e-2, rtol = 1e-2) const DEFAULT_GPU_TOL = (; atol = 1e-4, rtol = 1e-4) solvers = [ @@ -75,7 +76,8 @@ solvers = [ csc_tol = (; atol = 2e-4, rtol = 5e-4), ), Rosenbrock32 => (; - csc_tol = (; atol = Inf, rtol = Inf), + method_tol = (; atol = 2e-2, rtol = 1.5), + csc_tol = (; atol = Inf, rtol = Inf), ), ROS2 => (;), ROS2PR => (; @@ -86,14 +88,18 @@ solvers = [ ), # ── Rosenbrock 3rd order ── ROS3 => (;), - ROS3PR => (;), + ROS3PR => (; + method_tol = (; atol = 0.25, rtol = 15.0), + ), ROS3PRL => (; csc_tol = (; atol = 2e-3, rtol = 4e-3), ), ROS3PRL2 => (; csc_tol = (; atol = 3e-3, rtol = 4e-3), ), - ROS3P => (;), + ROS3P => (; + method_tol = (; atol = 0.25, rtol = 15.0), + ), Rodas3 => (; csc_tol = (; atol = 2e-3, rtol = 3e-3), ), @@ -101,8 +107,12 @@ solvers = [ # Rodas3P() # scalar indexing Scholz4_7 => (;), # ── Rosenbrock 4th order ── - ROS34PW1a => (;), - ROS34PW1b => (;), + ROS34PW1a => (; + method_tol = (; atol = 0.25, rtol = 5.0), + ), + ROS34PW1b => (; + method_tol = (; atol = 0.25, rtol = 5.0), + ), ROS34PW2 => (; csc_tol = (; atol = 1.2e-3, rtol = 2.2e-3), ), @@ -147,14 +157,17 @@ solvers = [ ), # ── BDF ── ABDF2 => (; - csc_tol = (; atol = 2e-4, rtol = 7e-4), + method_csc_tol = (; atol = Inf, rtol = Inf), # ABDF2 + Krylov diverges vs Rodas5P + Krylov + csc_tol = (; atol = 2e-4, rtol = 7e-4), ), QNDF1 => (;), QNDF2 => (; csc_tol = (; atol = 4e-3, rtol = 5e-3), ), # QNDF() # 🔧 DeviceMemory error in LinAlg - QBDF1 => (;), + QBDF1 => (; + method_tol = (; atol = 2e-2, rtol = 0.2), + ), QBDF2 => (; gpu_tol = (; atol = 2e-3, rtol = 2e-3), csc_tol = (; atol = 4e-3, rtol = 7e-3), @@ -171,6 +184,12 @@ solvers = [ # ── Helpers ─────────────────────────────────────────────────────────────────── +function _get_method_tol(overrides, jac_name) + jac_name == "CSC" && hasproperty(overrides, :method_csc_tol) && return overrides.method_csc_tol + hasproperty(overrides, :method_tol) && return overrides.method_tol + return DEFAULT_METHOD_TOL +end + function _get_tol(overrides, jac_name) _tol_keys = Dict("none" => :gpu_tol, "CSC" => :csc_tol, "CSR" => :csr_tol) key = _tol_keys[jac_name] @@ -200,7 +219,8 @@ function run_dae_tests() for (sv, overrides) in solvers, (jn, jp) in jac_prots println("Test $sv with prototype $jn") sn = string(sv) - (; atol, rtol) = _get_tol(overrides, jn) + gtol = _get_tol(overrides, jn) + mtol = _get_method_tol(overrides, jn) # CSC will allways fall back to krylov so the ref solution should do to krylov = (jn == "CSC") @@ -219,9 +239,11 @@ function run_dae_tests() # GPU vs CPU (same solver) gpu_abs, gpu_rel = maxerrs(sol_d, sol_cpu) - passed = (gpu_abs < atol) || (gpu_rel < rtol) + method_passed = (cpu_abs < mtol.atol) || (cpu_rel < mtol.rtol) + gpu_passed = (gpu_abs < gtol.atol) || (gpu_rel < gtol.rtol) + passed = method_passed && gpu_passed push!(results, (; solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel, - tol_abs = atol, tol_rel = rtol, passed, error = "")) + gtol, mtol, method_passed, gpu_passed, passed, error = "")) @test passed end @@ -247,12 +269,16 @@ function show_results(results) printstyled(label, "ERROR: ", r.error, "\n"; color = :yellow) else print(label) - print(_fmt(r.cpu_abs), " ", _fmt(r.cpu_rel), " ") + print(_fmt(r.cpu_abs, threshold=1e-2), " ", _fmt(r.cpu_rel, threshold=1e-2), " ") print(_fmt(r.gpu_abs), " ", _fmt(r.gpu_rel), " ") if r.passed printstyled("✓\n"; color = :green) + elseif !r.method_passed && !r.gpu_passed + printstyled("✗ method+gpu\n"; color = :red) + elseif !r.method_passed + printstyled("✗ method\n"; color = :red) else - printstyled("✗\n"; color = :red) + printstyled("✗ gpu\n"; color = :red) end end end From 7de1820cde5055d661a327a1c2fc1e956f5e2055 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 12:49:18 +0100 Subject: [PATCH 09/11] adapt project.toml --- test/gpu/Project.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/gpu/Project.toml b/test/gpu/Project.toml index aecabf6d1aa..dd879bde104 100644 --- a/test/gpu/Project.toml +++ b/test/gpu/Project.toml @@ -3,14 +3,16 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" -FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" +FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed" OrdinaryDiffEqBDF = "6ad6398a-0878-4a85-9266-38940aa047c8" +OrdinaryDiffEqFIRK = "5960d6e9-dd7a-4743-88e7-cf307b64f125" OrdinaryDiffEqNonlinearSolve = "127b3ac7-2247-4354-8eb6-78cf4e7c58e8" OrdinaryDiffEqRKIP = "a4daff8c-1d43-4ff3-8eff-f78720aeecdc" OrdinaryDiffEqRosenbrock = "43230ef6-c299-4910-a778-202eb28ce4ce" +OrdinaryDiffEqSDIRK = "2d112036-d095-4a1e-ab9a-08536f3ecdbf" RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd" SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462" SciMLOperators = "c0aeaf25-5076-4817-a8d5-81caf7dfa961" @@ -26,8 +28,8 @@ Adapt = "4" CUDA = "4, 5" ComponentArrays = "0.15" DiffEqBase = "6.182" -FastBroadcast = "0.3" FFTW = "1.8" +FastBroadcast = "0.3" FillArrays = "1" OrdinaryDiffEq = "6" OrdinaryDiffEqBDF = "1" From 503d2cd832a61f54cce0caf0976d5897b02a8525 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 13:58:02 +0100 Subject: [PATCH 10/11] apply runic --- lib/OrdinaryDiffEqCore/src/misc_utils.jl | 4 +- test/gpu/simple_dae.jl | 114 ++++++++++++----------- 2 files changed, 61 insertions(+), 57 deletions(-) diff --git a/lib/OrdinaryDiffEqCore/src/misc_utils.jl b/lib/OrdinaryDiffEqCore/src/misc_utils.jl index 65a6e56c21b..75c75f3d636 100644 --- a/lib/OrdinaryDiffEqCore/src/misc_utils.jl +++ b/lib/OrdinaryDiffEqCore/src/misc_utils.jl @@ -146,8 +146,8 @@ function find_algebraic_vars_eqs(M::Diagonal) end function find_algebraic_vars_eqs(M::AbstractMatrix) - algebraic_vars = vec(all(iszero, M, dims=1)) - algebraic_eqs = vec(all(iszero, M, dims=2)) + algebraic_vars = vec(all(iszero, M, dims = 1)) + algebraic_eqs = vec(all(iszero, M, dims = 2)) return algebraic_vars, algebraic_eqs end diff --git a/test/gpu/simple_dae.jl b/test/gpu/simple_dae.jl index adf2c54d8e6..44803183385 100644 --- a/test/gpu/simple_dae.jl +++ b/test/gpu/simple_dae.jl @@ -41,7 +41,7 @@ tspan = (0.0, 5.0) odef = ODEFunction(dae!, mass_matrix = mass_matrix, jac_prototype = jac_prototype) prob = ODEProblem(odef, u0, tspan, p) sol_ref = solve(prob, Rodas5P()) -sol_ref_krylov = solve(prob, Rodas5P(linsolve=KrylovJL_GMRES())) +sol_ref_krylov = solve(prob, Rodas5P(linsolve = KrylovJL_GMRES())) # GPU data -- we use F64 for higher accuracy for comparison u0_d = adapt(CuArray{Float64}, u0) @@ -67,41 +67,41 @@ jac_prots = [ # csr_tol = (; atol, rtol) – gpu vs cpu (CSR jac_prot) # Only specify fields that deviate from the defaults. -const DEFAULT_METHOD_TOL = (; atol = 1e-2, rtol = 1e-2) -const DEFAULT_GPU_TOL = (; atol = 1e-4, rtol = 1e-4) +const DEFAULT_METHOD_TOL = (; atol = 1.0e-2, rtol = 1.0e-2) +const DEFAULT_GPU_TOL = (; atol = 1.0e-4, rtol = 1.0e-4) solvers = [ # ── Rosenbrock 2nd order ── Rosenbrock23 => (; - csc_tol = (; atol = 2e-4, rtol = 5e-4), + csc_tol = (; atol = 2.0e-4, rtol = 5.0e-4), ), Rosenbrock32 => (; - method_tol = (; atol = 2e-2, rtol = 1.5), - csc_tol = (; atol = Inf, rtol = Inf), + method_tol = (; atol = 2.0e-2, rtol = 1.5), + csc_tol = (; atol = Inf, rtol = Inf), ), ROS2 => (;), ROS2PR => (; - csc_tol = (; atol = 3e-4, rtol = 4e-4), + csc_tol = (; atol = 3.0e-4, rtol = 4.0e-4), ), - ROS2S => (; - csc_tol = (; atol = 7e-4, rtol = 1.2e-3), + ROS2S => (; + csc_tol = (; atol = 7.0e-4, rtol = 1.2e-3), ), # ── Rosenbrock 3rd order ── - ROS3 => (;), - ROS3PR => (; + ROS3 => (;), + ROS3PR => (; method_tol = (; atol = 0.25, rtol = 15.0), ), - ROS3PRL => (; - csc_tol = (; atol = 2e-3, rtol = 4e-3), + ROS3PRL => (; + csc_tol = (; atol = 2.0e-3, rtol = 4.0e-3), ), ROS3PRL2 => (; - csc_tol = (; atol = 3e-3, rtol = 4e-3), + csc_tol = (; atol = 3.0e-3, rtol = 4.0e-3), ), - ROS3P => (; + ROS3P => (; method_tol = (; atol = 0.25, rtol = 15.0), ), - Rodas3 => (; - csc_tol = (; atol = 2e-3, rtol = 3e-3), + Rodas3 => (; + csc_tol = (; atol = 2.0e-3, rtol = 3.0e-3), ), # Rodas23W() # scalar indexing, requires large changes to `calculate_interpoldiff!` # Rodas3P() # scalar indexing @@ -113,65 +113,65 @@ solvers = [ ROS34PW1b => (; method_tol = (; atol = 0.25, rtol = 5.0), ), - ROS34PW2 => (; + ROS34PW2 => (; csc_tol = (; atol = 1.2e-3, rtol = 2.2e-3), ), - ROS34PW3 => (;), - ROS34PRw => (; - csc_tol = (; atol = 6e-4, rtol = 1e-3), + ROS34PW3 => (;), + ROS34PRw => (; + csc_tol = (; atol = 6.0e-4, rtol = 1.0e-3), ), RosShamp4 => (;), - Veldd4 => (;), - Velds4 => (;), - GRK4T => (;), - GRK4A => (;), + Veldd4 => (;), + Velds4 => (;), + GRK4T => (;), + GRK4A => (;), Ros4LStab => (;), - Rodas4 => (;), - Rodas42 => (;), - Rodas4P => (;), - Rodas4P2 => (;), - ROK4a => (;), + Rodas4 => (;), + Rodas42 => (;), + Rodas4P => (;), + Rodas4P2 => (;), + ROK4a => (;), # ── Rosenbrock 5th order ── - Rodas5 => (;), - Rodas5P => (;), + Rodas5 => (;), + Rodas5P => (;), Rodas5Pe => (;), Rodas5Pr => (;), # ── Rosenbrock 6th order ── - Rodas6P => (;), + Rodas6P => (;), # ── SDIRK (don't include fixed time step which need explicit dt) ── ImplicitEuler => (;), Trapezoid => (; - csc_tol = (; atol = 8e-3, rtol = 2.5e-2), + csc_tol = (; atol = 8.0e-3, rtol = 2.5e-2), ), - SDIRK2 => (; + SDIRK2 => (; csc_tol = (; atol = 1.5e-4, rtol = 3.5e-4), ), - Cash4 => (; - csc_tol = (; atol = 5e-3, rtol = 8e-3), + Cash4 => (; + csc_tol = (; atol = 5.0e-3, rtol = 8.0e-3), ), - Hairer4 => (; - csc_tol = (; atol = 8e-3, rtol = 6e-2), + Hairer4 => (; + csc_tol = (; atol = 8.0e-3, rtol = 6.0e-2), ), - Hairer42 => (; - csc_tol = (; atol = 7e-3, rtol = 5e-2), + Hairer42 => (; + csc_tol = (; atol = 7.0e-3, rtol = 5.0e-2), ), # ── BDF ── ABDF2 => (; method_csc_tol = (; atol = Inf, rtol = Inf), # ABDF2 + Krylov diverges vs Rodas5P + Krylov - csc_tol = (; atol = 2e-4, rtol = 7e-4), + csc_tol = (; atol = 2.0e-4, rtol = 7.0e-4), ), QNDF1 => (;), QNDF2 => (; - csc_tol = (; atol = 4e-3, rtol = 5e-3), + csc_tol = (; atol = 4.0e-3, rtol = 5.0e-3), ), # QNDF() # 🔧 DeviceMemory error in LinAlg QBDF1 => (; - method_tol = (; atol = 2e-2, rtol = 0.2), + method_tol = (; atol = 2.0e-2, rtol = 0.2), ), QBDF2 => (; - gpu_tol = (; atol = 2e-3, rtol = 2e-3), - csc_tol = (; atol = 4e-3, rtol = 7e-3), - csr_tol = (; atol = 2e-3, rtol = 2e-3), + gpu_tol = (; atol = 2.0e-3, rtol = 2.0e-3), + csc_tol = (; atol = 4.0e-3, rtol = 7.0e-3), + csr_tol = (; atol = 2.0e-3, rtol = 2.0e-3), ), # QBDF() # DeviceMemory error in LinAlg # FBDF() # scalar indexing -> needs extensive work on reinitFBDF! @@ -222,12 +222,12 @@ function run_dae_tests() gtol = _get_tol(overrides, jn) mtol = _get_method_tol(overrides, jn) - # CSC will allways fall back to krylov so the ref solution should do to + # CSC will always fall back to krylov so the ref solution should do to krylov = (jn == "CSC") _sol_ref = krylov ? sol_ref_krylov : sol_ref # CPU: this solver vs reference - cpu_alg = krylov ? sv(linsolve=KrylovJL_GMRES()) : sv() + cpu_alg = krylov ? sv(linsolve = KrylovJL_GMRES()) : sv() sol_cpu = solve(prob, cpu_alg) cpu_abs, cpu_rel = maxerrs(sol_cpu, _sol_ref) @@ -240,10 +240,14 @@ function run_dae_tests() gpu_abs, gpu_rel = maxerrs(sol_d, sol_cpu) method_passed = (cpu_abs < mtol.atol) || (cpu_rel < mtol.rtol) - gpu_passed = (gpu_abs < gtol.atol) || (gpu_rel < gtol.rtol) + gpu_passed = (gpu_abs < gtol.atol) || (gpu_rel < gtol.rtol) passed = method_passed && gpu_passed - push!(results, (; solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel, - gtol, mtol, method_passed, gpu_passed, passed, error = "")) + push!( + results, (; + solver = sn, jac = jn, cpu_abs, cpu_rel, gpu_abs, gpu_rel, + gtol, mtol, method_passed, gpu_passed, passed, error = "", + ) + ) @test passed end @@ -251,7 +255,7 @@ function run_dae_tests() end function show_results(results) - function _fmt(val; threshold = 1e-3) + function _fmt(val; threshold = 1.0e-3) isnan(val) && return " --- " s = @sprintf("%.2e", val) if val > threshold @@ -269,7 +273,7 @@ function show_results(results) printstyled(label, "ERROR: ", r.error, "\n"; color = :yellow) else print(label) - print(_fmt(r.cpu_abs, threshold=1e-2), " ", _fmt(r.cpu_rel, threshold=1e-2), " ") + print(_fmt(r.cpu_abs, threshold = 1.0e-2), " ", _fmt(r.cpu_rel, threshold = 1.0e-2), " ") print(_fmt(r.gpu_abs), " ", _fmt(r.gpu_rel), " ") if r.passed printstyled("✓\n"; color = :green) @@ -282,7 +286,7 @@ function show_results(results) end end end - println("-"^85) + return println("-"^85) end @testset "GPU DAE solver compatibility" begin From 7d1d6a1d6d4980c3f1c8e66074baca42a450be2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hans=20W=C3=BCrfel?= Date: Mon, 23 Feb 2026 14:03:51 +0100 Subject: [PATCH 11/11] fix missing imports/packages --- .../test/algebraic_vars_detection_tests.jl | 1 + test/gpu/Project.toml | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl b/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl index 3b3b3caa1af..0c88682844c 100644 --- a/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl +++ b/lib/OrdinaryDiffEqCore/test/algebraic_vars_detection_tests.jl @@ -1,6 +1,7 @@ using Test using SparseArrays using OrdinaryDiffEqCore: find_algebraic_vars_eqs +using LinearAlgebra @testset "Sparse Algebraic Detection Performance" begin # Test 1: Correctness - results should match between sparse and dense methods diff --git a/test/gpu/Project.toml b/test/gpu/Project.toml index dd879bde104..478a27e51ad 100644 --- a/test/gpu/Project.toml +++ b/test/gpu/Project.toml @@ -1,6 +1,7 @@ [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CUDSS = "45b445bb-4962-46a0-9369-b4df9d0f772e" ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e" FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341" @@ -25,7 +26,8 @@ OrdinaryDiffEqRosenbrock = {path = "../../lib/OrdinaryDiffEqRosenbrock"} [compat] Adapt = "4" -CUDA = "4, 5" +CUDA = "5" +CUDSS = "0.6.7" ComponentArrays = "0.15" DiffEqBase = "6.182" FFTW = "1.8" @@ -33,9 +35,11 @@ FastBroadcast = "0.3" FillArrays = "1" OrdinaryDiffEq = "6" OrdinaryDiffEqBDF = "1" +OrdinaryDiffEqFIRK = "1" OrdinaryDiffEqNonlinearSolve = "1" OrdinaryDiffEqRKIP = "1" OrdinaryDiffEqRosenbrock = "1" +OrdinaryDiffEqSDIRK = "1" RecursiveArrayTools = "3" SciMLBase = "2.99" SciMLOperators = "1.3"