6 changes: 5 additions & 1 deletion docs/Project.toml
@@ -7,6 +7,7 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
DiffEqCallbacks = "459566f4-90b8-5000-8ac3-15dfb0a30def"
DiffEqNoiseProcess = "77a26b50-5914-5dd7-bc55-306e6241c503"
DifferentialEquations = "0c46a032-eb83-5123-abaf-570d42b7fbaa"
DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"
@@ -15,6 +16,7 @@ IterTools = "c8e1da08-722c-5040-9ed9-7db0dc04731e"
Lux = "b2108857-7c20-44ae-9111-449ecde12c47"
LuxCUDA = "d0bbae9a-e099-4d5b-a835-1c6931763bda"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
Mooncake = "da2b9cff-9c12-43a0-ae48-6db2b0edb7d6"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
OptimizationOptimJL = "36348300-93cb-4f02-beb5-3c3902f8871e"
OptimizationOptimisers = "42dfb2eb-d2b4-4451-abcd-913932933ac1"
@@ -35,13 +37,14 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
Calculus = "0.5"
ComponentArrays = "0.15"
ComponentArrays = "0.15.34"
DataInterpolations = "3.10, 4, 5, 6, 7, 8"
DelayDiffEq = "5"
DelimitedFiles = "1"
DiffEqCallbacks = "2.24, 3, 4"
DiffEqNoiseProcess = "5.14"
DifferentialEquations = "7"
DifferentiationInterface = "0.6, 0.7"
Documenter = "1"
Enzyme = "0.12, 0.13"
Flux = "0.14, 0.15, 0.16"
@@ -50,6 +53,7 @@ IterTools = "1"
Lux = "1"
LuxCUDA = "0.3"
MLUtils = "0.4"
Mooncake = "0.5"
Optimization = "3.9, 4, 5"
OptimizationOptimJL = "0.2, 0.3, 0.4"
OptimizationOptimisers = "0.2, 0.3"
7 changes: 5 additions & 2 deletions docs/src/Benchmark.md
@@ -40,7 +40,8 @@ Quick summary:
import OrdinaryDiffEq as ODE
import Lux
import SciMLSensitivity as SMS
import Zygote
import Mooncake
import DifferentiationInterface as DI
import BenchmarkTools
import Random
import ComponentArrays as CA
@@ -80,7 +81,9 @@ for sensealg in (SMS.InterpolatingAdjoint(autojacvec = SMS.ZygoteVJP()),
return loss
end

t = BenchmarkTools.@belapsed Zygote.gradient($loss_neuralode, $u0, $ps, $st)
backend = DI.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
loss_ps = p -> loss_neuralode(u0, p, st)
t = BenchmarkTools.@belapsed DI.gradient($loss_ps, $backend, $ps)
println("$(sensealg) took $(t)s")
end
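# A sketch of amortizing the AD setup cost across calls: DI's preparation API
# builds the Mooncake rule cache once, and `DI.gradient` then reuses it.
# Assumes the prepared-operator form of the DifferentiationInterface API:
#
#   prep = DI.prepare_gradient(loss_ps, backend, ps)
#   t = BenchmarkTools.@belapsed DI.gradient($loss_ps, $prep, $backend, $ps)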

15 changes: 10 additions & 5 deletions docs/src/examples/dde/delay_diffeq.md
@@ -10,6 +10,7 @@ import Optimization as OPT
import SciMLSensitivity as SMS
import OptimizationPolyalgorithms as OPA
import DelayDiffEq as DDE
import Mooncake

# Define the same LV equation, but including a delay parameter
function delay_lotka_volterra!(du, u, h, p, t)
@@ -35,7 +36,7 @@ prob_dde = DDE.DDEProblem(delay_lotka_volterra!, u0, h, (0.0, 10.0),

function predict_dde(p)
return Array(ODE.solve(prob_dde, DDE.MethodOfSteps(ODE.Tsit5());
u0, p, saveat = 0.1, sensealg = SMS.ReverseDiffAdjoint()))
u0, p, saveat = 0.1))
end

loss_dde(p) = sum(abs2, x - 1 for x in predict_dde(p))
@@ -50,14 +51,18 @@ callback = function (state, l; doplot = false)
return false
end

adtype = OPT.AutoZygote()
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction((x, p) -> loss_dde(x), adtype)
optprob = OPT.OptimizationProblem(optf, p)
result_dde = OPT.solve(optprob, OPA.PolyOpt(); maxiters = 300, callback)
```

Notice that we chose `sensealg = ReverseDiffAdjoint()` to utilize the ReverseDiff.jl
reverse-mode to handle the delay differential equation.
The `sensealg` is left at its default. For DDEs the automatic choice is
[`ForwardDiffSensitivity`](@ref) (which differentiates through
`MethodOfSteps` via dual numbers) for problems with fewer than 100
parameters, and [`ReverseDiffAdjoint`](@ref) for larger ones —
[continuous adjoints](@ref sensitivity_diffeq) are not yet defined for
DDEs, so the discretize-then-optimize methods are the only option.
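To pin the choice instead of relying on the heuristic, pass `sensealg` explicitly. A minimal sketch (the helper name is illustrative) forcing the discrete reverse-mode adjoint that this example previously hard-coded:

```julia
function predict_dde_reversediff(p)
    return Array(ODE.solve(prob_dde, DDE.MethodOfSteps(ODE.Tsit5());
        u0, p, saveat = 0.1, sensealg = SMS.ReverseDiffAdjoint()))
end
```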

We define a callback to display the solution at the current parameters for each step of the training:

Expand All @@ -76,7 +81,7 @@ end
We use `Optimization.solve` to optimize the parameters for our loss function:

```@example dde
adtype = OPT.AutoZygote()
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction((x, p) -> loss_dde(x), adtype)
optprob = OPT.OptimizationProblem(optf, p)
result_dde = OPT.solve(optprob, OPA.PolyOpt(); callback)
9 changes: 7 additions & 2 deletions docs/src/examples/hybrid_jump/bouncing_ball.md
@@ -13,6 +13,7 @@ import OptimizationPolyalgorithms as OPA
import SciMLSensitivity as SMS
import OrdinaryDiffEq as ODE
import DiffEqCallbacks as DEC
import Mooncake

function f(du, u, p, t)
du[1] = u[2]
@@ -44,11 +45,15 @@ the value 20:
function loss(θ)
sol = ODE.solve(prob, ODE.Tsit5(), p = [9.8, θ[1]]; callback)
target = 20.0
abs2(sol[end][1] - target)
# Use `last(sol.u)[1]` instead of `sol[end][1]` — Mooncake's pullback for
# `getindex(::ODESolution, end)` currently has a `BoundsError` bug
# (`SciMLBaseMooncakeExt._scatter_pullback`). Indexing the underlying
# `sol.u::Vector{Vector{Float64}}` directly avoids the bad path.
abs2(last(sol.u)[1] - target)
end

loss([0.8])
adtype = OPT.AutoZygote()
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction((x, p) -> loss(x), adtype)
optprob = OPT.OptimizationProblem(optf, [0.8])
@time res = OPT.solve(optprob, OPA.PolyOpt(), maxiters = 300)
10 changes: 6 additions & 4 deletions docs/src/examples/hybrid_jump/hybrid_diffeq.md
@@ -12,6 +12,7 @@ import ComponentArrays as CA
import Random
import SciMLSensitivity as SMS
import Lux
import Mooncake
import OrdinaryDiffEq as ODE
import Plots
import Optimization as OPT
@@ -50,9 +51,7 @@ cb = DEC.PresetTimeCallback(dosetimes, affect!, save_positions = (false, false))

function predict_n_ode(p)
_prob = ODE.remake(prob; p)
Array(ODE.solve(_prob, ODE.Tsit5(); u0 = z0, p, callback = cb, saveat = t,
sensealg = SMS.ReverseDiffAdjoint()))[1:2, :]
#Array(solve(prob,Tsit5();u0=z0,p,saveat=t))[1:2,:]
Array(ODE.solve(_prob, ODE.Tsit5(); u0 = z0, p, callback = cb, saveat = t))[1:2, :]
end

function loss_n_ode(p, _)
Expand All @@ -73,7 +72,10 @@ cba = function (state, l; doplot = false) #callback function to observe training
end

res = OPT.solve(
OPT.OptimizationProblem(OPT.OptimizationFunction(loss_n_ode, OPT.AutoZygote()),
OPT.OptimizationProblem(
OPT.OptimizationFunction(
loss_n_ode, OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
),
CA.ComponentArray(ps)),
OPO.Adam(0.05); callback = cba, maxiters = 1000)
```
1 change: 1 addition & 0 deletions docs/src/examples/neural_ode/simplechains.md
@@ -1,5 +1,6 @@
# Faster Neural Ordinary Differential Equations with SimpleChains


[SimpleChains](https://github.com/PumasAI/SimpleChains.jl) has demonstrated performance boosts of ~5x and ~30x over mainstream deep learning frameworks like PyTorch for the training and evaluation of small neural networks. For the nitty-gritty details, as well as some SciML-related videos on the need for and applications of such a library, see this [blogpost](https://julialang.org/blog/2022/04/simple-chains/). As for Scientific Machine Learning: how do we even begin to train neural ODEs with a generic deep learning library like this?

## Training Data
3 changes: 2 additions & 1 deletion docs/src/examples/ode/exogenous_input.md
@@ -49,6 +49,7 @@ import OptimizationPolyalgorithms as OPA
import OptimizationOptimisers as OPO
import Plots
import Random
import Mooncake

rng = Random.default_rng()
tspan = (0.1, 10.0)
@@ -93,7 +94,7 @@ function loss(p)
return sum(abs2.(y[1:N] .- sol')) / N
end

adtype = OPT.AutoZygote()
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction((x, p) -> loss(x), adtype)
optprob = OPT.OptimizationProblem(optf, CA.ComponentArray{Float64}(p_model))

13 changes: 8 additions & 5 deletions docs/src/examples/ode/second_order_adjoints.md
@@ -13,6 +13,7 @@ optimization, while `KrylovTrustRegion` will utilize a Krylov-based method
with Hessian-vector products (never forming the Hessian) for large parameter
optimizations.


```@example secondorderadjoints
import SciMLSensitivity as SMS
import Lux
Expand All @@ -23,6 +24,7 @@ import OrdinaryDiffEq as ODE
import Plots
import Random
import OptimizationOptimJL as OOJ
import Mooncake

u0 = Float32[2.0; 0.0]
datasize = 30
@@ -83,13 +85,14 @@ callback = function (state, l; doplot = false)
return l < 0.01
end

adtype = OPT.AutoZygote()
optf = OPT.OptimizationFunction((x, p) -> loss_neuralode(x), adtype)

optprob1 = OPT.OptimizationProblem(optf, ps)
adtype1 = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf1 = OPT.OptimizationFunction((x, p) -> loss_neuralode(x), adtype1)
optprob1 = OPT.OptimizationProblem(optf1, ps)
pstart = OPT.solve(optprob1, OPO.Adam(0.01); callback, maxiters = 100).u

optprob2 = OPT.OptimizationProblem(optf, pstart)
adtype2 = OPT.AutoZygote()
optf2 = OPT.OptimizationFunction((x, p) -> loss_neuralode(x), adtype2)
optprob2 = OPT.OptimizationProblem(optf2, pstart)
pmin = OPT.solve(optprob2, OOJ.NewtonTrustRegion(); callback, maxiters = 200)
```
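As noted above, `KrylovTrustRegion` never forms the Hessian and only evaluates Hessian-vector products, which suits larger parameter counts. A sketch of finishing with it instead, assuming `KrylovTrustRegion` is reachable through OptimizationOptimJL's re-export of Optim:

```julia
# Hessian-free second-order refinement: only Hessian-vector products are formed.
pmin_krylov = OPT.solve(optprob2, OOJ.Optim.KrylovTrustRegion(); callback, maxiters = 200)
```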

3 changes: 2 additions & 1 deletion docs/src/examples/ode/second_order_neural.md
@@ -29,6 +29,7 @@ import OptimizationOptimisers as OPO
import RecursiveArrayTools
import Random
import ComponentArrays as CA
import Mooncake

u0 = Float32[0.0; 2.0]
du0 = Float32[0.0; 0.0]
@@ -61,7 +62,7 @@ callback = function (state, l)
l < 0.01
end

adtype = OPT.AutoZygote()
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction((x, p) -> loss_n_ode(x), adtype)
optprob = OPT.OptimizationProblem(optf, ps)

1 change: 1 addition & 0 deletions docs/src/examples/optimal_control/feedback_control.md
@@ -4,6 +4,7 @@ You can also mix a known differential
equation, so that the parameters and the neural network are estimated
simultaneously!


We will assume that we know the dynamics of the second equation
(linear dynamics), and our goal is to find a neural network that is dependent
on the current state of the dynamical system that will control the second
1 change: 0 additions & 1 deletion docs/src/examples/optimal_control/optimal_control.md
@@ -43,7 +43,6 @@ import Optimization as OPT
import OptimizationOptimJL as OOJ
import OptimizationOptimisers as OPO
import SciMLSensitivity as SMS
import Zygote
import Plots
import Statistics
import Random
7 changes: 4 additions & 3 deletions docs/src/examples/pde/brusselator.md
@@ -156,7 +156,7 @@ First, we have to define and configure the neural network that has to be used fo

```@example bruss
import Lux, Random, Optimization as OPT, OptimizationOptimJL as OOJ,
SciMLSensitivity as SMS, Zygote
SciMLSensitivity as SMS, Mooncake

model = Lux.Chain(Lux.Dense(2 => 16, tanh), Lux.Dense(16 => 1))
rng = Random.default_rng()
Expand Down Expand Up @@ -223,12 +223,13 @@ function loss_fn(ps, _)
end
```

Once the loss function is defined, we use the ADAM optimizer to train the neural network. The optimization problem is defined using SciML's `Optimization.jl` tools, and gradients are computed via automatic differentiation using `AutoZygote()` from `SciMLSensitivity`:
Once the loss function is defined, we use the ADAM optimizer to train the neural network. The optimization problem is defined using SciML's `Optimization.jl` tools, and gradients are computed via automatic differentiation using Mooncake through the `SciMLSensitivity` adjoint chain:

```@example bruss
println("[Training] Starting optimization...")
import OptimizationOptimisers as OPO
optf = OPT.OptimizationFunction(loss_fn, SMS.AutoZygote())
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction(loss_fn, adtype)
optprob = OPT.OptimizationProblem(optf, ps_init)
loss_history = Float32[]

8 changes: 4 additions & 4 deletions docs/src/examples/pde/pde_constrained.md
@@ -6,7 +6,7 @@ This example uses a prediction model to optimize the one-dimensional Heat Equati
```@example pde
import SciMLSensitivity as SMS
import DelimitedFiles, Plots
import OrdinaryDiffEq as ODE, Optimization as OPT, OptimizationPolyalgorithms as OPA, Zygote
import OrdinaryDiffEq as ODE, Optimization as OPT, OptimizationPolyalgorithms as OPA, Mooncake

# Problem setup parameters:
Lx = 10.0
Expand Down Expand Up @@ -92,7 +92,7 @@ cb((; u = ps), loss(ps)) # Testing callback function
Plots.scatter(sol[:, end], label = "Truth", size = (800, 500))
Plots.plot!(PRED[end][:, end], lw = 2, label = "Prediction")

adtype = OPT.AutoZygote()
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction((x, p) -> loss(x), adtype)

optprob = OPT.OptimizationProblem(optf, ps)
@@ -107,7 +107,7 @@ res = OPT.solve(optprob, OPA.PolyOpt(), callback = cb)
```@example pde2
import SciMLSensitivity as SMS
import DelimitedFiles, Plots
import OrdinaryDiffEq as ODE, Optimization as OPT, OptimizationPolyalgorithms as OPA, Zygote
import OrdinaryDiffEq as ODE, Optimization as OPT, OptimizationPolyalgorithms as OPA, Mooncake
```

### Parameters
@@ -283,7 +283,7 @@ The resulting best parameters are stored in `res` and `res.u` returns the
parameters that minimize the cost function.

```@example pde2
adtype = OPT.AutoZygote()
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction((x, p) -> loss(x), adtype)

optprob = OPT.OptimizationProblem(optf, ps)
1 change: 1 addition & 0 deletions docs/src/examples/sde/SDE_control.md
@@ -18,6 +18,7 @@ to ultimately prepare and stabilize the qubit in the excited state.
Before getting to the explanation, here's some code to start with. We will
follow a full explanation of the definition and training process:


```@example
# load packages
import SciMLSensitivity as SMS, Optimization as OPT, OptimizationOptimisers as OPO
6 changes: 4 additions & 2 deletions docs/src/examples/sde/optimization_sde.md
@@ -94,7 +94,8 @@ function cb2(st, l)
end
```

We can then use `Optimization.solve` to fit the SDE:
We can then use `Optimization.solve` to fit the SDE.


```@example sde
import Optimization as OPT, Zygote, OptimizationOptimisers as OPO
@@ -178,7 +179,8 @@ end
Let's optimize

```@example sde
adtype = OPT.AutoZygote()
import Mooncake
adtype = OPT.AutoMooncake(; config = Mooncake.Config(; friendly_tangents = true))
optf = OPT.OptimizationFunction((x, p) -> loss_sde(x), adtype)

optprob = OPT.OptimizationProblem(optf, p)
13 changes: 8 additions & 5 deletions docs/src/faq.md
@@ -48,17 +48,20 @@ Enzyme.autodiff(Enzyme.Reverse, Enzyme.Duplicated(_f, _tmp6),
This is exactly the inner core Enzyme call, and if this call fails, that is the issue that
needs to be fixed.

And similarly, for out-of-place functions the Zygote isolation is as follows:
And similarly, for out-of-place functions the Mooncake isolation is as follows:

```julia
import Mooncake
p = prob.p
y = prob.u0
f = prob.f
λ = zero(prob.u0)
_dy, back = Zygote.pullback(y, p) do u, p
vec(f(u, p, t))
end
tmp1, tmp2 = back(λ)
# Build the Mooncake pullback for the inner-rhs evaluation `f(u, p)` and
# apply the cotangent `λ` to recover the seed gradients `tmp1` (wrt `y`)
# and `tmp2` (wrt `p`).
g = (u, p) -> vec(f(u, p, t))
cache = Mooncake.prepare_pullback_cache(g, y, p)
_dy, (_, tmp1, tmp2) = Mooncake.value_and_pullback!!(cache, λ, g, y, p)
```
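If the isolated pullback runs, it can be sanity-checked against a finite-difference directional derivative. A quick sketch, with an arbitrary probe direction `v` and step `ε` (note the snippet above seeds `λ` with zeros, so use a nonzero cotangent for a meaningful comparison):

```julia
import LinearAlgebra

λ = ones(length(y))  # nonzero cotangent
_dy, (_, tmp1, tmp2) = Mooncake.value_and_pullback!!(cache, λ, g, y, p)

# ⟨λ, Jv⟩ from finite differences should match ⟨Jᵀλ, v⟩ = ⟨tmp1, v⟩.
v = randn(length(y))
ε = 1e-6
Jv = (g(y .+ ε .* v, p) .- g(y, p)) ./ ε
isapprox(LinearAlgebra.dot(λ, Jv), LinearAlgebra.dot(tmp1, v); rtol = 1e-3)
```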

## How do I use custom parameter types with adjoint sensitivity analysis?