From c84bcecee73a8a735d8918e627b0959dc93d939a Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 10 Dec 2025 17:46:41 +0100
Subject: [PATCH 1/5] Remove using namespace amrex from HpMultiGrid.cpp

---
 src/mg_solver/HpMultiGrid.cpp | 542 ++++++++++++++++++----------------
 1 file changed, 289 insertions(+), 253 deletions(-)
diff --git a/src/mg_solver/HpMultiGrid.cpp b/src/mg_solver/HpMultiGrid.cpp
index ba9a52d9de..f6c6bbb3dc 100644
--- a/src/mg_solver/HpMultiGrid.cpp
+++ b/src/mg_solver/HpMultiGrid.cpp
@@ -9,8 +9,6 @@
 #include "utils/GPUUtil.H"
 #include <algorithm>
 
-using namespace amrex;
-
 namespace hpmg {
 
 namespace {
@@ -19,9 +17,9 @@ namespace {
     constexpr int n_cell_single = 32; // switch to single block when box is smaller than this
 #endif
 
-Box valid_domain_box (Box const& domain)
+amrex::Box valid_domain_box (amrex::Box const& domain)
 {
-    return domain.cellCentered() ? domain : amrex::grow(domain, IntVect(-1,-1,0));
+    return domain.cellCentered() ? domain : amrex::grow(domain, amrex::IntVect(-1,-1,0));
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -30,9 +28,9 @@ Box valid_domain_box (Box const& domain)
 
 template <typename T, typename U>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void restrict_cc (int i, int j, int n, Array4<T> const& crse, Array4<U> const& fine)
+void restrict_cc (int i, int j, int n, amrex::Array4<T> const& crse, amrex::Array4<U> const& fine)
 {
-    crse(i,j,0,n) = Real(0.25)*(fine(2*i  ,2*j  ,0,n) +
+    crse(i,j,0,n) = amrex::Real(0.25)*(fine(2*i  ,2*j  ,0,n) +
                                 fine(2*i+1,2*j  ,0,n) +
                                 fine(2*i  ,2*j+1,0,n) +
                                 fine(2*i+1,2*j+1,0,n));
@@ -40,22 +38,22 @@ void restrict_cc (int i, int j, int n, Array4<T> const& crse, Array4<U> const& f
 
 template <typename T, typename U>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void restrict_nd (int i, int j, int n, Array4<T> const& crse, Array4<U> const& fine)
+void restrict_nd (int i, int j, int n, amrex::Array4<T> const& crse, amrex::Array4<U> const& fine)
 {
-    crse(i,j,0,n) = Real(1./16.) * (fine(2*i-1,2*j-1,0,n) +
-                           Real(2.)*fine(2*i  ,2*j-1,0,n) +
+    crse(i,j,0,n) = amrex::Real(1./16.) * (fine(2*i-1,2*j-1,0,n) +
+                           amrex::Real(2.)*fine(2*i  ,2*j-1,0,n) +
                                     fine(2*i+1,2*j-1,0,n) +
-                           Real(2.)*fine(2*i-1,2*j  ,0,n) +
-                           Real(4.)*fine(2*i  ,2*j  ,0,n) +
-                           Real(2.)*fine(2*i+1,2*j  ,0,n) +
+                           amrex::Real(2.)*fine(2*i-1,2*j  ,0,n) +
+                           amrex::Real(4.)*fine(2*i  ,2*j  ,0,n) +
+                           amrex::Real(2.)*fine(2*i+1,2*j  ,0,n) +
                                     fine(2*i-1,2*j+1,0,n) +
-                           Real(2.)*fine(2*i  ,2*j+1,0,n) +
+                           amrex::Real(2.)*fine(2*i  ,2*j+1,0,n) +
                                     fine(2*i+1,2*j+1,0,n));
 }
 
 template <typename T, typename U>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void interpadd_cc (int i, int j, int n, Array4<T> const& fine, Array4<U> const& crse)
+void interpadd_cc (int i, int j, int n, amrex::Array4<T> const& fine, amrex::Array4<U> const& crse)
 {
     int ic = amrex::coarsen(i,2);
     int jc = amrex::coarsen(j,2);
@@ -64,7 +62,7 @@ void interpadd_cc (int i, int j, int n, Array4<T> const& fine, Array4<U> const&
 
 template <typename T, typename U>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void interpadd_nd (int i, int j, int n, Array4<T> const& fine, Array4<U> const& crse)
+void interpadd_nd (int i, int j, int n, amrex::Array4<T> const& fine, amrex::Array4<U> const& crse)
 {
     int ic = amrex::coarsen(i,2);
     int jc = amrex::coarsen(j,2);
@@ -74,13 +72,13 @@ void interpadd_nd (int i, int j, int n, Array4<T> const& fine, Array4<U> const&
         fine(i,j,0,n) += (crse(ic  ,jc  ,0,n) +
                           crse(ic+1,jc  ,0,n) +
                           crse(ic  ,jc+1,0,n) +
-                          crse(ic+1,jc+1,0,n))*Real(0.25);
+                          crse(ic+1,jc+1,0,n))*amrex::Real(0.25);
     } else if (i_is_odd) {
         fine(i,j,0,n) += (crse(ic  ,jc,0,n) +
-                          crse(ic+1,jc,0,n))*Real(0.5);
+                          crse(ic+1,jc,0,n))*amrex::Real(0.5);
     } else if (j_is_odd) {
         fine(i,j,0,n) += (crse(ic,jc  ,0,n) +
-                          crse(ic,jc+1,0,n))*Real(0.5);
+                          crse(ic,jc+1,0,n))*amrex::Real(0.5);
     } else {
         fine(i,j,0,n) += crse(ic,jc,0,n);
     }
@@ -89,8 +87,8 @@ void interpadd_nd (int i, int j, int n, Array4<T> const& fine, Array4<U> const&
 // out of place version used before shared memory gsrb
 template <typename T, typename U, typename V>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void interpcpy_cc (int i, int j, int n, Array4<T> const& fine_in, Array4<U> const& crse,
-                Array4<V> const& fine_out)
+void interpcpy_cc (int i, int j, int n, amrex::Array4<T> const& fine_in,
+                   amrex::Array4<U> const& crse, amrex::Array4<V> const& fine_out)
 {
     int ic = amrex::coarsen(i,2);
     int jc = amrex::coarsen(j,2);
@@ -99,8 +97,8 @@ void interpcpy_cc (int i, int j, int n, Array4<T> const& fine_in, Array4<U> cons
 
 template <typename T, typename U, typename V>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void interpcpy_nd (int i, int j, int n, Array4<T> const& fine_in, Array4<U> const& crse,
-                Array4<V> const& fine_out)
+void interpcpy_nd (int i, int j, int n, amrex::Array4<T> const& fine_in,
+                   amrex::Array4<U> const& crse, amrex::Array4<V> const& fine_out)
 {
     int ic = amrex::coarsen(i,2);
     int jc = amrex::coarsen(j,2);
@@ -110,20 +108,20 @@ void interpcpy_nd (int i, int j, int n, Array4<T> const& fine_in, Array4<U> cons
         fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic  ,jc  ,0,n) +
                                                 crse(ic+1,jc  ,0,n) +
                                                 crse(ic  ,jc+1,0,n) +
-                                                crse(ic+1,jc+1,0,n))*Real(0.25);
+                                                crse(ic+1,jc+1,0,n))*amrex::Real(0.25);
     } else if (i_is_odd) {
         fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic  ,jc,0,n) +
-                                                crse(ic+1,jc,0,n))*Real(0.5);
+                                                crse(ic+1,jc,0,n))*amrex::Real(0.5);
     } else if (j_is_odd) {
         fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic,jc  ,0,n) +
-                                                crse(ic,jc+1,0,n))*Real(0.5);
+                                                crse(ic,jc+1,0,n))*amrex::Real(0.5);
     } else {
         fine_out(i,j,0,n) = fine_in(i,j,0,n) +  crse(ic,jc,0,n);
     }
 }
 
-void restriction (Box const& box, Array4<Real> const& crse, Array4<Real const> const& fine,
-                  int num_comps)
+void restriction (amrex::Box const& box, amrex::Array4<amrex::Real> const& crse,
+                  amrex::Array4<amrex::Real const> const& fine, int num_comps)
 {
     if (box.cellCentered()) {
         hpmg::ParallelFor(to2D(box), num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept
@@ -139,9 +137,10 @@ void restriction (Box const& box, Array4<Real> const& crse, Array4<Real const> c
     }
 }
 
-void interpolation_outofplace (Box const& box, Array4<Real const> const& fine_in,
-                               Array4<Real const> const& crse, Array4<Real> const& fine_out,
-                               int num_comps)
+void interpolation_outofplace (amrex::Box const& box,
+                               amrex::Array4<amrex::Real const> const& fine_in,
+                               amrex::Array4<amrex::Real const> const& crse,
+                               amrex::Array4<amrex::Real> const& fine_out, int num_comps)
 {
     if (box.cellCentered()) {
         hpmg::ParallelFor(to2D(box), num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept
@@ -162,21 +161,21 @@ void interpolation_outofplace (Box const& box, Array4<Real const> const& fine_in
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-Real laplacian (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-                Array4<Real> const& phi, Real facx, Real facy)
+amrex::Real laplacian (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
+                amrex::Array4<amrex::Real> const& phi, amrex::Real facx, amrex::Real facy)
 {
-    Real lap = Real(-2.)*(facx+facy)*phi(i,j,0,n);
+    amrex::Real lap = amrex::Real(-2.)*(facx+facy)*phi(i,j,0,n);
     if (i == ilo) {
-        lap += facx * (Real(4./3.)*phi(i+1,j,0,n) - Real(2.)*phi(i,j,0,n));
+        lap += facx * (amrex::Real(4./3.)*phi(i+1,j,0,n) - amrex::Real(2.)*phi(i,j,0,n));
     } else if (i == ihi) {
-        lap += facx * (Real(4./3.)*phi(i-1,j,0,n) - Real(2.)*phi(i,j,0,n));
+        lap += facx * (amrex::Real(4./3.)*phi(i-1,j,0,n) - amrex::Real(2.)*phi(i,j,0,n));
     } else {
         lap += facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n));
     }
     if (j == jlo) {
-        lap += facy * (Real(4./3.)*phi(i,j+1,0,n) - Real(2.)*phi(i,j,0,n));
+        lap += facy * (amrex::Real(4./3.)*phi(i,j+1,0,n) - amrex::Real(2.)*phi(i,j,0,n));
     } else if (j == jhi) {
-        lap += facy * (Real(4./3.)*phi(i,j-1,0,n) - Real(2.)*phi(i,j,0,n));
+        lap += facy * (amrex::Real(4./3.)*phi(i,j-1,0,n) - amrex::Real(2.)*phi(i,j,0,n));
     } else {
         lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n));
     }
@@ -184,52 +183,57 @@ Real laplacian (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
 }
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-Real residual1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-                Array4<Real> const& phi, Real rhs, Real acf, Real facx, Real facy)
+amrex::Real residual1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
+                       amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf,
+                       amrex::Real facx, amrex::Real facy)
 {
-    Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy);
+    amrex::Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy);
     return rhs + acf*phi(i,j,0,n) - lap;
 }
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-Real residual2r (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                 Array4<Real> const& phi, Real rhs, Real acf_r, Real acf_i,
-                 Real facx, Real facy)
+amrex::Real residual2r (int i, int j, int ilo, int jlo, int ihi, int jhi,
+                        amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf_r,
+                        amrex::Real acf_i,
+                        amrex::Real facx, amrex::Real facy)
 {
-    Real lap = laplacian(i,j,0,ilo,jlo,ihi,jhi,phi,facx,facy);
+    amrex::Real lap = laplacian(i,j,0,ilo,jlo,ihi,jhi,phi,facx,facy);
     return rhs + acf_r*phi(i,j,0,0) - acf_i*phi(i,j,0,1) - lap;
 }
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-Real residual2i (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                 Array4<Real> const& phi, Real rhs, Real acf_r, Real acf_i,
-                 Real facx, Real facy)
+amrex::Real residual2i (int i, int j, int ilo, int jlo, int ihi, int jhi,
+                        amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf_r,
+                        amrex::Real acf_i,
+                        amrex::Real facx, amrex::Real facy)
 {
-    Real lap = laplacian(i,j,1,ilo,jlo,ihi,jhi,phi,facx,facy);
+    amrex::Real lap = laplacian(i,j,1,ilo,jlo,ihi,jhi,phi,facx,facy);
     return rhs + acf_i*phi(i,j,0,0) + acf_r*phi(i,j,0,1) - lap;
 }
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-Real residual3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-                Array4<Real> const& phi, Real rhs, Real facx, Real facy)
+amrex::Real residual3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
+                       amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real facx,
+                       amrex::Real facy)
 {
-    Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy);
+    amrex::Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy);
     return rhs - lap;
 }
 
 #if !defined(AMREX_USE_CUDA) && !defined(AMREX_USE_HIP) && defined(AMREX_USE_GPU)
 // res = rhs - L(phi)
-void compute_residual (Box const& box, Array4<Real> const& res,
-                       Array4<Real> const& phi, Array4<Real const> const& rhs,
-                       Array4<Real const> const& acf, Real dx, Real dy,
+void compute_residual (amrex::Box const& box, amrex::Array4<amrex::Real> const& res,
+                       amrex::Array4<amrex::Real> const& phi,
+                       amrex::Array4<amrex::Real const> const& rhs,
+                       amrex::Array4<amrex::Real const> const& acf, amrex::Real dx, amrex::Real dy,
                        int system_type)
 {
     int const ilo = box.smallEnd(0);
     int const jlo = box.smallEnd(1);
     int const ihi = box.bigEnd(0);
     int const jhi = box.bigEnd(1);
-    Real facx = Real(1.)/(dx*dx);
-    Real facy = Real(1.)/(dy*dy);
+    amrex::Real facx = amrex::Real(1.)/(dx*dx);
+    amrex::Real facy = amrex::Real(1.)/(dy*dy);
     if (system_type == 1) {
         hpmg::ParallelFor(to2D(valid_domain_box(box)),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
@@ -267,29 +271,30 @@ void compute_residual (Box const& box, Array4<Real> const& res,
 template<bool is_cell_centered = true>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void gs1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-          Array4<Real> const& phi, Real rhs, Real acf, Real facx, Real facy)
+          amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf, amrex::Real facx,
+          amrex::Real facy)
 {
-    Real lap;
-    Real c0 = -(acf+Real(2.)*(facx+facy));
+    amrex::Real lap;
+    amrex::Real c0 = -(acf+amrex::Real(2.)*(facx+facy));
     if (is_cell_centered && i == ilo) {
-        lap = facx * Real(4./3.)*phi(i+1,j,0,n);
-        c0 -= Real(2.)*facx;
+        lap = facx * amrex::Real(4./3.)*phi(i+1,j,0,n);
+        c0 -= amrex::Real(2.)*facx;
     } else if (is_cell_centered && i == ihi) {
-        lap = facx * Real(4./3.)*phi(i-1,j,0,n);
-        c0 -= Real(2.)*facx;
+        lap = facx * amrex::Real(4./3.)*phi(i-1,j,0,n);
+        c0 -= amrex::Real(2.)*facx;
     } else {
         lap = facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n));
     }
     if (is_cell_centered && j == jlo) {
-        lap += facy * Real(4./3.)*phi(i,j+1,0,n);
-        c0 -= Real(2.)*facy;
+        lap += facy * amrex::Real(4./3.)*phi(i,j+1,0,n);
+        c0 -= amrex::Real(2.)*facy;
     } else if (is_cell_centered && j == jhi) {
-        lap += facy * Real(4./3.)*phi(i,j-1,0,n);
-        c0 -= Real(2.)*facy;
+        lap += facy * amrex::Real(4./3.)*phi(i,j-1,0,n);
+        c0 -= amrex::Real(2.)*facy;
     } else {
         lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n));
     }
-    const Real c0_inv = Real(1.) / c0;
+    const amrex::Real c0_inv = amrex::Real(1.) / c0;
     phi(i,j,0,n) = (rhs - lap) * c0_inv;
 }
 
@@ -298,37 +303,37 @@ void gs1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
 template<bool is_cell_centered = true>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void gs2 (int i, int j, int ilo, int jlo, int ihi, int jhi,
-          Array4<Real> const& phi, Real rhs_r, Real rhs_i,
-          Real ar, Real ai, Real facx, Real facy)
+          amrex::Array4<amrex::Real> const& phi, amrex::Real rhs_r, amrex::Real rhs_i,
+          amrex::Real ar, amrex::Real ai, amrex::Real facx, amrex::Real facy)
 {
-    Real lap[2];
-    Real c0 = Real(-2.)*(facx+facy);
+    amrex::Real lap[2];
+    amrex::Real c0 = amrex::Real(-2.)*(facx+facy);
     if (is_cell_centered && i == ilo) {
-        lap[0] = facx * Real(4./3.)*phi(i+1,j,0,0);
-        lap[1] = facx * Real(4./3.)*phi(i+1,j,0,1);
-        c0 -= Real(2.)*facx;
+        lap[0] = facx * amrex::Real(4./3.)*phi(i+1,j,0,0);
+        lap[1] = facx * amrex::Real(4./3.)*phi(i+1,j,0,1);
+        c0 -= amrex::Real(2.)*facx;
     } else if (is_cell_centered && i == ihi) {
-        lap[0] = facx * Real(4./3.)*phi(i-1,j,0,0);
-        lap[1] = facx * Real(4./3.)*phi(i-1,j,0,1);
-        c0 -= Real(2.)*facx;
+        lap[0] = facx * amrex::Real(4./3.)*phi(i-1,j,0,0);
+        lap[1] = facx * amrex::Real(4./3.)*phi(i-1,j,0,1);
+        c0 -= amrex::Real(2.)*facx;
     } else {
         lap[0] = facx * (phi(i-1,j,0,0) + phi(i+1,j,0,0));
         lap[1] = facx * (phi(i-1,j,0,1) + phi(i+1,j,0,1));
     }
     if (is_cell_centered && j == jlo) {
-        lap[0] += facy * Real(4./3.)*phi(i,j+1,0,0);
-        lap[1] += facy * Real(4./3.)*phi(i,j+1,0,1);
-        c0 -= Real(2.)*facy;
+        lap[0] += facy * amrex::Real(4./3.)*phi(i,j+1,0,0);
+        lap[1] += facy * amrex::Real(4./3.)*phi(i,j+1,0,1);
+        c0 -= amrex::Real(2.)*facy;
     } else if (is_cell_centered && j == jhi) {
-        lap[0] += facy * Real(4./3.)*phi(i,j-1,0,0);
-        lap[1] += facy * Real(4./3.)*phi(i,j-1,0,1);
-        c0 -= Real(2.)*facy;
+        lap[0] += facy * amrex::Real(4./3.)*phi(i,j-1,0,0);
+        lap[1] += facy * amrex::Real(4./3.)*phi(i,j-1,0,1);
+        c0 -= amrex::Real(2.)*facy;
     } else {
         lap[0] += facy * (phi(i,j-1,0,0) + phi(i,j+1,0,0));
         lap[1] += facy * (phi(i,j-1,0,1) + phi(i,j+1,0,1));
     }
-    Real c[2] = {c0-ar, -ai};
-    Real cmag = Real(1.)/(c[0]*c[0] + c[1]*c[1]);
+    amrex::Real c[2] = {c0-ar, -ai};
+    amrex::Real cmag = amrex::Real(1.)/(c[0]*c[0] + c[1]*c[1]);
     c[0] *= cmag;
     c[1] *= cmag;
     phi(i,j,0,0) = (rhs_r-lap[0])*c[0] + (rhs_i-lap[1])*c[1];
@@ -340,42 +345,43 @@ void gs2 (int i, int j, int ilo, int jlo, int ihi, int jhi,
 template<bool is_cell_centered = true>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void gs3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-          Array4<Real> const& phi, Real rhs, Real facx, Real facy)
+          amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real facx,
+          amrex::Real facy)
 {
-    Real lap;
-    Real c0 = -Real(2.)*(facx+facy);
+    amrex::Real lap;
+    amrex::Real c0 = -amrex::Real(2.)*(facx+facy);
     if (is_cell_centered && i == ilo) {
-        lap = facx * Real(4./3.)*phi(i+1,j,0,n);
-        c0 -= Real(2.)*facx;
+        lap = facx * amrex::Real(4./3.)*phi(i+1,j,0,n);
+        c0 -= amrex::Real(2.)*facx;
     } else if (is_cell_centered && i == ihi) {
-        lap = facx * Real(4./3.)*phi(i-1,j,0,n);
-        c0 -= Real(2.)*facx;
+        lap = facx * amrex::Real(4./3.)*phi(i-1,j,0,n);
+        c0 -= amrex::Real(2.)*facx;
     } else {
         lap = facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n));
     }
     if (is_cell_centered && j == jlo) {
-        lap += facy * Real(4./3.)*phi(i,j+1,0,n);
-        c0 -= Real(2.)*facy;
+        lap += facy * amrex::Real(4./3.)*phi(i,j+1,0,n);
+        c0 -= amrex::Real(2.)*facy;
     } else if (is_cell_centered && j == jhi) {
-        lap += facy * Real(4./3.)*phi(i,j-1,0,n);
-        c0 -= Real(2.)*facy;
+        lap += facy * amrex::Real(4./3.)*phi(i,j-1,0,n);
+        c0 -= amrex::Real(2.)*facy;
     } else {
         lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n));
     }
-    const Real c0_inv = Real(1.) / c0;
+    const amrex::Real c0_inv = amrex::Real(1.) / c0;
     phi(i,j,0,n) = (rhs - lap) * c0_inv;
 }
 
-void gsrb (int icolor, Box const& box, Array4<Real> const& phi,
-           Array4<Real const> const& rhs, Array4<Real const> const& acf,
-           Real dx, Real dy, int system_type)
+void gsrb (int icolor, amrex::Box const& box, amrex::Array4<amrex::Real> const& phi,
+           amrex::Array4<amrex::Real const> const& rhs, amrex::Array4<amrex::Real const> const& acf,
+           amrex::Real dx, amrex::Real dy, int system_type)
 {
     int const ilo = box.smallEnd(0);
     int const jlo = box.smallEnd(1);
     int const ihi = box.bigEnd(0);
     int const jhi = box.bigEnd(1);
-    Real facx = Real(1.)/(dx*dx);
-    Real facy = Real(1.)/(dy*dy);
+    amrex::Real facx = amrex::Real(1.)/(dx*dx);
+    amrex::Real facy = amrex::Real(1.)/(dy*dy);
     if (system_type == 1) {
         hpmg::ParallelFor(to2D(valid_domain_box(box)),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
@@ -413,9 +419,10 @@ void gsrb (int icolor, Box const& box, Array4<Real> const& phi,
 
 // do multiple gsrb iterations in GPU shared memory with many ghost cells
 template<int system_type, bool zero_init, bool do_compute_residual, bool is_cell_centered>
-void gsrb_shared (Box const& box, Array4<Real> const& phi_out, Array4<Real const> const& rhs,
-                  Array4<Real const> const& acf, Array4<Real> const& res,
-                  Array4<Real const> const& phi_in, Real dx, Real dy)
+void gsrb_shared (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_out,
+                  amrex::Array4<amrex::Real const> const& rhs,
+                  amrex::Array4<amrex::Real const> const& acf, amrex::Array4<amrex::Real> const& res,
+                  amrex::Array4<amrex::Real const> const& phi_in, amrex::Real dx, amrex::Real dy)
 {
     constexpr int num_comps = MultiGrid::get_num_comps(system_type);
     constexpr int num_comps_acf = MultiGrid::get_num_comps_acf(system_type);
@@ -437,11 +444,11 @@ void gsrb_shared (Box const& box, Array4<Real> const& phi_out, Array4<Real const
     int const jlo = box.smallEnd(1);
     int const ihi = box.bigEnd(0);
     int const jhi = box.bigEnd(1);
-    Real facx = Real(1.)/(dx*dx);
-    Real facy = Real(1.)/(dy*dy);
+    amrex::Real facx = amrex::Real(1.)/(dx*dx);
+    amrex::Real facy = amrex::Real(1.)/(dy*dy);
 
     // box for the bounds of the ParallelFor loop this kernel replaces
-    const Box loop_box = valid_domain_box(box);
+    const amrex::Box loop_box = valid_domain_box(box);
     int const ilo_loop = loop_box.smallEnd(0);
     int const jlo_loop = loop_box.smallEnd(1);
     int const ihi_loop = loop_box.bigEnd(0);
@@ -454,7 +461,7 @@ void gsrb_shared (Box const& box, Array4<Real> const& phi_out, Array4<Real const
         [=] AMREX_GPU_DEVICE() noexcept
         {
             // allocate static shared memory
-            __shared__ Real phi_ptr[num_cells_in_tile];
+            __shared__ amrex::Real phi_ptr[num_cells_in_tile];
 
             std::uint64_t remainder = 0;
             const int iblock_y = num_blocks_divmod.divmod(remainder, blockIdx.x);
@@ -466,14 +473,14 @@ void gsrb_shared (Box const& box, Array4<Real> const& phi_out, Array4<Real const
             const int tile_end_x = tile_begin_x + tilesize_array_x;
             const int tile_end_y = tile_begin_y + tilesize_array_y;
 
-            // make Array4 reference shared memory tile
-            Array4<Real> phi_shared(phi_ptr, {tile_begin_x, tile_begin_y, 0},
+            // make amrex::Array4 reference shared memory tile
+            amrex::Array4<amrex::Real> phi_shared(phi_ptr, {tile_begin_x, tile_begin_y, 0},
                                              {tile_end_x, tile_end_y, 1}, num_comps);
 
             if (zero_init) {
                 // initialize shared memory to zero
                 for (int s = threadIdx.x; s < num_cells_in_tile; s+=blockDim.x) {
-                    phi_ptr[s] = Real(0.);
+                    phi_ptr[s] = amrex::Real(0.);
                 }
             } else {
                 // initialize shared memory to phi_in inside the domain, outside zero
@@ -489,7 +496,7 @@ void gsrb_shared (Box const& box, Array4<Real> const& phi_out, Array4<Real const
                         }
                     } else {
                         for (int n=0; n<num_comps; ++n) {
-                            phi_shared(sx, sy, 0, n) = Real(0.);
+                            phi_shared(sx, sy, 0, n) = amrex::Real(0.);
                         }
                     }
                 }
@@ -502,8 +509,8 @@ void gsrb_shared (Box const& box, Array4<Real> const& phi_out, Array4<Real const
             const int i = tile_begin_x + 1 + ithread_x;
             const int j = tile_begin_y + 1 + ithread_y;
 
-            amrex::GpuArray<Real, num_comps> rhs_num[2] = {{}, {}};
-            amrex::GpuArray<Real, num_comps_acf> acf_num[2] = {{}, {}};
+            amrex::GpuArray<amrex::Real, num_comps> rhs_num[2] = {{}, {}};
+            amrex::GpuArray<amrex::Real, num_comps_acf> acf_num[2] = {{}, {}};
 
             for (int nj=0; nj<=1; ++nj) {
                 if (ilo_loop <= i && i <= ihi_loop &&
@@ -595,9 +602,11 @@ void gsrb_shared (Box const& box, Array4<Real> const& phi_out, Array4<Real const
 
 // do multiple gsrb iterations in CPU cached memory with many ghost cells
 template<int system_type, bool zero_init, bool do_compute_residual, bool is_cell_centered>
-void gsrb_cached (Box const& box, Array4<Real> const& phi_out, Array4<Real const> const& rhs,
-                  Array4<Real const> const& acf, Array4<Real> const& res,
-                  Array4<Real const> const& phi_in, Real dx, Real dy)
+void gsrb_cached (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_out,
+                  amrex::Array4<amrex::Real const> const& rhs,
+                  amrex::Array4<amrex::Real const> const& acf,
+                  amrex::Array4<amrex::Real> const& res,
+                  amrex::Array4<amrex::Real const> const& phi_in, amrex::Real dx, amrex::Real dy)
 {
     constexpr int num_comps = MultiGrid::get_num_comps(system_type);
     constexpr int tilesize_x = 64;
@@ -616,11 +625,11 @@ void gsrb_cached (Box const& box, Array4<Real> const& phi_out, Array4<Real const
     int const jlo = box.smallEnd(1);
     int const ihi = box.bigEnd(0);
     int const jhi = box.bigEnd(1);
-    const Real facx = Real(1.)/(dx*dx);
-    const Real facy = Real(1.)/(dy*dy);
+    const amrex::Real facx = amrex::Real(1.)/(dx*dx);
+    const amrex::Real facy = amrex::Real(1.)/(dy*dy);
 
     // box for the bounds of the ParallelFor loop this kernel replaces
-    const Box loop_box = valid_domain_box(box);
+    const amrex::Box loop_box = valid_domain_box(box);
     int const ilo_loop = loop_box.smallEnd(0);
     int const jlo_loop = loop_box.smallEnd(1);
     int const ihi_loop = loop_box.bigEnd(0);
@@ -635,7 +644,7 @@ void gsrb_cached (Box const& box, Array4<Real> const& phi_out, Array4<Real const
         for (int iblock_x=0; iblock_x<num_blocks_x; ++iblock_x) {
 
             // allocate cached memory
-            Real phi_ptr[tilesize_array_x * tilesize_array_y * num_comps];
+            amrex::Real phi_ptr[tilesize_array_x * tilesize_array_y * num_comps];
 
             const int tile_begin_x = iblock_x * final_tilesize_x - edge_offset - 1 + ilo_loop;
             const int tile_begin_y = iblock_y * final_tilesize_y - edge_offset - 1 + jlo_loop;
@@ -643,14 +652,14 @@ void gsrb_cached (Box const& box, Array4<Real> const& phi_out, Array4<Real const
             const int tile_end_x = tile_begin_x + tilesize_array_x;
             const int tile_end_y = tile_begin_y + tilesize_array_y;
 
-            // make Array4 reference cached memory tile
-            Array4<Real> phi_cached(phi_ptr, {tile_begin_x, tile_begin_y, 0},
+            // make amrex::Array4 reference cached memory tile
+            amrex::Array4<amrex::Real> phi_cached(phi_ptr, {tile_begin_x, tile_begin_y, 0},
                                              {tile_end_x, tile_end_y, 1}, num_comps);
 
             if  (zero_init) {
                 // initialize cached memory to zero
                 for (int s = 0; s < tilesize_array_x * tilesize_array_y * num_comps; ++s) {
-                    phi_ptr[s] = Real(0.);
+                    phi_ptr[s] = amrex::Real(0.);
                 }
             } else {
                 // initialize cached memory to phi_in inside the domain, outside zero
@@ -663,7 +672,7 @@ void gsrb_cached (Box const& box, Array4<Real> const& phi_out, Array4<Real const
                             }
                         } else {
                             for (int n=0; n<num_comps; ++n) {
-                                phi_cached(i, j, 0, n) = Real(0.);
+                                phi_cached(i, j, 0, n) = amrex::Real(0.);
                             }
                         }
                     }
@@ -744,13 +753,13 @@ void gsrb_cached (Box const& box, Array4<Real> const& phi_out, Array4<Real const
 #endif
 
 template<bool zero_init, bool do_compute_residual>
-void gsrb_4_residual (int system_type, Box const& box,
-                      Array4<Real> const& phi_out,
-                      Array4<Real const> const& rhs,
-                      Array4<Real const> const& acf,
-                      Array4<Real> const& res,
-                      Array4<Real const> const& phi_in,
-                      Real dx, Real dy)
+void gsrb_4_residual (int system_type, amrex::Box const& box,
+                      amrex::Array4<amrex::Real> const& phi_out,
+                      amrex::Array4<amrex::Real const> const& rhs,
+                      amrex::Array4<amrex::Real const> const& acf,
+                      amrex::Array4<amrex::Real> const& res,
+                      amrex::Array4<amrex::Real const> const& phi_in,
+                      amrex::Real dx, amrex::Real dy)
 {
     // This function performs the main computation for the multigrid solver.
     // For CUDA / HIP it is implemented to use a single GPU kernel with shared memory
@@ -816,9 +825,9 @@ void gsrb_4_residual (int system_type, Box const& box,
 
 #else
     if (zero_init) {
-        Real * pcor_out = phi_out.dataPtr();
+        amrex::Real * pcor_out = phi_out.dataPtr();
         hpmg::ParallelFor(box.numPts()*MultiGrid::get_num_comps(system_type),
-            [=] AMREX_GPU_DEVICE (Long i) noexcept { pcor_out[i] = Real(0.); });
+            [=] AMREX_GPU_DEVICE (Long i) noexcept { pcor_out[i] = amrex::Real(0.); });
     } else {
         const amrex::Box valid_domain = valid_domain_box(box);
         hpmg::ParallelFor(to2D(box), MultiGrid::get_num_comps(system_type),
@@ -826,7 +835,7 @@ void gsrb_4_residual (int system_type, Box const& box,
                 if (valid_domain.contains(i,j,0)) {
                     phi_out(i,j,0,n) = phi_in(i,j,0,n);
                 } else {
-                    phi_out(i,j,0,n) = Real(0.);
+                    phi_out(i,j,0,n) = amrex::Real(0.);
                 }
             });
     }
@@ -854,9 +863,9 @@ void gsrb_4_residual (int system_type, Box const& box,
 #endif
 
 template <int NS, int system_type, typename FGS, typename FRES>
-void bottomsolve_gpu (Real dx0, Real dy0, Array4<Real> const* acf,
-                      Array4<Real> const* res, Array4<Real> const* cor,
-                      Array4<Real> const* rescor, int nlevs, int corner_offset,
+void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4<amrex::Real> const* acf,
+                      amrex::Array4<amrex::Real> const* res, amrex::Array4<amrex::Real> const* cor,
+                      amrex::Array4<amrex::Real> const* rescor, int nlevs, int corner_offset,
                       FGS&& fgs, FRES&& fres)
 {
     // This function performs all the operations of a vcycle in a single GPU kernel by
@@ -867,15 +876,15 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4<Real> const* acf,
 
     static_assert(n_cell_single*n_cell_single <= 1024, "n_cell_single is too big");
 #if defined(AMREX_USE_DPCPP)
-    amrex::launch(1, 1024, Gpu::gpuStream(),
+    amrex::launch(1, 1024, amrex::Gpu::gpuStream(),
     [=] (sycl::nd_item<1> const& item) noexcept
 #else
-    amrex::launch_global<1024><<<1, 1024, 0, Gpu::gpuStream()>>>(
+    amrex::launch_global<1024><<<1, 1024, 0, amrex::Gpu::gpuStream()>>>(
     [=] AMREX_GPU_DEVICE () noexcept
 #endif
     {
-        Real facx = Real(1.)/(dx0*dx0);
-        Real facy = Real(1.)/(dy0*dy0);
+        amrex::Real facx = amrex::Real(1.)/(dx0*dx0);
+        amrex::Real facy = amrex::Real(1.)/(dy0*dy0);
         int lenx = cor[0].end.x - cor[0].begin.x - 2*corner_offset;
         int leny = cor[0].end.y - cor[0].begin.y - 2*corner_offset;
         int ncells = lenx*leny;
@@ -893,10 +902,10 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4<Real> const* acf,
             // set phi to zero
             if (icell < ncells) {
                 if (system_type == 1 || system_type == 2) {
-                    cor[ilev](i,j,0,0) = Real(0.);
-                    cor[ilev](i,j,0,1) = Real(0.);
+                    cor[ilev](i,j,0,0) = amrex::Real(0.);
+                    cor[ilev](i,j,0,1) = amrex::Real(0.);
                 } else {
-                    cor[ilev](i,j,0,0) = Real(0.);
+                    cor[ilev](i,j,0,0) = amrex::Real(0.);
                 }
             }
             HPMG_SYNCTHREADS;
@@ -954,8 +963,8 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4<Real> const* acf,
             }
             HPMG_SYNCTHREADS;
 
-            facx *= Real(0.25);
-            facy *= Real(0.25);
+            facx *= amrex::Real(0.25);
+            facy *= amrex::Real(0.25);
         }
 
         // bottom
@@ -964,10 +973,10 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4<Real> const* acf,
             const int ilev = nlevs-1;
             if (icell < ncells) {
                 if (system_type == 1 || system_type == 2) {
-                    cor[ilev](i,j,0,0) = Real(0.);
-                    cor[ilev](i,j,0,1) = Real(0.);
+                    cor[ilev](i,j,0,0) = amrex::Real(0.);
+                    cor[ilev](i,j,0,1) = amrex::Real(0.);
                 } else {
-                    cor[ilev](i,j,0,0) = Real(0.);
+                    cor[ilev](i,j,0,0) = amrex::Real(0.);
                 }
             }
             HPMG_SYNCTHREADS;
@@ -992,8 +1001,8 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4<Real> const* acf,
             lenx = cor[ilev].end.x - cor[ilev].begin.x - 2*corner_offset;
             leny = cor[ilev].end.y - cor[ilev].begin.y - 2*corner_offset;
             ncells = lenx*leny;
-            facx *= Real(4.);
-            facy *= Real(4.);
+            facx *= amrex::Real(4.);
+            facy *= amrex::Real(4.);
 
             if (icell < ncells) {
                 j = icell /   lenx;
@@ -1042,20 +1051,21 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4<Real> const* acf,
 // Initialization: /////////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type)
-    : m_system_type(a_system_type), m_dx(dx), m_dy(dy)
+MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a_system_type)
+    : m_system_type(a_system_type),
+      m_num_comps(get_num_comps(a_system_type)),
+      m_num_comps_acf(get_num_comps_acf(a_system_type)),
+      m_dx(dx),
+      m_dy(dy)
 {
-    m_num_comps = get_num_comps(m_system_type);
-    m_num_comps_acf = get_num_comps_acf(m_system_type);
-
-    IntVect const a_domain_len = a_domain.length();
+    amrex::IntVect const a_domain_len = a_domain.length();
 
     AMREX_ALWAYS_ASSERT(a_domain_len[2] == 1 && a_domain.cellCentered() &&
                         a_domain_len[0]%2 == a_domain_len[1]%2);
 
-    IndexType const index_type = (a_domain_len[0]%2 == 0) ?
-        IndexType::TheCellType() : IndexType(IntVect(1,1,0));
-    m_domain.push_back(amrex::makeSlab(Box{{0,0,0}, a_domain_len-1, index_type}, 2, 0));
+    amrex::IndexType const index_type = (a_domain_len[0]%2 == 0) ?
+        amrex::IndexType::TheCellType() : amrex::IndexType(amrex::IntVect(1,1,0));
+    m_domain.push_back(amrex::makeSlab(amrex::Box{{0,0,0}, a_domain_len-1, index_type}, 2, 0));
     if (!index_type.cellCentered()) {
         m_domain[0].growHi(0,2).growHi(1,2);
         m_boundary_condition_offset = 1.;
@@ -1064,18 +1074,19 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type)
         m_boundary_condition_offset = 0.5;
         m_boundary_condition_factor = 8./3.;
     }
-    IntVect const min_width = index_type.cellCentered() ? IntVect(2,2,1) : IntVect(4,4,1);
+    amrex::IntVect const min_width = index_type.cellCentered() ?
+        amrex::IntVect(2,2,1) : amrex::IntVect(4,4,1);
     for (int i = 0; i < 30; ++i) {
-        if (m_domain.back().coarsenable(IntVect(2,2,1), min_width)) {
-            m_domain.push_back(amrex::coarsen(m_domain.back(),IntVect(2,2,1)));
+        if (m_domain.back().coarsenable(amrex::IntVect(2,2,1), min_width)) {
+            m_domain.push_back(amrex::coarsen(m_domain.back(),amrex::IntVect(2,2,1)));
         } else {
             break;
         }
     }
-    m_max_level = m_domain.size()-1;
+    m_max_level = static_cast<int>(m_domain.size()-1);
 #if defined(AMREX_USE_GPU)
     auto r = std::find_if(std::begin(m_domain), std::end(m_domain),
-                          [=] (Box const& b) -> bool
+                          [=] (amrex::Box const& b) -> bool
                               { return b.volume() <= n_cell_single*n_cell_single; });
     m_single_block_level_begin = std::distance(std::begin(m_domain), r);
     m_single_block_level_begin = std::max(1, m_single_block_level_begin);
@@ -1087,7 +1098,8 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type)
             << " cannot be coarsened enough times to be solved efficiently.\n"
             << "hpmg: Size of the final MG level: "
             << m_domain[m_max_level].length(0) << " " << m_domain[m_max_level].length(1) << ".\n"
-            << "hpmg: Please consider using a domain size of the form '2^n', '3*2^n', '2^n+1' or '3*n^2+1'.\n";
+            << "hpmg: Please consider using a domain size of the form "
+            << "'2^n', '3*2^n', '2^n+1' or '3*n^2+1'.\n";
     }
 #else
     m_single_block_level_begin = m_max_level;
@@ -1120,7 +1132,7 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type)
         } else {
             m_res.emplace_back(m_domain[ilev], m_num_comps);
             if (!index_type.cellCentered()) {
-                m_res[ilev].template setVal<RunOn::Device>(0);
+                m_res[ilev].template setVal<amrex::RunOn::Device>(0);
             }
             if (ilev >= m_single_block_level_begin) {
                 m_array4.push_back(m_res[ilev].array());
@@ -1133,7 +1145,7 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type)
     for (int ilev = 0; ilev < m_num_mg_levels; ++ilev) {
         m_cor.emplace_back(m_domain[ilev], m_num_comps);
         if (!index_type.cellCentered()) {
-            m_cor[ilev].template setVal<RunOn::Device>(0);
+            m_cor[ilev].template setVal<amrex::RunOn::Device>(0);
         }
         if (ilev >= m_single_block_level_begin) {
             m_array4.push_back(m_cor[ilev].array());
@@ -1145,14 +1157,14 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type)
     for (int ilev = 0; ilev < m_num_mg_levels; ++ilev) {
         m_rescor.emplace_back(m_domain[ilev], m_num_comps);
         if (!index_type.cellCentered()) {
-            m_rescor[ilev].template setVal<RunOn::Device>(0);
+            m_rescor[ilev].template setVal<amrex::RunOn::Device>(0);
         }
         if (ilev >= m_single_block_level_begin) {
             m_array4.push_back(m_rescor[ilev].array());
         }
     }
 
-    // Copy Array4s to GPU for bottomsolve_gpu()
+    // Copy amrex::Array4s to GPU for bottomsolve_gpu()
     if (!m_array4.empty()) {
         m_array4.copyToDeviceAsync();
         m_acf_a = m_array4.data();
@@ -1167,14 +1179,14 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type)
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 void
-MultiGrid::solve1 (FArrayBox& a_sol, FArrayBox const& a_rhs, FArrayBox const& a_acf,
-                   Real const tol_rel, Real const tol_abs, int const nummaxiter,
-                   int const verbose)
+MultiGrid::solve1 (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs,
+                   amrex::FArrayBox const& a_acf, amrex::Real const tol_rel,
+                   amrex::Real const tol_abs, int const nummaxiter, int const verbose)
 {
     HIPACE_PROFILE("hpmg::MultiGrid::solve1()");
     AMREX_ALWAYS_ASSERT(m_system_type == 1);
 
-    FArrayBox afab(center_box(a_acf.box(), m_domain.front()), 1, a_acf.dataPtr());
+    amrex::FArrayBox afab(center_box(a_acf.box(), m_domain.front()), 1, a_acf.dataPtr());
 
     auto const& array_m_acf = m_acf[0].array();
     auto const& array_a_acf = afab.const_array();
@@ -1290,8 +1302,8 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs,
 }
 
 void
-MultiGrid::solve3 (FArrayBox& a_sol, FArrayBox const& a_rhs,
-                   Real const tol_rel, Real const tol_abs, int const nummaxiter,
+MultiGrid::solve3 (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs,
+                   amrex::Real const tol_rel, amrex::Real const tol_abs, int const nummaxiter,
                    int const verbose)
 {
     HIPACE_PROFILE("hpmg::MultiGrid::solve3()");
@@ -1305,14 +1317,16 @@ MultiGrid::solve3 (FArrayBox& a_sol, FArrayBox const& a_rhs,
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 void
-MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs,
-                       Real const tol_rel, Real const tol_abs, int const nummaxiter,
+MultiGrid::solve_doit (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs,
+                       amrex::Real const tol_rel, amrex::Real const tol_abs, int const nummaxiter,
                        int const verbose)
 {
     AMREX_ALWAYS_ASSERT(a_sol.nComp() >= m_num_comps && a_rhs.nComp() >= m_num_comps);
 
-    m_sol = FArrayBox(center_box(a_sol.box(), m_domain.front()), m_num_comps, a_sol.dataPtr());
-    m_rhs = FArrayBox(center_box(a_rhs.box(), m_domain.front()), m_num_comps, a_rhs.dataPtr());
+    m_sol = amrex::FArrayBox(center_box(a_sol.box(), m_domain.front()), m_num_comps,
+                                        a_sol.dataPtr());
+    m_rhs = amrex::FArrayBox(center_box(a_rhs.box(), m_domain.front()), m_num_comps,
+                                        a_rhs.dataPtr());
 
     // sol: initial solution guess
     // rhs: right hand side of differential equation to solve
@@ -1327,10 +1341,10 @@ MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs,
         m_acf[0].const_array(), m_rescor[0].array(), m_sol.array(), m_dx, m_dy);
 
     // Reduction on the right-hand side and residual to get convergence criteria
-    Real resnorm0, rhsnorm0;
+    amrex::Real resnorm0, rhsnorm0;
     {
-        ReduceOps<ReduceOpMax,ReduceOpMax> reduce_op;
-        ReduceData<Real,Real> reduce_data(reduce_op);
+        amrex::ReduceOps<amrex::ReduceOpMax,amrex::ReduceOpMax> reduce_op;
+        amrex::ReduceData<amrex::Real,amrex::Real> reduce_data(reduce_op);
         using ReduceTuple = typename decltype(reduce_data)::Type;
         const auto& array_res = m_rescor[0].const_array();
         const auto& array_rhs = m_rhs.const_array();
@@ -1349,7 +1363,7 @@ MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs,
                        << "hpmg: Initial residual (resid0) = " << resnorm0 << "\n";
     }
 
-    Real max_norm;
+    amrex::Real max_norm;
     std::string norm_name;
     if (rhsnorm0 >= resnorm0) {
         norm_name = "bnorm";
@@ -1358,28 +1372,26 @@ MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs,
         norm_name = "resid0";
         max_norm = resnorm0;
     }
-    const Real res_target = std::max(tol_abs, std::max(tol_rel,Real(1.e-16))*max_norm);
+    const amrex::Real res_target = std::max(tol_abs,std::max(tol_rel,amrex::Real(1.e-16))*max_norm);
 
     if (resnorm0 <= res_target) {
         if (verbose >= 1) {
             amrex::Print() << "hpmg: No iterations needed\n";
         }
     } else {
-        Real norminf = 0.;
+        amrex::Real norminf = 0.;
         bool converged = true;
 
         for (int iter = 0; iter < nummaxiter; ++iter) {
 
-            converged = false;
-
             // do one vcycle iteration with the fist 4 Gauss-Seidel iterations omitted
             // from the beginning and instead done after the vcycle to also get the residual
             vcycle();
 
             // check the residual for convergence
-            Real const* pres0 = m_rescor[0].dataPtr();
-            norminf = Reduce::Max<Real>(m_domain[0].numPts()*m_num_comps,
-                                        [=] AMREX_GPU_DEVICE (Long i) -> Real
+            amrex::Real const* pres0 = m_rescor[0].dataPtr();
+            norminf = amrex::Reduce::Max<amrex::Real>(m_domain[0].numPts()*m_num_comps,
+                                        [=] AMREX_GPU_DEVICE (amrex::Long i) -> amrex::Real
                                         {
                                             return std::abs(pres0[i]);
                                         });
@@ -1399,7 +1411,7 @@ MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs,
             }
 
             AMREX_ALWAYS_ASSERT_WITH_MESSAGE(
-                converged || (norminf <= Real(1.e20)*max_norm),
+                converged || (norminf <= amrex::Real(1.e20)*max_norm),
                 "hpmg: Failing to converge after " + amrex::ToString(iter+1)  + " iterations."
                 " resid, resid/" + norm_name + " = " + amrex::ToString(norminf) + ", " +
                 amrex::ToString(norminf/max_norm) + "\n"
@@ -1434,14 +1446,14 @@ MultiGrid::vcycle ()
     };
 
     if (m_cuda_graph_vcycle.count(key) == 0) {
-        cudaStreamBeginCapture(Gpu::gpuStream(), cudaStreamCaptureModeGlobal);
+        cudaStreamBeginCapture(amrex::Gpu::gpuStream(), cudaStreamCaptureModeGlobal);
 #endif
 
     for (int ilev = 0; ilev < m_single_block_level_begin; ++ilev) {
 
-        Real fac = static_cast<Real>(1 << ilev);
-        Real dx = m_dx * fac;
-        Real dy = m_dy * fac;
+        amrex::Real fac = static_cast<amrex::Real>(1 << ilev);
+        amrex::Real dx = m_dx * fac;
+        amrex::Real dy = m_dy * fac;
 
         if (ilev > 0) {
             // cor and residual on ilev 0 are already calculated before the vcycle is started
@@ -1460,16 +1472,17 @@ MultiGrid::vcycle ()
         // interpolate residual to next level
 
         // res[ilev+1] = R(rescor[ilev])
-        restriction(m_domain[ilev+1], m_res[ilev+1].array(), m_rescor[ilev].const_array(), m_num_comps);
+        restriction(m_domain[ilev+1], m_res[ilev+1].array(), m_rescor[ilev].const_array(),
+                    m_num_comps);
     }
 
     bottomsolve();
 
     for (int ilev = m_single_block_level_begin-1; ilev >= 0; --ilev) {
 
-        Real fac = static_cast<Real>(1 << ilev);
-        Real dx = m_dx * fac;
-        Real dy = m_dy * fac;
+        amrex::Real fac = static_cast<amrex::Real>(1 << ilev);
+        amrex::Real dx = m_dx * fac;
+        amrex::Real dy = m_dy * fac;
 
         // interpolate solution from previous level to phi
 
@@ -1501,11 +1514,11 @@ MultiGrid::vcycle ()
         m_acf[0].const_array(), m_rescor[0].array(), m_sol.array(), m_dx, m_dy);
 
 #if defined(AMREX_USE_CUDA)
-        cudaStreamEndCapture(Gpu::gpuStream(), &m_cuda_graph_vcycle[key].first);
+        cudaStreamEndCapture(amrex::Gpu::gpuStream(), &m_cuda_graph_vcycle[key].first);
         cudaGraphInstantiate(&m_cuda_graph_vcycle[key].second,
                             m_cuda_graph_vcycle[key].first, NULL, NULL, 0);
     }
-    cudaGraphLaunch(m_cuda_graph_vcycle[key].second, Gpu::gpuStream());
+    cudaGraphLaunch(m_cuda_graph_vcycle[key].second, amrex::Gpu::gpuStream());
 #endif
 }
 
@@ -1513,74 +1526,95 @@ void
 MultiGrid::bottomsolve ()
 {
     constexpr int nsweeps = 16;
-    Real fac = static_cast<Real>(1 << m_single_block_level_begin);
-    Real dx0 = m_dx * fac;
-    Real dy0 = m_dy * fac;
+    amrex::Real fac = static_cast<amrex::Real>(1 << m_single_block_level_begin);
+    amrex::Real dx0 = m_dx * fac;
+    amrex::Real dy0 = m_dy * fac;
 #if defined(AMREX_USE_GPU)
     if (m_use_single_block_kernel) {
         int nlevs = m_num_single_block_levels;
         int const corner_offset = m_domain[0].cellCentered() ? 0 : 1;
 
         if (m_system_type == 1) {
-            bottomsolve_gpu<nsweeps,1>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, corner_offset,
+            bottomsolve_gpu<nsweeps,1>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs,
+                                       corner_offset,
                 [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                                      Array4<Real> const& phi, Array4<Real> const& rhs,
-                                      Array4<Real> const& acf, Real facx, Real facy)
+                                     amrex::Array4<amrex::Real> const& phi,
+                                     amrex::Array4<amrex::Real> const& rhs,
+                                     amrex::Array4<amrex::Real> const& acf,
+                                     amrex::Real facx, amrex::Real facy)
                 {
-                    Real a = acf(i,j,0);
+                    amrex::Real a = acf(i,j,0);
                     gs1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a, facx, facy);
                     gs1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a, facx, facy);
                 },
-                [] AMREX_GPU_DEVICE (int i, int j, Array4<Real> const& res,
-                                      int ilo, int jlo, int ihi, int jhi,
-                                      Array4<Real> const& phi, Array4<Real> const& rhs,
-                                      Array4<Real> const& acf, Real facx, Real facy)
+                [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4<amrex::Real> const& res,
+                                     int ilo, int jlo, int ihi, int jhi,
+                                     amrex::Array4<amrex::Real> const& phi,
+                                     amrex::Array4<amrex::Real> const& rhs,
+                                     amrex::Array4<amrex::Real> const& acf,
+                                     amrex::Real facx, amrex::Real facy)
                 {
-                    Real a = acf(i,j,0);
-                    res(i,j,0,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a, facx, facy);
-                    res(i,j,0,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a, facx, facy);
+                    amrex::Real a = acf(i,j,0);
+                    res(i,j,0,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a,
+                                             facx, facy);
+                    res(i,j,0,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a,
+                                             facx, facy);
                 });
         } else if (m_system_type == 2) {
-            bottomsolve_gpu<nsweeps,2>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, corner_offset,
+            bottomsolve_gpu<nsweeps,2>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs,
+                                       corner_offset,
                 [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                                      Array4<Real> const& phi, Array4<Real> const& rhs,
-                                      Array4<Real> const& acf, Real facx, Real facy)
+                                     amrex::Array4<amrex::Real> const& phi,
+                                     amrex::Array4<amrex::Real> const& rhs,
+                                     amrex::Array4<amrex::Real> const& acf,
+                                     amrex::Real facx, amrex::Real facy)
                 {
-                    Real ar = acf(i,j,0,0);
-                    Real ai = acf(i,j,0,1);
-                    gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), rhs(i,j,0,1), ar, ai, facx, facy);
+                    amrex::Real ar = acf(i,j,0,0);
+                    amrex::Real ai = acf(i,j,0,1);
+                    gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), rhs(i,j,0,1), ar, ai,
+                        facx, facy);
                 },
-                [] AMREX_GPU_DEVICE (int i, int j, Array4<Real> const& res,
-                                      int ilo, int jlo, int ihi, int jhi,
-                                      Array4<Real> const& phi, Array4<Real> const& rhs,
-                                      Array4<Real> const& acf, Real facx, Real facy)
+                [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4<amrex::Real> const& res,
+                                     int ilo, int jlo, int ihi, int jhi,
+                                     amrex::Array4<amrex::Real> const& phi,
+                                     amrex::Array4<amrex::Real> const& rhs,
+                                     amrex::Array4<amrex::Real> const& acf,
+                                     amrex::Real facx, amrex::Real facy)
                 {
-                    Real ar = acf(i,j,0,0);
-                    Real ai = acf(i,j,0,1);
-                    res(i,j,0,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), ar, ai, facx, facy);
-                    res(i,j,0,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), ar, ai, facx, facy);
+                    amrex::Real ar = acf(i,j,0,0);
+                    amrex::Real ai = acf(i,j,0,1);
+                    res(i,j,0,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), ar, ai,
+                                              facx, facy);
+                    res(i,j,0,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), ar, ai,
+                                              facx, facy);
                 });
         } else {
-            bottomsolve_gpu<nsweeps,3>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, corner_offset,
+            bottomsolve_gpu<nsweeps,3>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs,
+                                       corner_offset,
                 [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                                      Array4<Real> const& phi, Array4<Real> const& rhs,
-                                      Array4<Real> const&, Real facx, Real facy)
+                                     amrex::Array4<amrex::Real> const& phi,
+                                     amrex::Array4<amrex::Real> const& rhs,
+                                     amrex::Array4<amrex::Real> const&, amrex::Real facx,
+                                     amrex::Real facy)
                 {
                     gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy);
                 },
-                [] AMREX_GPU_DEVICE (int i, int j, Array4<Real> const& res,
-                                      int ilo, int jlo, int ihi, int jhi,
-                                      Array4<Real> const& phi, Array4<Real> const& rhs,
-                                      Array4<Real> const&, Real facx, Real facy)
+                [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4<amrex::Real> const& res,
+                                     int ilo, int jlo, int ihi, int jhi,
+                                     amrex::Array4<amrex::Real> const& phi,
+                                     amrex::Array4<amrex::Real> const& rhs,
+                                     amrex::Array4<amrex::Real> const&, amrex::Real facx,
+                                     amrex::Real facy)
                 {
-                    res(i,j,0,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy);
+                    res(i,j,0,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0),
+                                             facx, facy);
                 });
         }
     } else
 #endif
     {
         const int ilev = m_single_block_level_begin;
-        m_cor[ilev].setVal<amrex::RunOn::Device>(Real(0.));
+        m_cor[ilev].setVal<amrex::RunOn::Device>(amrex::Real(0.));
         // Use numsweeps equal to the box length rounded up to an even number for large boxes
         int numsweeps = std::max(nsweeps, (m_cor[ilev].box().length().max() + 1) / 2 * 2);
         for (int is = 0; is < numsweeps; ++is) {
@@ -1598,13 +1632,13 @@ MultiGrid::bottomsolve ()
 #if defined(AMREX_USE_GPU)
 namespace {
     template <typename F>
-    void avgdown_acf (Array4<Real> const* acf, int ncomp, int nlevels, F&& f)
+    void avgdown_acf (amrex::Array4<amrex::Real> const* acf, int ncomp, int nlevels, F&& f)
     {
 #if defined(AMREX_USE_DPCPP)
-        amrex::launch(1, 1024, Gpu::gpuStream(),
+        amrex::launch(1, 1024, amrex::Gpu::gpuStream(),
         [=] (sycl::nd_item<1> const& item) noexcept
 #else
-        amrex::launch_global<1024><<<1, 1024, 0, Gpu::gpuStream()>>>(
+        amrex::launch_global<1024><<<1, 1024, 0, amrex::Gpu::gpuStream()>>>(
         [=] AMREX_GPU_DEVICE () noexcept
 #endif
         {
@@ -1640,7 +1674,7 @@ MultiGrid::average_down_acoef ()
 {
 #if defined(AMREX_USE_CUDA)
     if (!m_cuda_graph_acf_created) {
-    cudaStreamBeginCapture(Gpu::gpuStream(), cudaStreamCaptureModeGlobal);
+    cudaStreamBeginCapture(amrex::Gpu::gpuStream(), cudaStreamCaptureModeGlobal);
 #endif
 
     for (int ilev = 1; ilev <= m_single_block_level_begin; ++ilev) {
@@ -1665,21 +1699,23 @@ MultiGrid::average_down_acoef ()
     if (m_num_single_block_levels > 1) {
         if (m_domain[0].cellCentered()) {
             avgdown_acf(m_acf_a, m_num_comps_acf, m_num_single_block_levels,
-                        [] AMREX_GPU_DEVICE (int i, int j, int n, Array4<Real> const& crse,
-                                             Array4<Real> const& fine) noexcept
+                        [] AMREX_GPU_DEVICE (int i, int j, int n,
+                                             amrex::Array4<amrex::Real> const& crse,
+                                             amrex::Array4<amrex::Real> const& fine) noexcept
                         {
                             restrict_cc(i,j,n,crse,fine);
                         });
         } else {
             avgdown_acf(m_acf_a, m_num_comps_acf, m_num_single_block_levels,
-                        [] AMREX_GPU_DEVICE (int i, int j, int n, Array4<Real> const& crse,
-                                             Array4<Real> const& fine) noexcept
+                        [] AMREX_GPU_DEVICE (int i, int j, int n,
+                                             amrex::Array4<amrex::Real> const& crse,
+                                             amrex::Array4<amrex::Real> const& fine) noexcept
                         {
                             if (i == crse.begin.x ||
                                 j == crse.begin.y ||
                                 i == crse.end.x-1 ||
                                 j == crse.end.y-1) {
-                                crse(i,j,0,n) = Real(0.);
+                                crse(i,j,0,n) = amrex::Real(0.);
                             } else {
                                 restrict_nd(i,j,n,crse,fine);
                             }
@@ -1689,11 +1725,11 @@ MultiGrid::average_down_acoef ()
 #endif
 
 #if defined(AMREX_USE_CUDA)
-    cudaStreamEndCapture(Gpu::gpuStream(), &m_cuda_graph_acf);
+    cudaStreamEndCapture(amrex::Gpu::gpuStream(), &m_cuda_graph_acf);
     cudaGraphInstantiate(&m_cuda_graph_exe_acf, m_cuda_graph_acf, NULL, NULL, 0);
     m_cuda_graph_acf_created = true;
     }
-    cudaGraphLaunch(m_cuda_graph_exe_acf, Gpu::gpuStream());
+    cudaGraphLaunch(m_cuda_graph_exe_acf, amrex::Gpu::gpuStream());
 #endif
 }
 

From ed521dfc3c0d7c967e9dea0f3fd613638e272812 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Mon, 12 Jan 2026 17:03:38 +0100
Subject: [PATCH 2/5] Use Array3 in HPMG solver

---
 src/mg_solver/HpMultiGrid.H   |  12 +-
 src/mg_solver/HpMultiGrid.cpp | 700 ++++++++++++++++++----------------
 2 files changed, 368 insertions(+), 344 deletions(-)

diff --git a/src/mg_solver/HpMultiGrid.H b/src/mg_solver/HpMultiGrid.H
index f8c0ca9d39..a387c0cf8e 100644
--- a/src/mg_solver/HpMultiGrid.H
+++ b/src/mg_solver/HpMultiGrid.H
@@ -257,13 +257,15 @@ private:
     amrex::Vector<amrex::FArrayBox> m_rescor;
 
     /** Device pointer to Array4s used by the single-block kernel at the bottom */
-    amrex::Array4<amrex::Real> const* m_acf_a = nullptr;
-    amrex::Array4<amrex::Real> const* m_res_a = nullptr;
-    amrex::Array4<amrex::Real> const* m_cor_a = nullptr;
-    amrex::Array4<amrex::Real> const* m_rescor_a = nullptr;
+    Array3<amrex::Real> const* m_acf_a = nullptr;
+    Array3<amrex::Real> const* m_res_a = nullptr;
+    Array3<amrex::Real> const* m_cor_a = nullptr;
+    Array3<amrex::Real> const* m_rescor_a = nullptr;
 
     /** Device vector of Array4s used by the single-block kernel at the bottom */
-    amrex::Gpu::Buffer<amrex::Array4<amrex::Real> > m_array4;
+    amrex::Gpu::Buffer<Array3<amrex::Real>> m_array3;
+    /** Device vector of Boxes used by the single-block kernel at the bottom */
+    amrex::Gpu::Buffer<amrex::BoxND<2>> m_lev_domain;
 
 #if defined(AMREX_USE_CUDA)
     /** CUDA graphs for average-down */
diff --git a/src/mg_solver/HpMultiGrid.cpp b/src/mg_solver/HpMultiGrid.cpp
index a8bca6e1d1..aeea4c59ed 100644
--- a/src/mg_solver/HpMultiGrid.cpp
+++ b/src/mg_solver/HpMultiGrid.cpp
@@ -28,100 +28,100 @@ amrex::Box valid_domain_box (amrex::Box const& domain)
 
 template <typename T, typename U>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void restrict_cc (int i, int j, int n, amrex::Array4<T> const& crse, amrex::Array4<U> const& fine)
+void restrict_cc (int i, int j, int n, Array3<T> const& crse, Array3<U> const& fine)
 {
-    crse(i,j,0,n) = amrex::Real(0.25)*(fine(2*i  ,2*j  ,0,n) +
-                                fine(2*i+1,2*j  ,0,n) +
-                                fine(2*i  ,2*j+1,0,n) +
-                                fine(2*i+1,2*j+1,0,n));
+    crse(i,j,n) = amrex::Real(0.25)*(fine(2*i  ,2*j  ,n) +
+                                     fine(2*i+1,2*j  ,n) +
+                                     fine(2*i  ,2*j+1,n) +
+                                     fine(2*i+1,2*j+1,n));
 }
 
 template <typename T, typename U>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void restrict_nd (int i, int j, int n, amrex::Array4<T> const& crse, amrex::Array4<U> const& fine)
+void restrict_nd (int i, int j, int n, Array3<T> const& crse, Array3<U> const& fine)
 {
-    crse(i,j,0,n) = amrex::Real(1./16.) * (fine(2*i-1,2*j-1,0,n) +
-                           amrex::Real(2.)*fine(2*i  ,2*j-1,0,n) +
-                                    fine(2*i+1,2*j-1,0,n) +
-                           amrex::Real(2.)*fine(2*i-1,2*j  ,0,n) +
-                           amrex::Real(4.)*fine(2*i  ,2*j  ,0,n) +
-                           amrex::Real(2.)*fine(2*i+1,2*j  ,0,n) +
-                                    fine(2*i-1,2*j+1,0,n) +
-                           amrex::Real(2.)*fine(2*i  ,2*j+1,0,n) +
-                                    fine(2*i+1,2*j+1,0,n));
+    crse(i,j,n) = amrex::Real(1./16.) * (fine(2*i-1,2*j-1,n) +
+                         amrex::Real(2.)*fine(2*i  ,2*j-1,n) +
+                                         fine(2*i+1,2*j-1,n) +
+                         amrex::Real(2.)*fine(2*i-1,2*j  ,n) +
+                         amrex::Real(4.)*fine(2*i  ,2*j  ,n) +
+                         amrex::Real(2.)*fine(2*i+1,2*j  ,n) +
+                                         fine(2*i-1,2*j+1,n) +
+                         amrex::Real(2.)*fine(2*i  ,2*j+1,n) +
+                                         fine(2*i+1,2*j+1,n));
 }
 
 template <typename T, typename U>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void interpadd_cc (int i, int j, int n, amrex::Array4<T> const& fine, amrex::Array4<U> const& crse)
+void interpadd_cc (int i, int j, int n, Array3<T> const& fine, Array3<U> const& crse)
 {
     int ic = amrex::coarsen(i,2);
     int jc = amrex::coarsen(j,2);
-    fine(i,j,0,n) += crse(ic,jc,0,n);
+    fine(i,j,n) += crse(ic,jc,n);
 }
 
 template <typename T, typename U>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void interpadd_nd (int i, int j, int n, amrex::Array4<T> const& fine, amrex::Array4<U> const& crse)
+void interpadd_nd (int i, int j, int n, Array3<T> const& fine, Array3<U> const& crse)
 {
     int ic = amrex::coarsen(i,2);
     int jc = amrex::coarsen(j,2);
     bool i_is_odd = (ic*2 != i);
     bool j_is_odd = (jc*2 != j);
     if (i_is_odd && j_is_odd) {
-        fine(i,j,0,n) += (crse(ic  ,jc  ,0,n) +
-                          crse(ic+1,jc  ,0,n) +
-                          crse(ic  ,jc+1,0,n) +
-                          crse(ic+1,jc+1,0,n))*amrex::Real(0.25);
+        fine(i,j,n) += (crse(ic  ,jc  ,n) +
+                        crse(ic+1,jc  ,n) +
+                        crse(ic  ,jc+1,n) +
+                        crse(ic+1,jc+1,n))*amrex::Real(0.25);
     } else if (i_is_odd) {
-        fine(i,j,0,n) += (crse(ic  ,jc,0,n) +
-                          crse(ic+1,jc,0,n))*amrex::Real(0.5);
+        fine(i,j,n) += (crse(ic  ,jc,n) +
+                        crse(ic+1,jc,n))*amrex::Real(0.5);
     } else if (j_is_odd) {
-        fine(i,j,0,n) += (crse(ic,jc  ,0,n) +
-                          crse(ic,jc+1,0,n))*amrex::Real(0.5);
+        fine(i,j,n) += (crse(ic,jc  ,n) +
+                        crse(ic,jc+1,n))*amrex::Real(0.5);
     } else {
-        fine(i,j,0,n) += crse(ic,jc,0,n);
+        fine(i,j,n) += crse(ic,jc,n);
     }
 }
 
 // out of place version used before shared memory gsrb
 template <typename T, typename U, typename V>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void interpcpy_cc (int i, int j, int n, amrex::Array4<T> const& fine_in,
-                   amrex::Array4<U> const& crse, amrex::Array4<V> const& fine_out)
+void interpcpy_cc (int i, int j, int n, Array3<T> const& fine_in,
+                   Array3<U> const& crse, Array3<V> const& fine_out)
 {
     int ic = amrex::coarsen(i,2);
     int jc = amrex::coarsen(j,2);
-    fine_out(i,j,0,n) = fine_in(i,j,0,n) + crse(ic,jc,0,n);
+    fine_out(i,j,n) = fine_in(i,j,n) + crse(ic,jc,n);
 }
 
 template <typename T, typename U, typename V>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-void interpcpy_nd (int i, int j, int n, amrex::Array4<T> const& fine_in,
-                   amrex::Array4<U> const& crse, amrex::Array4<V> const& fine_out)
+void interpcpy_nd (int i, int j, int n, Array3<T> const& fine_in,
+                   Array3<U> const& crse, Array3<V> const& fine_out)
 {
     int ic = amrex::coarsen(i,2);
     int jc = amrex::coarsen(j,2);
     bool i_is_odd = (ic*2 != i);
     bool j_is_odd = (jc*2 != j);
     if (i_is_odd && j_is_odd) {
-        fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic  ,jc  ,0,n) +
-                                                crse(ic+1,jc  ,0,n) +
-                                                crse(ic  ,jc+1,0,n) +
-                                                crse(ic+1,jc+1,0,n))*amrex::Real(0.25);
+        fine_out(i,j,n) = fine_in(i,j,n) + (crse(ic  ,jc  ,n) +
+                                            crse(ic+1,jc  ,n) +
+                                            crse(ic  ,jc+1,n) +
+                                            crse(ic+1,jc+1,n))*amrex::Real(0.25);
     } else if (i_is_odd) {
-        fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic  ,jc,0,n) +
-                                                crse(ic+1,jc,0,n))*amrex::Real(0.5);
+        fine_out(i,j,n) = fine_in(i,j,n) + (crse(ic  ,jc,n) +
+                                            crse(ic+1,jc,n))*amrex::Real(0.5);
     } else if (j_is_odd) {
-        fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic,jc  ,0,n) +
-                                                crse(ic,jc+1,0,n))*amrex::Real(0.5);
+        fine_out(i,j,n) = fine_in(i,j,n) + (crse(ic,jc  ,n) +
+                                            crse(ic,jc+1,n))*amrex::Real(0.5);
     } else {
-        fine_out(i,j,0,n) = fine_in(i,j,0,n) +  crse(ic,jc,0,n);
+        fine_out(i,j,n) = fine_in(i,j,n) +  crse(ic,jc,n);
     }
 }
 
-void restriction (amrex::Box const& box, amrex::Array4<amrex::Real> const& crse,
-                  amrex::Array4<amrex::Real const> const& fine, int num_comps)
+void restriction (amrex::Box const& box, Array3<amrex::Real> const& crse,
+                  Array3<amrex::Real const> const& fine, int num_comps)
 {
     if (box.cellCentered()) {
         hpmg::ParallelFor(to2D(box), num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept
@@ -138,9 +138,9 @@ void restriction (amrex::Box const& box, amrex::Array4<amrex::Real> const& crse,
 }
 
 void interpolation_outofplace (amrex::Box const& box,
-                               amrex::Array4<amrex::Real const> const& fine_in,
-                               amrex::Array4<amrex::Real const> const& crse,
-                               amrex::Array4<amrex::Real> const& fine_out, int num_comps)
+                               Array3<amrex::Real const> const& fine_in,
+                               Array3<amrex::Real const> const& crse,
+                               Array3<amrex::Real> const& fine_out, int num_comps)
 {
     if (box.cellCentered()) {
         hpmg::ParallelFor(to2D(box), num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept
@@ -162,58 +162,58 @@ void interpolation_outofplace (amrex::Box const& box,
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 amrex::Real laplacian (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-                amrex::Array4<amrex::Real> const& phi, amrex::Real facx, amrex::Real facy)
+                Array3<amrex::Real> const& phi, amrex::Real facx, amrex::Real facy)
 {
-    amrex::Real lap = amrex::Real(-2.)*(facx+facy)*phi(i,j,0,n);
+    amrex::Real lap = amrex::Real(-2.)*(facx+facy)*phi(i,j,n);
     if (i == ilo) {
-        lap += facx * (amrex::Real(4./3.)*phi(i+1,j,0,n) - amrex::Real(2.)*phi(i,j,0,n));
+        lap += facx * (amrex::Real(4./3.)*phi(i+1,j,n) - amrex::Real(2.)*phi(i,j,n));
     } else if (i == ihi) {
-        lap += facx * (amrex::Real(4./3.)*phi(i-1,j,0,n) - amrex::Real(2.)*phi(i,j,0,n));
+        lap += facx * (amrex::Real(4./3.)*phi(i-1,j,n) - amrex::Real(2.)*phi(i,j,n));
     } else {
-        lap += facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n));
+        lap += facx * (phi(i-1,j,n) + phi(i+1,j,n));
     }
     if (j == jlo) {
-        lap += facy * (amrex::Real(4./3.)*phi(i,j+1,0,n) - amrex::Real(2.)*phi(i,j,0,n));
+        lap += facy * (amrex::Real(4./3.)*phi(i,j+1,n) - amrex::Real(2.)*phi(i,j,n));
     } else if (j == jhi) {
-        lap += facy * (amrex::Real(4./3.)*phi(i,j-1,0,n) - amrex::Real(2.)*phi(i,j,0,n));
+        lap += facy * (amrex::Real(4./3.)*phi(i,j-1,n) - amrex::Real(2.)*phi(i,j,n));
     } else {
-        lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n));
+        lap += facy * (phi(i,j-1,n) + phi(i,j+1,n));
     }
     return lap;
 }
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 amrex::Real residual1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-                       amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf,
+                       Array3<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf,
                        amrex::Real facx, amrex::Real facy)
 {
     amrex::Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy);
-    return rhs + acf*phi(i,j,0,n) - lap;
+    return rhs + acf*phi(i,j,n) - lap;
 }
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 amrex::Real residual2r (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                        amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf_r,
+                        Array3<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf_r,
                         amrex::Real acf_i,
                         amrex::Real facx, amrex::Real facy)
 {
     amrex::Real lap = laplacian(i,j,0,ilo,jlo,ihi,jhi,phi,facx,facy);
-    return rhs + acf_r*phi(i,j,0,0) - acf_i*phi(i,j,0,1) - lap;
+    return rhs + acf_r*phi(i,j,0) - acf_i*phi(i,j,1) - lap;
 }
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 amrex::Real residual2i (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                        amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf_r,
+                        Array3<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf_r,
                         amrex::Real acf_i,
                         amrex::Real facx, amrex::Real facy)
 {
     amrex::Real lap = laplacian(i,j,1,ilo,jlo,ihi,jhi,phi,facx,facy);
-    return rhs + acf_i*phi(i,j,0,0) + acf_r*phi(i,j,0,1) - lap;
+    return rhs + acf_i*phi(i,j,0) + acf_r*phi(i,j,1) - lap;
 }
 
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 amrex::Real residual3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-                       amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real facx,
+                       Array3<amrex::Real> const& phi, amrex::Real rhs, amrex::Real facx,
                        amrex::Real facy)
 {
     amrex::Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy);
@@ -222,10 +222,10 @@ amrex::Real residual3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
 
 #if !defined(AMREX_USE_CUDA) && !defined(AMREX_USE_HIP) && defined(AMREX_USE_GPU)
 // res = rhs - L(phi)
-void compute_residual (amrex::Box const& box, amrex::Array4<amrex::Real> const& res,
-                       amrex::Array4<amrex::Real> const& phi,
-                       amrex::Array4<amrex::Real const> const& rhs,
-                       amrex::Array4<amrex::Real const> const& acf, amrex::Real dx, amrex::Real dy,
+void compute_residual (amrex::Box const& box, Array3<amrex::Real> const& res,
+                       Array3<amrex::Real> const& phi,
+                       Array3<amrex::Real const> const& rhs,
+                       Array3<amrex::Real const> const& acf, amrex::Real dx, amrex::Real dy,
                        int system_type)
 {
     int const ilo = box.smallEnd(0);
@@ -238,25 +238,25 @@ void compute_residual (amrex::Box const& box, amrex::Array4<amrex::Real> const&
         hpmg::ParallelFor(to2D(valid_domain_box(box)),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
-            res(i,j,0,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0),
-                                     acf(i,j,0), facx, facy);
-            res(i,j,0,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1),
-                                     acf(i,j,0), facx, facy);
+            res(i,j,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0),
+                                   acf(i,j,0), facx, facy);
+            res(i,j,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,1),
+                                   acf(i,j,0), facx, facy);
         });
     } else if (system_type == 2) {
         hpmg::ParallelFor(to2D(valid_domain_box(box)),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
-            res(i,j,0,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0),
-                                      acf(i,j,0,0), acf(i,j,0,1), facx, facy);
-            res(i,j,0,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1),
-                                      acf(i,j,0,0), acf(i,j,0,1), facx, facy);
+            res(i,j,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0),
+                                    acf(i,j,0), acf(i,j,1), facx, facy);
+            res(i,j,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,1),
+                                    acf(i,j,0), acf(i,j,1), facx, facy);
         });
     } else {
         hpmg::ParallelFor(to2D(valid_domain_box(box)),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
-            res(i,j,0,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy);
+            res(i,j,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), facx, facy);
         });
     }
 }
@@ -271,31 +271,31 @@ void compute_residual (amrex::Box const& box, amrex::Array4<amrex::Real> const&
 template<bool is_cell_centered = true>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void gs1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-          amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf, amrex::Real facx,
+          Array3<amrex::Real> const& phi, amrex::Real rhs, amrex::Real acf, amrex::Real facx,
           amrex::Real facy)
 {
     amrex::Real lap;
     amrex::Real c0 = -(acf+amrex::Real(2.)*(facx+facy));
     if (is_cell_centered && i == ilo) {
-        lap = facx * amrex::Real(4./3.)*phi(i+1,j,0,n);
+        lap = facx * amrex::Real(4./3.)*phi(i+1,j,n);
         c0 -= amrex::Real(2.)*facx;
     } else if (is_cell_centered && i == ihi) {
-        lap = facx * amrex::Real(4./3.)*phi(i-1,j,0,n);
+        lap = facx * amrex::Real(4./3.)*phi(i-1,j,n);
         c0 -= amrex::Real(2.)*facx;
     } else {
-        lap = facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n));
+        lap = facx * (phi(i-1,j,n) + phi(i+1,j,n));
     }
     if (is_cell_centered && j == jlo) {
-        lap += facy * amrex::Real(4./3.)*phi(i,j+1,0,n);
+        lap += facy * amrex::Real(4./3.)*phi(i,j+1,n);
         c0 -= amrex::Real(2.)*facy;
     } else if (is_cell_centered && j == jhi) {
-        lap += facy * amrex::Real(4./3.)*phi(i,j-1,0,n);
+        lap += facy * amrex::Real(4./3.)*phi(i,j-1,n);
         c0 -= amrex::Real(2.)*facy;
     } else {
-        lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n));
+        lap += facy * (phi(i,j-1,n) + phi(i,j+1,n));
     }
     const amrex::Real c0_inv = amrex::Real(1.) / c0;
-    phi(i,j,0,n) = (rhs - lap) * c0_inv;
+    phi(i,j,n) = (rhs - lap) * c0_inv;
 }
 
 // is_cell_centered = true: supports both cell centered and node centered solves
@@ -303,41 +303,41 @@ void gs1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
 template<bool is_cell_centered = true>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void gs2 (int i, int j, int ilo, int jlo, int ihi, int jhi,
-          amrex::Array4<amrex::Real> const& phi, amrex::Real rhs_r, amrex::Real rhs_i,
+          Array3<amrex::Real> const& phi, amrex::Real rhs_r, amrex::Real rhs_i,
           amrex::Real ar, amrex::Real ai, amrex::Real facx, amrex::Real facy)
 {
     amrex::Real lap[2];
     amrex::Real c0 = amrex::Real(-2.)*(facx+facy);
     if (is_cell_centered && i == ilo) {
-        lap[0] = facx * amrex::Real(4./3.)*phi(i+1,j,0,0);
-        lap[1] = facx * amrex::Real(4./3.)*phi(i+1,j,0,1);
+        lap[0] = facx * amrex::Real(4./3.)*phi(i+1,j,0);
+        lap[1] = facx * amrex::Real(4./3.)*phi(i+1,j,1);
         c0 -= amrex::Real(2.)*facx;
     } else if (is_cell_centered && i == ihi) {
-        lap[0] = facx * amrex::Real(4./3.)*phi(i-1,j,0,0);
-        lap[1] = facx * amrex::Real(4./3.)*phi(i-1,j,0,1);
+        lap[0] = facx * amrex::Real(4./3.)*phi(i-1,j,0);
+        lap[1] = facx * amrex::Real(4./3.)*phi(i-1,j,1);
         c0 -= amrex::Real(2.)*facx;
     } else {
-        lap[0] = facx * (phi(i-1,j,0,0) + phi(i+1,j,0,0));
-        lap[1] = facx * (phi(i-1,j,0,1) + phi(i+1,j,0,1));
+        lap[0] = facx * (phi(i-1,j,0) + phi(i+1,j,0));
+        lap[1] = facx * (phi(i-1,j,1) + phi(i+1,j,1));
     }
     if (is_cell_centered && j == jlo) {
-        lap[0] += facy * amrex::Real(4./3.)*phi(i,j+1,0,0);
-        lap[1] += facy * amrex::Real(4./3.)*phi(i,j+1,0,1);
+        lap[0] += facy * amrex::Real(4./3.)*phi(i,j+1,0);
+        lap[1] += facy * amrex::Real(4./3.)*phi(i,j+1,1);
         c0 -= amrex::Real(2.)*facy;
     } else if (is_cell_centered && j == jhi) {
-        lap[0] += facy * amrex::Real(4./3.)*phi(i,j-1,0,0);
-        lap[1] += facy * amrex::Real(4./3.)*phi(i,j-1,0,1);
+        lap[0] += facy * amrex::Real(4./3.)*phi(i,j-1,0);
+        lap[1] += facy * amrex::Real(4./3.)*phi(i,j-1,1);
         c0 -= amrex::Real(2.)*facy;
     } else {
-        lap[0] += facy * (phi(i,j-1,0,0) + phi(i,j+1,0,0));
-        lap[1] += facy * (phi(i,j-1,0,1) + phi(i,j+1,0,1));
+        lap[0] += facy * (phi(i,j-1,0) + phi(i,j+1,0));
+        lap[1] += facy * (phi(i,j-1,1) + phi(i,j+1,1));
     }
     amrex::Real c[2] = {c0-ar, -ai};
     amrex::Real cmag = amrex::Real(1.)/(c[0]*c[0] + c[1]*c[1]);
     c[0] *= cmag;
     c[1] *= cmag;
-    phi(i,j,0,0) = (rhs_r-lap[0])*c[0] + (rhs_i-lap[1])*c[1];
-    phi(i,j,0,1) = (rhs_i-lap[1])*c[0] - (rhs_r-lap[0])*c[1];
+    phi(i,j,0) = (rhs_r-lap[0])*c[0] + (rhs_i-lap[1])*c[1];
+    phi(i,j,1) = (rhs_i-lap[1])*c[0] - (rhs_r-lap[0])*c[1];
 }
 
 // is_cell_centered = true: supports both cell centered and node centered solves
@@ -345,35 +345,35 @@ void gs2 (int i, int j, int ilo, int jlo, int ihi, int jhi,
 template<bool is_cell_centered = true>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void gs3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
-          amrex::Array4<amrex::Real> const& phi, amrex::Real rhs, amrex::Real facx,
+          Array3<amrex::Real> const& phi, amrex::Real rhs, amrex::Real facx,
           amrex::Real facy)
 {
     amrex::Real lap;
     amrex::Real c0 = -amrex::Real(2.)*(facx+facy);
     if (is_cell_centered && i == ilo) {
-        lap = facx * amrex::Real(4./3.)*phi(i+1,j,0,n);
+        lap = facx * amrex::Real(4./3.)*phi(i+1,j,n);
         c0 -= amrex::Real(2.)*facx;
     } else if (is_cell_centered && i == ihi) {
-        lap = facx * amrex::Real(4./3.)*phi(i-1,j,0,n);
+        lap = facx * amrex::Real(4./3.)*phi(i-1,j,n);
         c0 -= amrex::Real(2.)*facx;
     } else {
-        lap = facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n));
+        lap = facx * (phi(i-1,j,n) + phi(i+1,j,n));
     }
     if (is_cell_centered && j == jlo) {
-        lap += facy * amrex::Real(4./3.)*phi(i,j+1,0,n);
+        lap += facy * amrex::Real(4./3.)*phi(i,j+1,n);
         c0 -= amrex::Real(2.)*facy;
     } else if (is_cell_centered && j == jhi) {
-        lap += facy * amrex::Real(4./3.)*phi(i,j-1,0,n);
+        lap += facy * amrex::Real(4./3.)*phi(i,j-1,n);
         c0 -= amrex::Real(2.)*facy;
     } else {
-        lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n));
+        lap += facy * (phi(i,j-1,n) + phi(i,j+1,n));
     }
     const amrex::Real c0_inv = amrex::Real(1.) / c0;
-    phi(i,j,0,n) = (rhs - lap) * c0_inv;
+    phi(i,j,n) = (rhs - lap) * c0_inv;
 }
 
-void gsrb (int icolor, amrex::Box const& box, amrex::Array4<amrex::Real> const& phi,
-           amrex::Array4<amrex::Real const> const& rhs, amrex::Array4<amrex::Real const> const& acf,
+void gsrb (int icolor, amrex::Box const& box, Array3<amrex::Real> const& phi,
+           Array3<amrex::Real const> const& rhs, Array3<amrex::Real const> const& acf,
            amrex::Real dx, amrex::Real dy, int system_type)
 {
     int const ilo = box.smallEnd(0);
@@ -387,8 +387,8 @@ void gsrb (int icolor, amrex::Box const& box, amrex::Array4<amrex::Real> const&
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
             if ((i+j+icolor)%2 == 0) {
-                gs1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), acf(i,j,0), facx, facy);
-                gs1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), acf(i,j,0), facx, facy);
+                gs1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), acf(i,j,0), facx, facy);
+                gs1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,1), acf(i,j,0), facx, facy);
             }
         });
     } else if (system_type == 2) {
@@ -396,8 +396,8 @@ void gsrb (int icolor, amrex::Box const& box, amrex::Array4<amrex::Real> const&
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
             if ((i+j+icolor)%2 == 0) {
-                gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), rhs(i,j,0,1),
-                    acf(i,j,0,0), acf(i,j,0,1), facx, facy);
+                gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), rhs(i,j,1),
+                    acf(i,j,0), acf(i,j,1), facx, facy);
             }
         });
     } else {
@@ -405,7 +405,7 @@ void gsrb (int icolor, amrex::Box const& box, amrex::Array4<amrex::Real> const&
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
             if ((i+j+icolor)%2 == 0) {
-                gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy);
+                gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), facx, facy);
             }
         });
     }
@@ -419,10 +419,10 @@ void gsrb (int icolor, amrex::Box const& box, amrex::Array4<amrex::Real> const&
 
 // do multiple gsrb iterations in GPU shared memory with many ghost cells
 template<int system_type, bool zero_init, bool do_compute_residual, bool is_cell_centered>
-void gsrb_shared (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_out,
-                  amrex::Array4<amrex::Real const> const& rhs,
-                  amrex::Array4<amrex::Real const> const& acf, amrex::Array4<amrex::Real> const& res,
-                  amrex::Array4<amrex::Real const> const& phi_in, amrex::Real dx, amrex::Real dy)
+void gsrb_shared (amrex::Box const& box, Array3<amrex::Real> const& phi_out,
+                  Array3<amrex::Real const> const& rhs,
+                  Array3<amrex::Real const> const& acf, Array3<amrex::Real> const& res,
+                  Array3<amrex::Real const> const& phi_in, amrex::Real dx, amrex::Real dy)
 {
     constexpr int num_comps = MultiGrid::get_num_comps(system_type);
     constexpr int num_comps_acf = MultiGrid::get_num_comps_acf(system_type);
@@ -473,9 +473,9 @@ void gsrb_shared (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
             const int tile_end_x = tile_begin_x + tilesize_array_x;
             const int tile_end_y = tile_begin_y + tilesize_array_y;
 
-            // make amrex::Array4 reference shared memory tile
-            amrex::Array4<amrex::Real> phi_shared(phi_ptr, {tile_begin_x, tile_begin_y, 0},
-                                             {tile_end_x, tile_end_y, 1}, num_comps);
+            // make Array3 reference shared memory tile
+            Array3<amrex::Real> phi_shared({phi_ptr, {tile_begin_x, tile_begin_y, 0},
+                                                     {tile_end_x, tile_end_y, 1}, num_comps});
 
             if (zero_init) {
                 // initialize shared memory to zero
@@ -492,11 +492,11 @@ void gsrb_shared (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
                     if (ilo_loop <= sx && sx <= ihi_loop &&
                         jlo_loop <= sy && sy <= jhi_loop) {
                         for (int n=0; n<num_comps; ++n) {
-                            phi_shared(sx, sy, 0, n) = phi_in(sx, sy, 0, n);
+                            phi_shared(sx, sy, n) = phi_in(sx, sy, n);
                         }
                     } else {
                         for (int n=0; n<num_comps; ++n) {
-                            phi_shared(sx, sy, 0, n) = amrex::Real(0.);
+                            phi_shared(sx, sy, n) = amrex::Real(0.);
                         }
                     }
                 }
@@ -517,10 +517,10 @@ void gsrb_shared (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
                     jlo_loop <= j+nj && j+nj <= jhi_loop) {
                     // load rhs and acf into registers to avoid memory accesses in the gs iterations
                     for (int n=0; n<num_comps; ++n) {
-                        rhs_num[nj][n] = rhs(i, j+nj, 0, n);
+                        rhs_num[nj][n] = rhs(i, j+nj, n);
                     }
                     for (int n=0; n<num_comps_acf; ++n) {
-                        acf_num[nj][n] = acf(i, j+nj, 0, n);
+                        acf_num[nj][n] = acf(i, j+nj, n);
                     }
                 }
             }
@@ -568,30 +568,30 @@ void gsrb_shared (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
                         // fuse with compute_residual kernel and reuse phi_shared
                         // at the cost of one extra ghost cell in each direction
                         if (system_type == 1) {
-                            res(i, j+nj, 0, 0) = residual1(
-                                                    i, j+nj, 0, ilo, jlo, ihi, jhi, phi_shared,
-                                                    rhs_num[nj][0], acf_num[nj][0], facx, facy);
-                            res(i, j+nj, 0, 1) = residual1(
-                                                    i, j+nj, 1, ilo, jlo, ihi, jhi, phi_shared,
-                                                    rhs_num[nj][1], acf_num[nj][0], facx, facy);
+                            res(i, j+nj, 0) = residual1(
+                                                i, j+nj, 0, ilo, jlo, ihi, jhi, phi_shared,
+                                                rhs_num[nj][0], acf_num[nj][0], facx, facy);
+                            res(i, j+nj, 1) = residual1(
+                                                i, j+nj, 1, ilo, jlo, ihi, jhi, phi_shared,
+                                                rhs_num[nj][1], acf_num[nj][0], facx, facy);
                         } else if (system_type == 2) {
-                            res(i, j+nj, 0, 0) = residual2r(
-                                                    i, j+nj, ilo, jlo, ihi, jhi, phi_shared,
-                                                    rhs_num[nj][0], acf_num[nj][0], acf_num[nj][1],
-                                                    facx, facy);
-                            res(i, j+nj, 0, 1) = residual2i(
-                                                    i, j+nj, ilo, jlo, ihi, jhi, phi_shared,
-                                                    rhs_num[nj][1], acf_num[nj][0], acf_num[nj][1],
-                                                    facx, facy);
+                            res(i, j+nj, 0) = residual2r(
+                                                i, j+nj, ilo, jlo, ihi, jhi, phi_shared,
+                                                rhs_num[nj][0], acf_num[nj][0], acf_num[nj][1],
+                                                facx, facy);
+                            res(i, j+nj, 1) = residual2i(
+                                                i, j+nj, ilo, jlo, ihi, jhi, phi_shared,
+                                                rhs_num[nj][1], acf_num[nj][0], acf_num[nj][1],
+                                                facx, facy);
                         } else if (system_type == 3) {
-                            res(i, j+nj, 0, 0) = residual3(
-                                                    i, j+nj, 0, ilo, jlo, ihi, jhi, phi_shared,
-                                                    rhs_num[nj][0], facx, facy);
+                            res(i, j+nj, 0) = residual3(
+                                                i, j+nj, 0, ilo, jlo, ihi, jhi, phi_shared,
+                                                rhs_num[nj][0], facx, facy);
                         }
                     }
 
                     for (int n=0; n<num_comps; ++n) {
-                        phi_out(i, j+nj, 0, n) = phi_shared(i, j+nj, 0, n);
+                        phi_out(i, j+nj, n) = phi_shared(i, j+nj, n);
                     }
                 }
             }
@@ -602,11 +602,11 @@ void gsrb_shared (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
 
 // do multiple gsrb iterations in CPU cached memory with many ghost cells
 template<int system_type, bool zero_init, bool do_compute_residual, bool is_cell_centered>
-void gsrb_cached (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_out,
-                  amrex::Array4<amrex::Real const> const& rhs,
-                  amrex::Array4<amrex::Real const> const& acf,
-                  amrex::Array4<amrex::Real> const& res,
-                  amrex::Array4<amrex::Real const> const& phi_in, amrex::Real dx, amrex::Real dy)
+void gsrb_cached (amrex::Box const& box, Array3<amrex::Real> const& phi_out,
+                  Array3<amrex::Real const> const& rhs,
+                  Array3<amrex::Real const> const& acf,
+                  Array3<amrex::Real> const& res,
+                  Array3<amrex::Real const> const& phi_in, amrex::Real dx, amrex::Real dy)
 {
     constexpr int num_comps = MultiGrid::get_num_comps(system_type);
     constexpr int tilesize_x = 64;
@@ -652,9 +652,9 @@ void gsrb_cached (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
             const int tile_end_x = tile_begin_x + tilesize_array_x;
             const int tile_end_y = tile_begin_y + tilesize_array_y;
 
-            // make amrex::Array4 reference cached memory tile
-            amrex::Array4<amrex::Real> phi_cached(phi_ptr, {tile_begin_x, tile_begin_y, 0},
-                                             {tile_end_x, tile_end_y, 1}, num_comps);
+            // make Array3 reference cached memory tile
+            Array3<amrex::Real> phi_cached({phi_ptr, {tile_begin_x, tile_begin_y, 0},
+                                                     {tile_end_x, tile_end_y, 1}, num_comps});
 
             if  (zero_init) {
                 // initialize cached memory to zero
@@ -668,11 +668,11 @@ void gsrb_cached (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
                         if (ilo_loop <= i && i <= ihi_loop &&
                             jlo_loop <= j && j <= jhi_loop) {
                             for (int n=0; n<num_comps; ++n) {
-                                phi_cached(i, j, 0, n) = phi_in(i, j, 0, n);
+                                phi_cached(i, j, n) = phi_in(i, j, n);
                             }
                         } else {
                             for (int n=0; n<num_comps; ++n) {
-                                phi_cached(i, j, 0, n) = amrex::Real(0.);
+                                phi_cached(i, j, n) = amrex::Real(0.);
                             }
                         }
                     }
@@ -691,16 +691,16 @@ void gsrb_cached (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
                     for (int i = i_start + shift; i < i_end; i+=2) {
                         if (system_type == 1) {
                             gs1<is_cell_centered>(i, j, 0, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 0), acf(i, j, 0, 0), facx, facy);
+                                rhs(i, j, 0), acf(i, j, 0), facx, facy);
                             gs1<is_cell_centered>(i, j, 1, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 1), acf(i, j, 0, 0), facx, facy);
+                                rhs(i, j, 1), acf(i, j, 0), facx, facy);
                         } else if (system_type == 2) {
                             gs2<is_cell_centered>(i, j, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 0), rhs(i, j, 0, 1),
-                                acf(i, j, 0, 0), acf(i, j, 0, 1), facx, facy);
+                                rhs(i, j, 0), rhs(i, j, 1),
+                                acf(i, j, 0), acf(i, j, 1), facx, facy);
                         } else if (system_type == 3) {
                             gs3<is_cell_centered>(i, j, 0, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 0), facx, facy);
+                                rhs(i, j, 0), facx, facy);
                         }
                     }
                 }
@@ -719,30 +719,30 @@ void gsrb_cached (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
                         // fuse with compute_residual loop and reuse phi_cached
                         // at the cost of one extra ghost cell in each direction
                         if (system_type == 1) {
-                            res(i, j, 0, 0) = residual1(
+                            res(i, j, 0) = residual1(
                                 i, j, 0, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 0), acf(i, j, 0, 0), facx, facy);
-                            res(i, j, 0, 1) = residual1(
+                                rhs(i, j, 0), acf(i, j, 0), facx, facy);
+                            res(i, j, 1) = residual1(
                                 i, j, 1, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 1), acf(i, j, 0, 0), facx, facy);
+                                rhs(i, j, 1), acf(i, j, 0), facx, facy);
                         } else if (system_type == 2) {
-                            res(i, j, 0, 0) = residual2r(
+                            res(i, j, 0) = residual2r(
                                 i, j, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 0), acf(i, j, 0, 0),
-                                acf(i, j, 0, 1), facx, facy);
-                            res(i, j, 0, 1) = residual2i(
+                                rhs(i, j, 0), acf(i, j, 0),
+                                acf(i, j, 1), facx, facy);
+                            res(i, j, 1) = residual2i(
                                 i, j, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 1), acf(i, j, 0, 0),
-                                acf(i, j, 0, 1), facx, facy);
+                                rhs(i, j, 1), acf(i, j, 0),
+                                acf(i, j, 1), facx, facy);
                         } else if (system_type == 3) {
-                            res(i, j, 0, 0) = residual3(
+                            res(i, j, 0) = residual3(
                                 i, j, 0, ilo, jlo, ihi, jhi, phi_cached,
-                                rhs(i, j, 0, 0), facx, facy);
+                                rhs(i, j, 0), facx, facy);
                         }
                     }
 
                     for (int n=0; n<num_comps; ++n) {
-                        phi_out(i, j, 0, n) = phi_cached(i, j, 0, n);
+                        phi_out(i, j, n) = phi_cached(i, j, n);
                     }
                 }
             }
@@ -754,11 +754,11 @@ void gsrb_cached (amrex::Box const& box, amrex::Array4<amrex::Real> const& phi_o
 
 template<bool zero_init, bool do_compute_residual>
 void gsrb_4_residual (int system_type, amrex::Box const& box,
-                      amrex::Array4<amrex::Real> const& phi_out,
-                      amrex::Array4<amrex::Real const> const& rhs,
-                      amrex::Array4<amrex::Real const> const& acf,
-                      amrex::Array4<amrex::Real> const& res,
-                      amrex::Array4<amrex::Real const> const& phi_in,
+                      Array3<amrex::Real> const& phi_out,
+                      Array3<amrex::Real const> const& rhs,
+                      Array3<amrex::Real const> const& acf,
+                      Array3<amrex::Real> const& res,
+                      Array3<amrex::Real const> const& phi_in,
                       amrex::Real dx, amrex::Real dy)
 {
     // This function performs the main computation for the multigrid solver.
@@ -833,9 +833,9 @@ void gsrb_4_residual (int system_type, amrex::Box const& box,
         hpmg::ParallelFor(to2D(box), MultiGrid::get_num_comps(system_type),
             [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept {
                 if (valid_domain.contains(i,j,0)) {
-                    phi_out(i,j,0,n) = phi_in(i,j,0,n);
+                    phi_out(i,j,n) = phi_in(i,j,n);
                 } else {
-                    phi_out(i,j,0,n) = amrex::Real(0.);
+                    phi_out(i,j,n) = amrex::Real(0.);
                 }
             });
     }
@@ -863,9 +863,10 @@ void gsrb_4_residual (int system_type, amrex::Box const& box,
 #endif
 
 template <int NS, int system_type, typename FGS, typename FRES>
-void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4<amrex::Real> const* acf,
-                      amrex::Array4<amrex::Real> const* res, amrex::Array4<amrex::Real> const* cor,
-                      amrex::Array4<amrex::Real> const* rescor, int nlevs, int corner_offset,
+void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, Array3<amrex::Real> const* acf,
+                      Array3<amrex::Real> const* res, Array3<amrex::Real> const* cor,
+                      Array3<amrex::Real> const* rescor, amrex::BoxND<2> const* lev_domain,
+                      int nlevs, int corner_offset,
                       FGS&& fgs, FRES&& fres)
 {
     // This function performs all the operations of a vcycle in a single GPU kernel by
@@ -876,47 +877,50 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4<amrex::Rea
 
     static_assert(n_cell_single*n_cell_single <= 1024, "n_cell_single is too big");
 #if defined(AMREX_USE_DPCPP)
-    amrex::launch(1, 1024, amrex::Gpu::gpuStream(),
+    amrex::launch(system_type == 1 ? 2 : 1, 1024, amrex::Gpu::gpuStream(),
     [=] (sycl::nd_item<1> const& item) noexcept
 #else
-    amrex::launch_global<1024><<<1, 1024, 0, amrex::Gpu::gpuStream()>>>(
+    amrex::launch_global<1024><<<system_type == 1 ? 2 : 1, 1024, 0, amrex::Gpu::gpuStream()>>>(
     [=] AMREX_GPU_DEVICE () noexcept
 #endif
     {
         amrex::Real facx = amrex::Real(1.)/(dx0*dx0);
         amrex::Real facy = amrex::Real(1.)/(dy0*dy0);
-        int lenx = cor[0].end.x - cor[0].begin.x - 2*corner_offset;
-        int leny = cor[0].end.y - cor[0].begin.y - 2*corner_offset;
-        int ncells = lenx*leny;
+        const int lenx = lev_domain[0].length(0) - 2*corner_offset;
 #if defined(AMREX_USE_DPCPP)
         const int icell = item.get_local_linear_id();
+        const int n = system_type == 1 ? item.get_group_linear_id() : 0;
 #else
         const int icell = threadIdx.x;
+        const int n = system_type == 1 ? blockIdx.x : 0;
 #endif
         int j = icell /   lenx;
         int i = icell - j*lenx;
-        j += cor[0].begin.y + corner_offset;
-        i += cor[0].begin.x + corner_offset;
+        j += lev_domain[0].smallEnd(1) + corner_offset;
+        i += lev_domain[0].smallEnd(0) + corner_offset;
+        bool is_active = true;
 
         for (int ilev = 0; ilev < nlevs-1; ++ilev) {
             // set phi to zero
-            if (icell < ncells) {
-                if (system_type == 1 || system_type == 2) {
-                    cor[ilev](i,j,0,0) = amrex::Real(0.);
-                    cor[ilev](i,j,0,1) = amrex::Real(0.);
+            if (is_active) {
+                if (system_type == 1) {
+                    cor[ilev](i,j,n) = amrex::Real(0.);
+                } else if (system_type == 2) {
+                    cor[ilev](i,j,0) = amrex::Real(0.);
+                    cor[ilev](i,j,1) = amrex::Real(0.);
                 } else {
-                    cor[ilev](i,j,0,0) = amrex::Real(0.);
+                    cor[ilev](i,j,0) = amrex::Real(0.);
                 }
             }
             HPMG_SYNCTHREADS;
 
             // do 4 Gauss-Seidel red-black iterations
             for (int is = 0; is < 4; ++is) {
-                if (icell < ncells) {
+                if (is_active) {
                     if ((i+j+is)%2 == 0) {
-                        fgs(i, j,
-                            cor[ilev].begin.x, cor[ilev].begin.y,
-                            cor[ilev].end.x-1, cor[ilev].end.y-1,
+                        fgs(i, j, n,
+                            lev_domain[ilev].smallEnd(0), lev_domain[ilev].smallEnd(1),
+                            lev_domain[ilev].bigEnd(0), lev_domain[ilev].bigEnd(1),
                             cor[ilev], res[ilev], acf[ilev],
                             facx, facy);
                     }
@@ -925,11 +929,11 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4<amrex::Rea
             }
 
             // calculate the residual
-            if (icell < ncells) {
-                fres(i, j,
+            if (is_active) {
+                fres(i, j, n,
                      rescor[ilev],
-                     cor[ilev].begin.x, cor[ilev].begin.y,
-                     cor[ilev].end.x-1, cor[ilev].end.y-1,
+                     lev_domain[ilev].smallEnd(0), lev_domain[ilev].smallEnd(1),
+                     lev_domain[ilev].bigEnd(0), lev_domain[ilev].bigEnd(1),
                      cor[ilev],
                      res[ilev],
                      acf[ilev], facx, facy);
@@ -937,25 +941,27 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4<amrex::Rea
             HPMG_SYNCTHREADS;
 
             // interpolate residual to next level
-            lenx = cor[ilev+1].end.x - cor[ilev+1].begin.x - 2*corner_offset;
-            leny = cor[ilev+1].end.y - cor[ilev+1].begin.y - 2*corner_offset;
-            ncells = lenx*leny;
-            if (icell < ncells) {
-                j = icell /   lenx;
-                i = icell - j*lenx;
-                j += cor[ilev+1].begin.y + corner_offset;
-                i += cor[ilev+1].begin.x + corner_offset;
+            is_active =
+                i >= (lev_domain[ilev+1].smallEnd(0) + corner_offset) &&
+                i <= (lev_domain[ilev+1].bigEnd(0) - corner_offset) &&
+                j >= (lev_domain[ilev+1].smallEnd(1) + corner_offset) &&
+                j <= (lev_domain[ilev+1].bigEnd(1) - corner_offset);
+            if (is_active) {
                 if (corner_offset == 0) {
-                    if (system_type == 1 || system_type == 2) {
+                    if (system_type == 1) {
+                        restrict_cc(i,j,n,res[ilev+1],rescor[ilev]);
+                    } else if (system_type == 2) {
                         restrict_cc(i,j,0,res[ilev+1],rescor[ilev]);
                         restrict_cc(i,j,1,res[ilev+1],rescor[ilev]);
                     } else {
                         restrict_cc(i,j,0,res[ilev+1],rescor[ilev]);
                     }
                 } else {
-                    if (system_type == 1 || system_type == 2) {
-                        restrict_nd(i,j,0,res[ilev+1],rescor[ilev]);
-                        restrict_nd(i,j,1,res[ilev+1],rescor[ilev]);
+                    if (system_type == 1) {
+                        restrict_nd(i,j,n,res[ilev+1],rescor[ilev]);
+                    } else if (system_type == 2) {
+                        restrict_cc(i,j,0,res[ilev+1],rescor[ilev]);
+                        restrict_cc(i,j,1,res[ilev+1],rescor[ilev]);
                     } else {
                         restrict_nd(i,j,0,res[ilev+1],rescor[ilev]);
                     }
@@ -971,23 +977,25 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4<amrex::Rea
         {
             // set phi to zero
             const int ilev = nlevs-1;
-            if (icell < ncells) {
-                if (system_type == 1 || system_type == 2) {
-                    cor[ilev](i,j,0,0) = amrex::Real(0.);
-                    cor[ilev](i,j,0,1) = amrex::Real(0.);
+            if (is_active) {
+                if (system_type == 1) {
+                    cor[ilev](i,j,n) = amrex::Real(0.);
+                } else if (system_type == 2) {
+                    cor[ilev](i,j,0) = amrex::Real(0.);
+                    cor[ilev](i,j,1) = amrex::Real(0.);
                 } else {
-                    cor[ilev](i,j,0,0) = amrex::Real(0.);
+                    cor[ilev](i,j,0) = amrex::Real(0.);
                 }
             }
             HPMG_SYNCTHREADS;
 
             // do NS Gauss-Seidel red-black iterations
             for (int is = 0; is < NS; ++is) {
-                if (icell < ncells) {
+                if (is_active) {
                     if ((i+j+is)%2 == 0) {
-                        fgs(i, j,
-                            cor[ilev].begin.x, cor[ilev].begin.y,
-                            cor[ilev].end.x-1, cor[ilev].end.y-1,
+                        fgs(i, j, n,
+                            lev_domain[ilev].smallEnd(0), lev_domain[ilev].smallEnd(1),
+                            lev_domain[ilev].bigEnd(0), lev_domain[ilev].bigEnd(1),
                             cor[ilev], res[ilev], acf[ilev],
                             facx, facy);
                     }
@@ -998,26 +1006,30 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4<amrex::Rea
 
         // interpolate solution from previous level to phi
         for (int ilev = nlevs-2; ilev >=0; --ilev) {
-            lenx = cor[ilev].end.x - cor[ilev].begin.x - 2*corner_offset;
-            leny = cor[ilev].end.y - cor[ilev].begin.y - 2*corner_offset;
-            ncells = lenx*leny;
+
+            is_active =
+                i >= (lev_domain[ilev].smallEnd(0) + corner_offset) &&
+                i <= (lev_domain[ilev].bigEnd(0) - corner_offset) &&
+                j >= (lev_domain[ilev].smallEnd(1) + corner_offset) &&
+                j <= (lev_domain[ilev].bigEnd(1) - corner_offset);
+
             facx *= amrex::Real(4.);
             facy *= amrex::Real(4.);
 
-            if (icell < ncells) {
-                j = icell /   lenx;
-                i = icell - j*lenx;
-                j += cor[ilev].begin.y + corner_offset;
-                i += cor[ilev].begin.x + corner_offset;
+            if (is_active) {
                 if (corner_offset == 0) {
-                    if (system_type == 1 || system_type == 2) {
+                    if (system_type == 1) {
+                        interpadd_cc(i, j, n, cor[ilev], cor[ilev+1]);
+                    } else if (system_type == 2) {
                         interpadd_cc(i, j, 0, cor[ilev], cor[ilev+1]);
                         interpadd_cc(i, j, 1, cor[ilev], cor[ilev+1]);
                     } else {
                         interpadd_cc(i, j, 0, cor[ilev], cor[ilev+1]);
                     }
                 } else {
-                    if (system_type == 1 || system_type == 2) {
+                    if (system_type == 1) {
+                        interpadd_nd(i, j, n, cor[ilev], cor[ilev+1]);
+                    } else if (system_type == 2) {
                         interpadd_nd(i, j, 0, cor[ilev], cor[ilev+1]);
                         interpadd_nd(i, j, 1, cor[ilev], cor[ilev+1]);
                     } else {
@@ -1029,11 +1041,11 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4<amrex::Rea
             // do 4 Gauss-Seidel red-black iterations
             for (int is = 0; is < 4; ++is) {
                 HPMG_SYNCTHREADS;
-                if (icell < ncells) {
+                if (is_active) {
                     if ((i+j+is)%2 == 0) {
-                        fgs(i, j,
-                            cor[ilev].begin.x, cor[ilev].begin.y,
-                            cor[ilev].end.x-1, cor[ilev].end.y-1,
+                        fgs(i, j, n,
+                            lev_domain[ilev].smallEnd(0), lev_domain[ilev].smallEnd(1),
+                            lev_domain[ilev].bigEnd(0), lev_domain[ilev].bigEnd(1),
                             cor[ilev], res[ilev], acf[ilev],
                             facx, facy);
                     }
@@ -1110,7 +1122,7 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a
     m_num_single_block_levels = m_num_mg_levels - m_single_block_level_begin;
 
     if (m_num_single_block_levels > 0) {
-        m_array4.reserve(nfabvs*m_num_single_block_levels);
+        m_array3.reserve(nfabvs*m_num_single_block_levels);
     }
 
     // Allocate the proportional coefficient of the Helmholtz equation.
@@ -1119,7 +1131,7 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a
     for (int ilev = 0; ilev < m_num_mg_levels; ++ilev) {
         m_acf.emplace_back(m_domain[ilev], m_num_comps_acf);
         if (ilev >= m_single_block_level_begin) {
-            m_array4.push_back(m_acf[ilev].array());
+            m_array3.push_back(m_acf[ilev].array());
         }
     }
 
@@ -1135,7 +1147,7 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a
                 m_res[ilev].template setVal<amrex::RunOn::Device>(0);
             }
             if (ilev >= m_single_block_level_begin) {
-                m_array4.push_back(m_res[ilev].array());
+                m_array3.push_back(m_res[ilev].array());
             }
         }
     }
@@ -1148,7 +1160,7 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a
             m_cor[ilev].template setVal<amrex::RunOn::Device>(0);
         }
         if (ilev >= m_single_block_level_begin) {
-            m_array4.push_back(m_cor[ilev].array());
+            m_array3.push_back(m_cor[ilev].array());
         }
     }
 
@@ -1160,17 +1172,24 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a
             m_rescor[ilev].template setVal<amrex::RunOn::Device>(0);
         }
         if (ilev >= m_single_block_level_begin) {
-            m_array4.push_back(m_rescor[ilev].array());
+            m_array3.push_back(m_rescor[ilev].array());
         }
     }
 
-    // Copy amrex::Array4s to GPU for bottomsolve_gpu()
-    if (!m_array4.empty()) {
-        m_array4.copyToDeviceAsync();
-        m_acf_a = m_array4.data();
+    // Copy domain to GPU
+    m_lev_domain.reserve(m_num_single_block_levels);
+    for (int ilev = m_single_block_level_begin; ilev < m_num_mg_levels; ++ilev) {
+        m_lev_domain.push_back(to2D(m_domain[ilev]));
+    }
+
+    // Copy Array3s to GPU for bottomsolve_gpu()
+    if (!m_array3.empty()) {
+        m_array3.copyToDeviceAsync();
+        m_acf_a = m_array3.data();
         m_res_a = m_acf_a + m_num_single_block_levels;
         m_cor_a = m_res_a + m_num_single_block_levels;
         m_rescor_a = m_cor_a + m_num_single_block_levels;
+        m_lev_domain.copyToDeviceAsync();
     }
 }
 
@@ -1188,8 +1207,8 @@ MultiGrid::solve1 (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs,
 
     amrex::FArrayBox afab(center_box(a_acf.box(), m_domain.front()), 1, a_acf.dataPtr());
 
-    auto const& array_m_acf = m_acf[0].array();
-    auto const& array_a_acf = afab.const_array();
+    const Array3<amrex::Real>& array_m_acf = m_acf[0].array();
+    const Array3<const amrex::Real>& array_a_acf = afab.const_array();
     hpmg::ParallelFor(to2D(m_acf[0].box()),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
@@ -1210,13 +1229,13 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs,
     HIPACE_PROFILE("hpmg::MultiGrid::solve2()");
     AMREX_ALWAYS_ASSERT(m_system_type == 2);
 
-    auto const& array_m_acf = m_acf[0].array();
+    const Array3<amrex::Real>& array_m_acf = m_acf[0].array();
 
     hpmg::ParallelFor(to2D(m_acf[0].box()),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
-            array_m_acf(i,j,0,0) = acoef_real;
-            array_m_acf(i,j,0,1) = acoef_imag;
+            array_m_acf(i,j,0) = acoef_real;
+            array_m_acf(i,j,1) = acoef_imag;
         });
 
     average_down_acoef();
@@ -1233,15 +1252,15 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs,
     HIPACE_PROFILE("hpmg::MultiGrid::solve2()");
     AMREX_ALWAYS_ASSERT(m_system_type == 2);
 
-    auto const& array_m_acf = m_acf[0].array();
+    const Array3<amrex::Real>& array_m_acf = m_acf[0].array();
 
     amrex::FArrayBox ifab(center_box(acoef_imag.box(), m_domain.front()), 1, acoef_imag.dataPtr());
-    auto const& ai = ifab.const_array();
+    const Array3<const amrex::Real>& ai = ifab.const_array();
     hpmg::ParallelFor(to2D(m_acf[0].box()),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
-            array_m_acf(i,j,0,0) = acoef_real;
-            array_m_acf(i,j,0,1) = ai(i,j,0);
+            array_m_acf(i,j,0) = acoef_real;
+            array_m_acf(i,j,1) = ai(i,j,0);
         });
 
     average_down_acoef();
@@ -1258,15 +1277,15 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs,
     HIPACE_PROFILE("hpmg::MultiGrid::solve2()");
     AMREX_ALWAYS_ASSERT(m_system_type == 2);
 
-    auto const& array_m_acf = m_acf[0].array();
+    const Array3<amrex::Real>& array_m_acf = m_acf[0].array();
 
     amrex::FArrayBox rfab(center_box(acoef_real.box(), m_domain.front()), 1, acoef_real.dataPtr());
-    auto const& ar = rfab.const_array();
+    const Array3<const amrex::Real>& ar = rfab.const_array();
     hpmg::ParallelFor(to2D(m_acf[0].box()),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
-            array_m_acf(i,j,0,0) = ar(i,j,0);
-            array_m_acf(i,j,0,1) = acoef_imag;
+            array_m_acf(i,j,0) = ar(i,j,0);
+            array_m_acf(i,j,1) = acoef_imag;
         });
 
     average_down_acoef();
@@ -1283,17 +1302,17 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs,
     HIPACE_PROFILE("hpmg::MultiGrid::solve2()");
     AMREX_ALWAYS_ASSERT(m_system_type == 2);
 
-    auto const& array_m_acf = m_acf[0].array();
+    const Array3<amrex::Real>& array_m_acf = m_acf[0].array();
 
     amrex::FArrayBox rfab(center_box(acoef_real.box(), m_domain.front()), 1, acoef_real.dataPtr());
     amrex::FArrayBox ifab(center_box(acoef_imag.box(), m_domain.front()), 1, acoef_imag.dataPtr());
-    auto const& ar = rfab.const_array();
-    auto const& ai = ifab.const_array();
+    const Array3<const amrex::Real>& ar = rfab.const_array();
+    const Array3<const amrex::Real>& ai = ifab.const_array();
     hpmg::ParallelFor(to2D(m_acf[0].box()),
         [=] AMREX_GPU_DEVICE (int i, int j) noexcept
         {
-            array_m_acf(i,j,0,0) = ar(i,j,0);
-            array_m_acf(i,j,0,1) = ai(i,j,0);
+            array_m_acf(i,j,0) = ar(i,j,0);
+            array_m_acf(i,j,1) = ai(i,j,0);
         });
 
     average_down_acoef();
@@ -1338,7 +1357,7 @@ MultiGrid::solve_doit (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs,
     // cor = gsrb(gsrb(gsrb(gsrb(sol))))
     // rescor = rhs - L(cor)
     gsrb_4_residual<false, true>(m_system_type, m_domain[0], m_cor[0].array(), m_rhs.const_array(),
-        m_acf[0].const_array(), m_rescor[0].array(), m_sol.array(), m_dx, m_dy);
+        m_acf[0].const_array(), m_rescor[0].array(), m_sol.const_array(), m_dx, m_dy);
 
     // Reduction on the right-hand side and residual to get convergence criteria
     amrex::Real resnorm0, rhsnorm0;
@@ -1346,12 +1365,12 @@ MultiGrid::solve_doit (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs,
         amrex::ReduceOps<amrex::ReduceOpMax,amrex::ReduceOpMax> reduce_op;
         amrex::ReduceData<amrex::Real,amrex::Real> reduce_data(reduce_op);
         using ReduceTuple = typename decltype(reduce_data)::Type;
-        const auto& array_res = m_rescor[0].const_array();
-        const auto& array_rhs = m_rhs.const_array();
+        const Array3<const amrex::Real>& array_res = m_rescor[0].const_array();
+        const Array3<const amrex::Real>& array_rhs = m_rhs.const_array();
         reduce_op.eval(valid_domain_box(m_domain[0]), m_num_comps, reduce_data,
             [=] AMREX_GPU_DEVICE (int i, int j, int, int n) noexcept -> ReduceTuple
             {
-                return {std::abs(array_res(i,j,0,n)), std::abs(array_rhs(i,j,0,n))};
+                return {std::abs(array_res(i,j,n)), std::abs(array_rhs(i,j,n))};
             });
 
         auto hv = reduce_data.value(reduce_op);
@@ -1427,12 +1446,12 @@ MultiGrid::solve_doit (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs,
     }
 
     // copy the cor array into the sol array used for the final output
-    auto const& sol = m_sol.array();
-    auto const& cor = m_cor[0].const_array();
+    const Array3<amrex::Real>& sol = m_sol.array();
+    const Array3<const amrex::Real>& cor = m_cor[0].const_array();
     hpmg::ParallelFor(to2D(valid_domain_box(m_domain[0])), m_num_comps,
     [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept
     {
-        sol(i,j,0,n) = cor(i,j,0,n);
+        sol(i,j,n) = cor(i,j,n);
     });
 }
 
@@ -1466,7 +1485,8 @@ MultiGrid::vcycle ()
             // rescor = res - L(cor)
             gsrb_4_residual<true, true>(
                 m_system_type, m_domain[ilev], m_cor[ilev].array(), m_res[ilev].const_array(),
-                m_acf[ilev].const_array(), m_rescor[ilev].array(), {}, dx, dy);
+                m_acf[ilev].const_array(), m_rescor[ilev].array(),
+                amrex::Array4<const amrex::Real>{}, dx, dy);
         }
 
         // interpolate residual to next level
@@ -1495,12 +1515,14 @@ MultiGrid::vcycle ()
             // sol = gsrb(gsrb(gsrb(gsrb(rescor))))
             gsrb_4_residual<false, false>(
                     m_system_type, m_domain[ilev], m_sol.array(), m_rhs.const_array(),
-                    m_acf[ilev].const_array(), {}, m_rescor[ilev].array(), dx, dy);
+                    m_acf[ilev].const_array(), amrex::Array4<amrex::Real>{},
+                    m_rescor[ilev].const_array(), dx, dy);
         } else {
             // cor = gsrb(gsrb(gsrb(gsrb(rescor))))
             gsrb_4_residual<false, false>(
                     m_system_type, m_domain[ilev], m_cor[ilev].array(), m_res[ilev].const_array(),
-                    m_acf[ilev].const_array(), {}, m_rescor[ilev].array(), dx, dy);
+                    m_acf[ilev].const_array(), amrex::Array4<amrex::Real>{},
+                    m_rescor[ilev].const_array(), dx, dy);
         }
     }
 
@@ -1511,7 +1533,7 @@ MultiGrid::vcycle ()
     // rescor = rhs - L(cor)
     gsrb_4_residual<false, true>(
         m_system_type, m_domain[0], m_cor[0].array(), m_rhs.const_array(),
-        m_acf[0].const_array(), m_rescor[0].array(), m_sol.array(), m_dx, m_dy);
+        m_acf[0].const_array(), m_rescor[0].array(), m_sol.const_array(), m_dx, m_dy);
 
 #if defined(AMREX_USE_CUDA)
         cudaStreamEndCapture(amrex::Gpu::gpuStream(), &m_cuda_graph_vcycle[key].first);
@@ -1535,79 +1557,76 @@ MultiGrid::bottomsolve ()
         int const corner_offset = m_domain[0].cellCentered() ? 0 : 1;
 
         if (m_system_type == 1) {
-            bottomsolve_gpu<nsweeps,1>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs,
-                                       corner_offset,
-                [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                                     amrex::Array4<amrex::Real> const& phi,
-                                     amrex::Array4<amrex::Real> const& rhs,
-                                     amrex::Array4<amrex::Real> const& acf,
+            bottomsolve_gpu<nsweeps,1>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a,
+                                       m_lev_domain.data(), nlevs, corner_offset,
+                [] AMREX_GPU_DEVICE (int i, int j, int n, int ilo, int jlo, int ihi, int jhi,
+                                     Array3<amrex::Real> const& phi,
+                                     Array3<amrex::Real> const& rhs,
+                                     Array3<amrex::Real> const& acf,
                                      amrex::Real facx, amrex::Real facy)
                 {
                     amrex::Real a = acf(i,j,0);
-                    gs1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a, facx, facy);
-                    gs1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a, facx, facy);
+                    gs1(i, j, n, ilo, jlo, ihi, jhi, phi, rhs(i,j,n), a, facx, facy);
                 },
-                [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4<amrex::Real> const& res,
+                [] AMREX_GPU_DEVICE (int i, int j, int n, Array3<amrex::Real> const& res,
                                      int ilo, int jlo, int ihi, int jhi,
-                                     amrex::Array4<amrex::Real> const& phi,
-                                     amrex::Array4<amrex::Real> const& rhs,
-                                     amrex::Array4<amrex::Real> const& acf,
+                                     Array3<amrex::Real> const& phi,
+                                     Array3<amrex::Real> const& rhs,
+                                     Array3<amrex::Real> const& acf,
                                      amrex::Real facx, amrex::Real facy)
                 {
                     amrex::Real a = acf(i,j,0);
-                    res(i,j,0,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a,
-                                             facx, facy);
-                    res(i,j,0,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a,
-                                             facx, facy);
+                    res(i,j,n) = residual1(i, j, n, ilo, jlo, ihi, jhi, phi, rhs(i,j,n), a,
+                                           facx, facy);
                 });
         } else if (m_system_type == 2) {
-            bottomsolve_gpu<nsweeps,2>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs,
-                                       corner_offset,
-                [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                                     amrex::Array4<amrex::Real> const& phi,
-                                     amrex::Array4<amrex::Real> const& rhs,
-                                     amrex::Array4<amrex::Real> const& acf,
+            bottomsolve_gpu<nsweeps,2>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a,
+                                       m_lev_domain.data(), nlevs, corner_offset,
+                [] AMREX_GPU_DEVICE (int i, int j, int, int ilo, int jlo, int ihi, int jhi,
+                                     Array3<amrex::Real> const& phi,
+                                     Array3<amrex::Real> const& rhs,
+                                     Array3<amrex::Real> const& acf,
                                      amrex::Real facx, amrex::Real facy)
                 {
-                    amrex::Real ar = acf(i,j,0,0);
-                    amrex::Real ai = acf(i,j,0,1);
-                    gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), rhs(i,j,0,1), ar, ai,
+                    amrex::Real ar = acf(i,j,0);
+                    amrex::Real ai = acf(i,j,1);
+                    gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), rhs(i,j,1), ar, ai,
                         facx, facy);
                 },
-                [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4<amrex::Real> const& res,
+                [] AMREX_GPU_DEVICE (int i, int j, int, Array3<amrex::Real> const& res,
                                      int ilo, int jlo, int ihi, int jhi,
-                                     amrex::Array4<amrex::Real> const& phi,
-                                     amrex::Array4<amrex::Real> const& rhs,
-                                     amrex::Array4<amrex::Real> const& acf,
+                                     Array3<amrex::Real> const& phi,
+                                     Array3<amrex::Real> const& rhs,
+                                     Array3<amrex::Real> const& acf,
                                      amrex::Real facx, amrex::Real facy)
                 {
-                    amrex::Real ar = acf(i,j,0,0);
-                    amrex::Real ai = acf(i,j,0,1);
-                    res(i,j,0,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), ar, ai,
-                                              facx, facy);
-                    res(i,j,0,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), ar, ai,
-                                              facx, facy);
+                    amrex::Real ar = acf(i,j,0);
+                    amrex::Real ai = acf(i,j,1);
+                    res(i,j,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), ar, ai,
+                                            facx, facy);
+                    res(i,j,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,1), ar, ai,
+                                            facx, facy);
                 });
         } else {
-            bottomsolve_gpu<nsweeps,3>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs,
-                                       corner_offset,
-                [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi,
-                                     amrex::Array4<amrex::Real> const& phi,
-                                     amrex::Array4<amrex::Real> const& rhs,
-                                     amrex::Array4<amrex::Real> const&, amrex::Real facx,
+            bottomsolve_gpu<nsweeps,3>(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a,
+                                       m_lev_domain.data(), nlevs, corner_offset,
+                [] AMREX_GPU_DEVICE (int i, int j, int, int ilo, int jlo, int ihi, int jhi,
+                                     Array3<amrex::Real> const& phi,
+                                     Array3<amrex::Real> const& rhs,
+                                     Array3<amrex::Real> const&, amrex::Real facx,
                                      amrex::Real facy)
                 {
-                    gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy);
+                    gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), facx, facy);
                 },
-                [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4<amrex::Real> const& res,
+                [] AMREX_GPU_DEVICE (int i, int j, int, Array3<amrex::Real> const& res,
                                      int ilo, int jlo, int ihi, int jhi,
-                                     amrex::Array4<amrex::Real> const& phi,
-                                     amrex::Array4<amrex::Real> const& rhs,
-                                     amrex::Array4<amrex::Real> const&, amrex::Real facx,
+                                     Array3<amrex::Real> const& phi,
+                                     Array3<amrex::Real> const& rhs,
+                                     Array3<amrex::Real> const&, amrex::Real facx,
                                      amrex::Real facy)
                 {
-                    res(i,j,0,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0),
-                                             facx, facy);
+                    res(i,j,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0),
+                                           facx, facy);
                 });
         }
     } else
@@ -1632,7 +1651,8 @@ MultiGrid::bottomsolve ()
 #if defined(AMREX_USE_GPU)
 namespace {
     template <typename F>
-    void avgdown_acf (amrex::Array4<amrex::Real> const* acf, int ncomp, int nlevels, F&& f)
+    void avgdown_acf (Array3<amrex::Real> const* acf, amrex::BoxND<2> const* lev_domain,
+                      int ncomp, int nlevels, F&& f)
     {
 #if defined(AMREX_USE_DPCPP)
         amrex::launch(1, 1024, amrex::Gpu::gpuStream(),
@@ -1643,8 +1663,8 @@ namespace {
 #endif
         {
             for (int ilev = 1; ilev < nlevels; ++ilev) {
-                const int lenx = acf[ilev].end.x - acf[ilev].begin.x;
-                const int leny = acf[ilev].end.y - acf[ilev].begin.y;
+                const int lenx = lev_domain[ilev].length(0);
+                const int leny = lev_domain[ilev].length(1);
                 const int ncells = lenx*leny;
 #if defined(AMREX_USE_DPCPP)
                 for (int icell = item.get_local_range(0)*item.get_group_linear_id()
@@ -1656,10 +1676,10 @@ namespace {
                      icell < ncells; icell += stride) {
                     int j = icell /   lenx;
                     int i = icell - j*lenx;
-                    j += acf[ilev].begin.y;
-                    i += acf[ilev].begin.x;
+                    j += lev_domain[ilev].smallEnd(1);
+                    i += lev_domain[ilev].smallEnd(0);
                     for (int n = 0; n < ncomp; ++n) {
-                        f(i,j,n,acf[ilev],acf[ilev-1]);
+                        f(i,j,n,acf[ilev],acf[ilev-1],lev_domain[ilev]);
                     }
                 }
                 HPMG_SYNCTHREADS;
@@ -1678,8 +1698,8 @@ MultiGrid::average_down_acoef ()
 #endif
 
     for (int ilev = 1; ilev <= m_single_block_level_begin; ++ilev) {
-        auto const& crse = m_acf[ilev].array();
-        auto const& fine = m_acf[ilev-1].const_array();
+        const Array3<amrex::Real>& crse = m_acf[ilev].array();
+        const Array3<const amrex::Real>& fine = m_acf[ilev-1].const_array();
         if (m_domain[ilev].cellCentered()) {
             hpmg::ParallelFor(to2D(m_domain[ilev]), m_num_comps_acf,
             [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept
@@ -1698,24 +1718,26 @@ MultiGrid::average_down_acoef ()
 #if defined(AMREX_USE_GPU)
     if (m_num_single_block_levels > 1) {
         if (m_domain[0].cellCentered()) {
-            avgdown_acf(m_acf_a, m_num_comps_acf, m_num_single_block_levels,
+            avgdown_acf(m_acf_a, m_lev_domain.data(), m_num_comps_acf, m_num_single_block_levels,
                         [] AMREX_GPU_DEVICE (int i, int j, int n,
-                                             amrex::Array4<amrex::Real> const& crse,
-                                             amrex::Array4<amrex::Real> const& fine) noexcept
+                                             Array3<amrex::Real> const& crse,
+                                             Array3<amrex::Real> const& fine,
+                                             amrex::BoxND<2> const&) noexcept
                         {
                             restrict_cc(i,j,n,crse,fine);
                         });
         } else {
-            avgdown_acf(m_acf_a, m_num_comps_acf, m_num_single_block_levels,
+            avgdown_acf(m_acf_a, m_lev_domain.data(), m_num_comps_acf, m_num_single_block_levels,
                         [] AMREX_GPU_DEVICE (int i, int j, int n,
-                                             amrex::Array4<amrex::Real> const& crse,
-                                             amrex::Array4<amrex::Real> const& fine) noexcept
+                                             Array3<amrex::Real> const& crse,
+                                             Array3<amrex::Real> const& fine,
+                                             amrex::BoxND<2> const& bx) noexcept
                         {
-                            if (i == crse.begin.x ||
-                                j == crse.begin.y ||
-                                i == crse.end.x-1 ||
-                                j == crse.end.y-1) {
-                                crse(i,j,0,n) = amrex::Real(0.);
+                            if (i == bx.smallEnd(0) ||
+                                j == bx.smallEnd(1) ||
+                                i == bx.bigEnd(0) ||
+                                j == bx.bigEnd(1)) {
+                                crse(i,j,n) = amrex::Real(0.);
                             } else {
                                 restrict_nd(i,j,n,crse,fine);
                             }

From 1649206c999926601416ab728f83f53b00a1fc5f Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Fri, 27 Feb 2026 08:45:24 +0100
Subject: [PATCH 3/5] siampp26

---
 src/mg_solver/HpMultiGrid.cpp |  4 ++--
 src/utils/MultiBuffer.cpp     | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/mg_solver/HpMultiGrid.cpp b/src/mg_solver/HpMultiGrid.cpp
index d81f3efd98..cd8d2cbcbf 100644
--- a/src/mg_solver/HpMultiGrid.cpp
+++ b/src/mg_solver/HpMultiGrid.cpp
@@ -960,8 +960,8 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, Array3<amrex::Real> cons
                     if (system_type == 1) {
                         restrict_nd(i,j,n,res[ilev+1],rescor[ilev]);
                     } else if (system_type == 2) {
-                        restrict_cc(i,j,0,res[ilev+1],rescor[ilev]);
-                        restrict_cc(i,j,1,res[ilev+1],rescor[ilev]);
+                        restrict_nd(i,j,0,res[ilev+1],rescor[ilev]);
+                        restrict_nd(i,j,1,res[ilev+1],rescor[ilev]);
                     } else {
                         restrict_nd(i,j,0,res[ilev+1],rescor[ilev]);
                     }
diff --git a/src/utils/MultiBuffer.cpp b/src/utils/MultiBuffer.cpp
index eaae5addcf..32cf9a1055 100644
--- a/src/utils/MultiBuffer.cpp
+++ b/src/utils/MultiBuffer.cpp
@@ -574,6 +574,18 @@ void MultiBuffer::get_data (int slice, MultiBeam& beams, MultiLaser& laser, int
     }
     m_datanodes[slice].m_progress = comm_progress::in_use;
     m_datanodes[slice].m_metadata_progress = comm_progress::in_use;
+
+    std::stringstream ss;
+
+    ss << "STATEDATA " << amrex::second() << " " << amrex::ParallelDescriptor::MyProc() << " ";
+    for (int i = m_datanodes.size()-1; i>=0; --i) {
+        if (i != m_datanodes.size()-1) {
+            ss << " ";
+        }
+        ss << m_datanodes[i].m_progress;
+    }
+    ss << "\n";
+    amrex::AllPrint() << ss.str();
 }
 
 void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int beam_slice,
@@ -620,6 +632,18 @@ void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int
         }
     }
 
+    std::stringstream ss;
+
+    ss << "STATEDATA " << amrex::second() << " " << amrex::ParallelDescriptor::MyProc() << " ";
+    for (int i = m_datanodes.size()-1; i>=0; --i) {
+        if (i != m_datanodes.size()-1) {
+            ss << " ";
+        }
+        ss << m_datanodes[i].m_progress;
+    }
+    ss << "\n";
+    amrex::AllPrint() << ss.str();
+
     async_progress(slice);
 }
 

From c6c6fe751328a599b4bbaa5ddd102f62f00b3033 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Fri, 27 Feb 2026 08:49:13 +0100
Subject: [PATCH 4/5] revert

---
 src/utils/MultiBuffer.cpp | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/src/utils/MultiBuffer.cpp b/src/utils/MultiBuffer.cpp
index 32cf9a1055..585d9cd649 100644
--- a/src/utils/MultiBuffer.cpp
+++ b/src/utils/MultiBuffer.cpp
@@ -576,16 +576,6 @@ void MultiBuffer::get_data (int slice, MultiBeam& beams, MultiLaser& laser, int
     m_datanodes[slice].m_metadata_progress = comm_progress::in_use;
 
     std::stringstream ss;
-
-    ss << "STATEDATA " << amrex::second() << " " << amrex::ParallelDescriptor::MyProc() << " ";
-    for (int i = m_datanodes.size()-1; i>=0; --i) {
-        if (i != m_datanodes.size()-1) {
-            ss << " ";
-        }
-        ss << m_datanodes[i].m_progress;
-    }
-    ss << "\n";
-    amrex::AllPrint() << ss.str();
 }
 
 void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int beam_slice,
@@ -632,18 +622,6 @@ void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int
         }
     }
 
-    std::stringstream ss;
-
-    ss << "STATEDATA " << amrex::second() << " " << amrex::ParallelDescriptor::MyProc() << " ";
-    for (int i = m_datanodes.size()-1; i>=0; --i) {
-        if (i != m_datanodes.size()-1) {
-            ss << " ";
-        }
-        ss << m_datanodes[i].m_progress;
-    }
-    ss << "\n";
-    amrex::AllPrint() << ss.str();
-
     async_progress(slice);
 }
 

From 6566a7af0c8ec95287d0a7e5ecd15f72c898230b Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Fri, 27 Feb 2026 08:50:36 +0100
Subject: [PATCH 5/5] revert 2

---
 src/utils/MultiBuffer.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/utils/MultiBuffer.cpp b/src/utils/MultiBuffer.cpp
index 585d9cd649..eaae5addcf 100644
--- a/src/utils/MultiBuffer.cpp
+++ b/src/utils/MultiBuffer.cpp
@@ -574,8 +574,6 @@ void MultiBuffer::get_data (int slice, MultiBeam& beams, MultiLaser& laser, int
     }
     m_datanodes[slice].m_progress = comm_progress::in_use;
     m_datanodes[slice].m_metadata_progress = comm_progress::in_use;
-
-    std::stringstream ss;
 }
 
 void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int beam_slice,