From c84bcecee73a8a735d8918e627b0959dc93d939a Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 10 Dec 2025 17:46:41 +0100 Subject: [PATCH 1/5] Remove using namespace amrex from HpMultiGrid.cpp --- src/mg_solver/HpMultiGrid.cpp | 542 ++++++++++++++++++---------------- 1 file changed, 289 insertions(+), 253 deletions(-) diff --git a/src/mg_solver/HpMultiGrid.cpp b/src/mg_solver/HpMultiGrid.cpp index ba9a52d9de..f6c6bbb3dc 100644 --- a/src/mg_solver/HpMultiGrid.cpp +++ b/src/mg_solver/HpMultiGrid.cpp @@ -9,8 +9,6 @@ #include "utils/GPUUtil.H" #include -using namespace amrex; - namespace hpmg { namespace { @@ -19,9 +17,9 @@ namespace { constexpr int n_cell_single = 32; // switch to single block when box is smaller than this #endif -Box valid_domain_box (Box const& domain) +amrex::Box valid_domain_box (amrex::Box const& domain) { - return domain.cellCentered() ? domain : amrex::grow(domain, IntVect(-1,-1,0)); + return domain.cellCentered() ? domain : amrex::grow(domain, amrex::IntVect(-1,-1,0)); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -30,9 +28,9 @@ Box valid_domain_box (Box const& domain) template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void restrict_cc (int i, int j, int n, Array4 const& crse, Array4 const& fine) +void restrict_cc (int i, int j, int n, amrex::Array4 const& crse, amrex::Array4 const& fine) { - crse(i,j,0,n) = Real(0.25)*(fine(2*i ,2*j ,0,n) + + crse(i,j,0,n) = amrex::Real(0.25)*(fine(2*i ,2*j ,0,n) + fine(2*i+1,2*j ,0,n) + fine(2*i ,2*j+1,0,n) + fine(2*i+1,2*j+1,0,n)); @@ -40,22 +38,22 @@ void restrict_cc (int i, int j, int n, Array4 const& crse, Array4 const& f template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void restrict_nd (int i, int j, int n, Array4 const& crse, Array4 const& fine) +void restrict_nd (int i, int j, int n, amrex::Array4 const& crse, amrex::Array4 const& fine) { - crse(i,j,0,n) = Real(1./16.) * (fine(2*i-1,2*j-1,0,n) + - Real(2.)*fine(2*i ,2*j-1,0,n) + + crse(i,j,0,n) = amrex::Real(1./16.) * (fine(2*i-1,2*j-1,0,n) + + amrex::Real(2.)*fine(2*i ,2*j-1,0,n) + fine(2*i+1,2*j-1,0,n) + - Real(2.)*fine(2*i-1,2*j ,0,n) + - Real(4.)*fine(2*i ,2*j ,0,n) + - Real(2.)*fine(2*i+1,2*j ,0,n) + + amrex::Real(2.)*fine(2*i-1,2*j ,0,n) + + amrex::Real(4.)*fine(2*i ,2*j ,0,n) + + amrex::Real(2.)*fine(2*i+1,2*j ,0,n) + fine(2*i-1,2*j+1,0,n) + - Real(2.)*fine(2*i ,2*j+1,0,n) + + amrex::Real(2.)*fine(2*i ,2*j+1,0,n) + fine(2*i+1,2*j+1,0,n)); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void interpadd_cc (int i, int j, int n, Array4 const& fine, Array4 const& crse) +void interpadd_cc (int i, int j, int n, amrex::Array4 const& fine, amrex::Array4 const& crse) { int ic = amrex::coarsen(i,2); int jc = amrex::coarsen(j,2); @@ -64,7 +62,7 @@ void interpadd_cc (int i, int j, int n, Array4 const& fine, Array4 const& template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void interpadd_nd (int i, int j, int n, Array4 const& fine, Array4 const& crse) +void interpadd_nd (int i, int j, int n, amrex::Array4 const& fine, amrex::Array4 const& crse) { int ic = amrex::coarsen(i,2); int jc = amrex::coarsen(j,2); @@ -74,13 +72,13 @@ void interpadd_nd (int i, int j, int n, Array4 const& fine, Array4 const& fine(i,j,0,n) += (crse(ic ,jc ,0,n) + crse(ic+1,jc ,0,n) + crse(ic ,jc+1,0,n) + - crse(ic+1,jc+1,0,n))*Real(0.25); + crse(ic+1,jc+1,0,n))*amrex::Real(0.25); } else if (i_is_odd) { fine(i,j,0,n) += (crse(ic ,jc,0,n) + - crse(ic+1,jc,0,n))*Real(0.5); + crse(ic+1,jc,0,n))*amrex::Real(0.5); } else if (j_is_odd) { fine(i,j,0,n) += (crse(ic,jc ,0,n) + - crse(ic,jc+1,0,n))*Real(0.5); + crse(ic,jc+1,0,n))*amrex::Real(0.5); } else { fine(i,j,0,n) += crse(ic,jc,0,n); } @@ -89,8 +87,8 @@ void interpadd_nd (int i, int j, int n, Array4 const& fine, Array4 const& // out of place version used before shared memory gsrb template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void interpcpy_cc (int i, int j, int n, Array4 const& fine_in, Array4 const& crse, - Array4 const& fine_out) +void interpcpy_cc (int i, int j, int n, amrex::Array4 const& fine_in, + amrex::Array4 const& crse, amrex::Array4 const& fine_out) { int ic = amrex::coarsen(i,2); int jc = amrex::coarsen(j,2); @@ -99,8 +97,8 @@ void interpcpy_cc (int i, int j, int n, Array4 const& fine_in, Array4 cons template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void interpcpy_nd (int i, int j, int n, Array4 const& fine_in, Array4 const& crse, - Array4 const& fine_out) +void interpcpy_nd (int i, int j, int n, amrex::Array4 const& fine_in, + amrex::Array4 const& crse, amrex::Array4 const& fine_out) { int ic = amrex::coarsen(i,2); int jc = amrex::coarsen(j,2); @@ -110,20 +108,20 @@ void interpcpy_nd (int i, int j, int n, Array4 const& fine_in, Array4 cons fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic ,jc ,0,n) + crse(ic+1,jc ,0,n) + crse(ic ,jc+1,0,n) + - crse(ic+1,jc+1,0,n))*Real(0.25); + crse(ic+1,jc+1,0,n))*amrex::Real(0.25); } else if (i_is_odd) { fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic ,jc,0,n) + - crse(ic+1,jc,0,n))*Real(0.5); + crse(ic+1,jc,0,n))*amrex::Real(0.5); } else if (j_is_odd) { fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic,jc ,0,n) + - crse(ic,jc+1,0,n))*Real(0.5); + crse(ic,jc+1,0,n))*amrex::Real(0.5); } else { fine_out(i,j,0,n) = fine_in(i,j,0,n) + crse(ic,jc,0,n); } } -void restriction (Box const& box, Array4 const& crse, Array4 const& fine, - int num_comps) +void restriction (amrex::Box const& box, amrex::Array4 const& crse, + amrex::Array4 const& fine, int num_comps) { if (box.cellCentered()) { hpmg::ParallelFor(to2D(box), num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept @@ -139,9 +137,10 @@ void restriction (Box const& box, Array4 const& crse, Array4 c } } -void interpolation_outofplace (Box const& box, Array4 const& fine_in, - Array4 const& crse, Array4 const& fine_out, - int num_comps) +void interpolation_outofplace (amrex::Box const& box, + amrex::Array4 const& fine_in, + amrex::Array4 const& crse, + amrex::Array4 const& fine_out, int num_comps) { if (box.cellCentered()) { hpmg::ParallelFor(to2D(box), num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept @@ -162,21 +161,21 @@ void interpolation_outofplace (Box const& box, Array4 const& fine_in //////////////////////////////////////////////////////////////////////////////////////////////////// AMREX_GPU_DEVICE AMREX_FORCE_INLINE -Real laplacian (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Real facx, Real facy) +amrex::Real laplacian (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, + amrex::Array4 const& phi, amrex::Real facx, amrex::Real facy) { - Real lap = Real(-2.)*(facx+facy)*phi(i,j,0,n); + amrex::Real lap = amrex::Real(-2.)*(facx+facy)*phi(i,j,0,n); if (i == ilo) { - lap += facx * (Real(4./3.)*phi(i+1,j,0,n) - Real(2.)*phi(i,j,0,n)); + lap += facx * (amrex::Real(4./3.)*phi(i+1,j,0,n) - amrex::Real(2.)*phi(i,j,0,n)); } else if (i == ihi) { - lap += facx * (Real(4./3.)*phi(i-1,j,0,n) - Real(2.)*phi(i,j,0,n)); + lap += facx * (amrex::Real(4./3.)*phi(i-1,j,0,n) - amrex::Real(2.)*phi(i,j,0,n)); } else { lap += facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n)); } if (j == jlo) { - lap += facy * (Real(4./3.)*phi(i,j+1,0,n) - Real(2.)*phi(i,j,0,n)); + lap += facy * (amrex::Real(4./3.)*phi(i,j+1,0,n) - amrex::Real(2.)*phi(i,j,0,n)); } else if (j == jhi) { - lap += facy * (Real(4./3.)*phi(i,j-1,0,n) - Real(2.)*phi(i,j,0,n)); + lap += facy * (amrex::Real(4./3.)*phi(i,j-1,0,n) - amrex::Real(2.)*phi(i,j,0,n)); } else { lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n)); } @@ -184,52 +183,57 @@ Real laplacian (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -Real residual1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Real rhs, Real acf, Real facx, Real facy) +amrex::Real residual1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, + amrex::Array4 const& phi, amrex::Real rhs, amrex::Real acf, + amrex::Real facx, amrex::Real facy) { - Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy); + amrex::Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy); return rhs + acf*phi(i,j,0,n) - lap; } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -Real residual2r (int i, int j, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Real rhs, Real acf_r, Real acf_i, - Real facx, Real facy) +amrex::Real residual2r (int i, int j, int ilo, int jlo, int ihi, int jhi, + amrex::Array4 const& phi, amrex::Real rhs, amrex::Real acf_r, + amrex::Real acf_i, + amrex::Real facx, amrex::Real facy) { - Real lap = laplacian(i,j,0,ilo,jlo,ihi,jhi,phi,facx,facy); + amrex::Real lap = laplacian(i,j,0,ilo,jlo,ihi,jhi,phi,facx,facy); return rhs + acf_r*phi(i,j,0,0) - acf_i*phi(i,j,0,1) - lap; } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -Real residual2i (int i, int j, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Real rhs, Real acf_r, Real acf_i, - Real facx, Real facy) +amrex::Real residual2i (int i, int j, int ilo, int jlo, int ihi, int jhi, + amrex::Array4 const& phi, amrex::Real rhs, amrex::Real acf_r, + amrex::Real acf_i, + amrex::Real facx, amrex::Real facy) { - Real lap = laplacian(i,j,1,ilo,jlo,ihi,jhi,phi,facx,facy); + amrex::Real lap = laplacian(i,j,1,ilo,jlo,ihi,jhi,phi,facx,facy); return rhs + acf_i*phi(i,j,0,0) + acf_r*phi(i,j,0,1) - lap; } AMREX_GPU_DEVICE AMREX_FORCE_INLINE -Real residual3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Real rhs, Real facx, Real facy) +amrex::Real residual3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, + amrex::Array4 const& phi, amrex::Real rhs, amrex::Real facx, + amrex::Real facy) { - Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy); + amrex::Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy); return rhs - lap; } #if !defined(AMREX_USE_CUDA) && !defined(AMREX_USE_HIP) && defined(AMREX_USE_GPU) // res = rhs - L(phi) -void compute_residual (Box const& box, Array4 const& res, - Array4 const& phi, Array4 const& rhs, - Array4 const& acf, Real dx, Real dy, +void compute_residual (amrex::Box const& box, amrex::Array4 const& res, + amrex::Array4 const& phi, + amrex::Array4 const& rhs, + amrex::Array4 const& acf, amrex::Real dx, amrex::Real dy, int system_type) { int const ilo = box.smallEnd(0); int const jlo = box.smallEnd(1); int const ihi = box.bigEnd(0); int const jhi = box.bigEnd(1); - Real facx = Real(1.)/(dx*dx); - Real facy = Real(1.)/(dy*dy); + amrex::Real facx = amrex::Real(1.)/(dx*dx); + amrex::Real facy = amrex::Real(1.)/(dy*dy); if (system_type == 1) { hpmg::ParallelFor(to2D(valid_domain_box(box)), [=] AMREX_GPU_DEVICE (int i, int j) noexcept @@ -267,29 +271,30 @@ void compute_residual (Box const& box, Array4 const& res, template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void gs1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Real rhs, Real acf, Real facx, Real facy) + amrex::Array4 const& phi, amrex::Real rhs, amrex::Real acf, amrex::Real facx, + amrex::Real facy) { - Real lap; - Real c0 = -(acf+Real(2.)*(facx+facy)); + amrex::Real lap; + amrex::Real c0 = -(acf+amrex::Real(2.)*(facx+facy)); if (is_cell_centered && i == ilo) { - lap = facx * Real(4./3.)*phi(i+1,j,0,n); - c0 -= Real(2.)*facx; + lap = facx * amrex::Real(4./3.)*phi(i+1,j,0,n); + c0 -= amrex::Real(2.)*facx; } else if (is_cell_centered && i == ihi) { - lap = facx * Real(4./3.)*phi(i-1,j,0,n); - c0 -= Real(2.)*facx; + lap = facx * amrex::Real(4./3.)*phi(i-1,j,0,n); + c0 -= amrex::Real(2.)*facx; } else { lap = facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n)); } if (is_cell_centered && j == jlo) { - lap += facy * Real(4./3.)*phi(i,j+1,0,n); - c0 -= Real(2.)*facy; + lap += facy * amrex::Real(4./3.)*phi(i,j+1,0,n); + c0 -= amrex::Real(2.)*facy; } else if (is_cell_centered && j == jhi) { - lap += facy * Real(4./3.)*phi(i,j-1,0,n); - c0 -= Real(2.)*facy; + lap += facy * amrex::Real(4./3.)*phi(i,j-1,0,n); + c0 -= amrex::Real(2.)*facy; } else { lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n)); } - const Real c0_inv = Real(1.) / c0; + const amrex::Real c0_inv = amrex::Real(1.) / c0; phi(i,j,0,n) = (rhs - lap) * c0_inv; } @@ -298,37 +303,37 @@ void gs1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void gs2 (int i, int j, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Real rhs_r, Real rhs_i, - Real ar, Real ai, Real facx, Real facy) + amrex::Array4 const& phi, amrex::Real rhs_r, amrex::Real rhs_i, + amrex::Real ar, amrex::Real ai, amrex::Real facx, amrex::Real facy) { - Real lap[2]; - Real c0 = Real(-2.)*(facx+facy); + amrex::Real lap[2]; + amrex::Real c0 = amrex::Real(-2.)*(facx+facy); if (is_cell_centered && i == ilo) { - lap[0] = facx * Real(4./3.)*phi(i+1,j,0,0); - lap[1] = facx * Real(4./3.)*phi(i+1,j,0,1); - c0 -= Real(2.)*facx; + lap[0] = facx * amrex::Real(4./3.)*phi(i+1,j,0,0); + lap[1] = facx * amrex::Real(4./3.)*phi(i+1,j,0,1); + c0 -= amrex::Real(2.)*facx; } else if (is_cell_centered && i == ihi) { - lap[0] = facx * Real(4./3.)*phi(i-1,j,0,0); - lap[1] = facx * Real(4./3.)*phi(i-1,j,0,1); - c0 -= Real(2.)*facx; + lap[0] = facx * amrex::Real(4./3.)*phi(i-1,j,0,0); + lap[1] = facx * amrex::Real(4./3.)*phi(i-1,j,0,1); + c0 -= amrex::Real(2.)*facx; } else { lap[0] = facx * (phi(i-1,j,0,0) + phi(i+1,j,0,0)); lap[1] = facx * (phi(i-1,j,0,1) + phi(i+1,j,0,1)); } if (is_cell_centered && j == jlo) { - lap[0] += facy * Real(4./3.)*phi(i,j+1,0,0); - lap[1] += facy * Real(4./3.)*phi(i,j+1,0,1); - c0 -= Real(2.)*facy; + lap[0] += facy * amrex::Real(4./3.)*phi(i,j+1,0,0); + lap[1] += facy * amrex::Real(4./3.)*phi(i,j+1,0,1); + c0 -= amrex::Real(2.)*facy; } else if (is_cell_centered && j == jhi) { - lap[0] += facy * Real(4./3.)*phi(i,j-1,0,0); - lap[1] += facy * Real(4./3.)*phi(i,j-1,0,1); - c0 -= Real(2.)*facy; + lap[0] += facy * amrex::Real(4./3.)*phi(i,j-1,0,0); + lap[1] += facy * amrex::Real(4./3.)*phi(i,j-1,0,1); + c0 -= amrex::Real(2.)*facy; } else { lap[0] += facy * (phi(i,j-1,0,0) + phi(i,j+1,0,0)); lap[1] += facy * (phi(i,j-1,0,1) + phi(i,j+1,0,1)); } - Real c[2] = {c0-ar, -ai}; - Real cmag = Real(1.)/(c[0]*c[0] + c[1]*c[1]); + amrex::Real c[2] = {c0-ar, -ai}; + amrex::Real cmag = amrex::Real(1.)/(c[0]*c[0] + c[1]*c[1]); c[0] *= cmag; c[1] *= cmag; phi(i,j,0,0) = (rhs_r-lap[0])*c[0] + (rhs_i-lap[1])*c[1]; @@ -340,42 +345,43 @@ void gs2 (int i, int j, int ilo, int jlo, int ihi, int jhi, template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void gs3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Real rhs, Real facx, Real facy) + amrex::Array4 const& phi, amrex::Real rhs, amrex::Real facx, + amrex::Real facy) { - Real lap; - Real c0 = -Real(2.)*(facx+facy); + amrex::Real lap; + amrex::Real c0 = -amrex::Real(2.)*(facx+facy); if (is_cell_centered && i == ilo) { - lap = facx * Real(4./3.)*phi(i+1,j,0,n); - c0 -= Real(2.)*facx; + lap = facx * amrex::Real(4./3.)*phi(i+1,j,0,n); + c0 -= amrex::Real(2.)*facx; } else if (is_cell_centered && i == ihi) { - lap = facx * Real(4./3.)*phi(i-1,j,0,n); - c0 -= Real(2.)*facx; + lap = facx * amrex::Real(4./3.)*phi(i-1,j,0,n); + c0 -= amrex::Real(2.)*facx; } else { lap = facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n)); } if (is_cell_centered && j == jlo) { - lap += facy * Real(4./3.)*phi(i,j+1,0,n); - c0 -= Real(2.)*facy; + lap += facy * amrex::Real(4./3.)*phi(i,j+1,0,n); + c0 -= amrex::Real(2.)*facy; } else if (is_cell_centered && j == jhi) { - lap += facy * Real(4./3.)*phi(i,j-1,0,n); - c0 -= Real(2.)*facy; + lap += facy * amrex::Real(4./3.)*phi(i,j-1,0,n); + c0 -= amrex::Real(2.)*facy; } else { lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n)); } - const Real c0_inv = Real(1.) / c0; + const amrex::Real c0_inv = amrex::Real(1.) / c0; phi(i,j,0,n) = (rhs - lap) * c0_inv; } -void gsrb (int icolor, Box const& box, Array4 const& phi, - Array4 const& rhs, Array4 const& acf, - Real dx, Real dy, int system_type) +void gsrb (int icolor, amrex::Box const& box, amrex::Array4 const& phi, + amrex::Array4 const& rhs, amrex::Array4 const& acf, + amrex::Real dx, amrex::Real dy, int system_type) { int const ilo = box.smallEnd(0); int const jlo = box.smallEnd(1); int const ihi = box.bigEnd(0); int const jhi = box.bigEnd(1); - Real facx = Real(1.)/(dx*dx); - Real facy = Real(1.)/(dy*dy); + amrex::Real facx = amrex::Real(1.)/(dx*dx); + amrex::Real facy = amrex::Real(1.)/(dy*dy); if (system_type == 1) { hpmg::ParallelFor(to2D(valid_domain_box(box)), [=] AMREX_GPU_DEVICE (int i, int j) noexcept @@ -413,9 +419,10 @@ void gsrb (int icolor, Box const& box, Array4 const& phi, // do multiple gsrb iterations in GPU shared memory with many ghost cells template -void gsrb_shared (Box const& box, Array4 const& phi_out, Array4 const& rhs, - Array4 const& acf, Array4 const& res, - Array4 const& phi_in, Real dx, Real dy) +void gsrb_shared (amrex::Box const& box, amrex::Array4 const& phi_out, + amrex::Array4 const& rhs, + amrex::Array4 const& acf, amrex::Array4 const& res, + amrex::Array4 const& phi_in, amrex::Real dx, amrex::Real dy) { constexpr int num_comps = MultiGrid::get_num_comps(system_type); constexpr int num_comps_acf = MultiGrid::get_num_comps_acf(system_type); @@ -437,11 +444,11 @@ void gsrb_shared (Box const& box, Array4 const& phi_out, Array4 const& phi_out, Array4 const& phi_out, Array4 phi_shared(phi_ptr, {tile_begin_x, tile_begin_y, 0}, + // make amrex::Array4 reference shared memory tile + amrex::Array4 phi_shared(phi_ptr, {tile_begin_x, tile_begin_y, 0}, {tile_end_x, tile_end_y, 1}, num_comps); if (zero_init) { // initialize shared memory to zero for (int s = threadIdx.x; s < num_cells_in_tile; s+=blockDim.x) { - phi_ptr[s] = Real(0.); + phi_ptr[s] = amrex::Real(0.); } } else { // initialize shared memory to phi_in inside the domain, outside zero @@ -489,7 +496,7 @@ void gsrb_shared (Box const& box, Array4 const& phi_out, Array4 const& phi_out, Array4 rhs_num[2] = {{}, {}}; - amrex::GpuArray acf_num[2] = {{}, {}}; + amrex::GpuArray rhs_num[2] = {{}, {}}; + amrex::GpuArray acf_num[2] = {{}, {}}; for (int nj=0; nj<=1; ++nj) { if (ilo_loop <= i && i <= ihi_loop && @@ -595,9 +602,11 @@ void gsrb_shared (Box const& box, Array4 const& phi_out, Array4 -void gsrb_cached (Box const& box, Array4 const& phi_out, Array4 const& rhs, - Array4 const& acf, Array4 const& res, - Array4 const& phi_in, Real dx, Real dy) +void gsrb_cached (amrex::Box const& box, amrex::Array4 const& phi_out, + amrex::Array4 const& rhs, + amrex::Array4 const& acf, + amrex::Array4 const& res, + amrex::Array4 const& phi_in, amrex::Real dx, amrex::Real dy) { constexpr int num_comps = MultiGrid::get_num_comps(system_type); constexpr int tilesize_x = 64; @@ -616,11 +625,11 @@ void gsrb_cached (Box const& box, Array4 const& phi_out, Array4 const& phi_out, Array4 const& phi_out, Array4 phi_cached(phi_ptr, {tile_begin_x, tile_begin_y, 0}, + // make amrex::Array4 reference cached memory tile + amrex::Array4 phi_cached(phi_ptr, {tile_begin_x, tile_begin_y, 0}, {tile_end_x, tile_end_y, 1}, num_comps); if (zero_init) { // initialize cached memory to zero for (int s = 0; s < tilesize_array_x * tilesize_array_y * num_comps; ++s) { - phi_ptr[s] = Real(0.); + phi_ptr[s] = amrex::Real(0.); } } else { // initialize cached memory to phi_in inside the domain, outside zero @@ -663,7 +672,7 @@ void gsrb_cached (Box const& box, Array4 const& phi_out, Array4 const& phi_out, Array4 -void gsrb_4_residual (int system_type, Box const& box, - Array4 const& phi_out, - Array4 const& rhs, - Array4 const& acf, - Array4 const& res, - Array4 const& phi_in, - Real dx, Real dy) +void gsrb_4_residual (int system_type, amrex::Box const& box, + amrex::Array4 const& phi_out, + amrex::Array4 const& rhs, + amrex::Array4 const& acf, + amrex::Array4 const& res, + amrex::Array4 const& phi_in, + amrex::Real dx, amrex::Real dy) { // This function performs the main computation for the multigrid solver. // For CUDA / HIP it is implemented to use a single GPU kernel with shared memory @@ -816,9 +825,9 @@ void gsrb_4_residual (int system_type, Box const& box, #else if (zero_init) { - Real * pcor_out = phi_out.dataPtr(); + amrex::Real * pcor_out = phi_out.dataPtr(); hpmg::ParallelFor(box.numPts()*MultiGrid::get_num_comps(system_type), - [=] AMREX_GPU_DEVICE (Long i) noexcept { pcor_out[i] = Real(0.); }); + [=] AMREX_GPU_DEVICE (Long i) noexcept { pcor_out[i] = amrex::Real(0.); }); } else { const amrex::Box valid_domain = valid_domain_box(box); hpmg::ParallelFor(to2D(box), MultiGrid::get_num_comps(system_type), @@ -826,7 +835,7 @@ void gsrb_4_residual (int system_type, Box const& box, if (valid_domain.contains(i,j,0)) { phi_out(i,j,0,n) = phi_in(i,j,0,n); } else { - phi_out(i,j,0,n) = Real(0.); + phi_out(i,j,0,n) = amrex::Real(0.); } }); } @@ -854,9 +863,9 @@ void gsrb_4_residual (int system_type, Box const& box, #endif template -void bottomsolve_gpu (Real dx0, Real dy0, Array4 const* acf, - Array4 const* res, Array4 const* cor, - Array4 const* rescor, int nlevs, int corner_offset, +void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4 const* acf, + amrex::Array4 const* res, amrex::Array4 const* cor, + amrex::Array4 const* rescor, int nlevs, int corner_offset, FGS&& fgs, FRES&& fres) { // This function performs all the operations of a vcycle in a single GPU kernel by @@ -867,15 +876,15 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4 const* acf, static_assert(n_cell_single*n_cell_single <= 1024, "n_cell_single is too big"); #if defined(AMREX_USE_DPCPP) - amrex::launch(1, 1024, Gpu::gpuStream(), + amrex::launch(1, 1024, amrex::Gpu::gpuStream(), [=] (sycl::nd_item<1> const& item) noexcept #else - amrex::launch_global<1024><<<1, 1024, 0, Gpu::gpuStream()>>>( + amrex::launch_global<1024><<<1, 1024, 0, amrex::Gpu::gpuStream()>>>( [=] AMREX_GPU_DEVICE () noexcept #endif { - Real facx = Real(1.)/(dx0*dx0); - Real facy = Real(1.)/(dy0*dy0); + amrex::Real facx = amrex::Real(1.)/(dx0*dx0); + amrex::Real facy = amrex::Real(1.)/(dy0*dy0); int lenx = cor[0].end.x - cor[0].begin.x - 2*corner_offset; int leny = cor[0].end.y - cor[0].begin.y - 2*corner_offset; int ncells = lenx*leny; @@ -893,10 +902,10 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4 const* acf, // set phi to zero if (icell < ncells) { if (system_type == 1 || system_type == 2) { - cor[ilev](i,j,0,0) = Real(0.); - cor[ilev](i,j,0,1) = Real(0.); + cor[ilev](i,j,0,0) = amrex::Real(0.); + cor[ilev](i,j,0,1) = amrex::Real(0.); } else { - cor[ilev](i,j,0,0) = Real(0.); + cor[ilev](i,j,0,0) = amrex::Real(0.); } } HPMG_SYNCTHREADS; @@ -954,8 +963,8 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4 const* acf, } HPMG_SYNCTHREADS; - facx *= Real(0.25); - facy *= Real(0.25); + facx *= amrex::Real(0.25); + facy *= amrex::Real(0.25); } // bottom @@ -964,10 +973,10 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4 const* acf, const int ilev = nlevs-1; if (icell < ncells) { if (system_type == 1 || system_type == 2) { - cor[ilev](i,j,0,0) = Real(0.); - cor[ilev](i,j,0,1) = Real(0.); + cor[ilev](i,j,0,0) = amrex::Real(0.); + cor[ilev](i,j,0,1) = amrex::Real(0.); } else { - cor[ilev](i,j,0,0) = Real(0.); + cor[ilev](i,j,0,0) = amrex::Real(0.); } } HPMG_SYNCTHREADS; @@ -992,8 +1001,8 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4 const* acf, lenx = cor[ilev].end.x - cor[ilev].begin.x - 2*corner_offset; leny = cor[ilev].end.y - cor[ilev].begin.y - 2*corner_offset; ncells = lenx*leny; - facx *= Real(4.); - facy *= Real(4.); + facx *= amrex::Real(4.); + facy *= amrex::Real(4.); if (icell < ncells) { j = icell / lenx; @@ -1042,20 +1051,21 @@ void bottomsolve_gpu (Real dx0, Real dy0, Array4 const* acf, // Initialization: ///////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////// -MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type) - : m_system_type(a_system_type), m_dx(dx), m_dy(dy) +MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a_system_type) + : m_system_type(a_system_type), + m_num_comps(get_num_comps(a_system_type)), + m_num_comps_acf(get_num_comps_acf(a_system_type)), + m_dx(dx), + m_dy(dy) { - m_num_comps = get_num_comps(m_system_type); - m_num_comps_acf = get_num_comps_acf(m_system_type); - - IntVect const a_domain_len = a_domain.length(); + amrex::IntVect const a_domain_len = a_domain.length(); AMREX_ALWAYS_ASSERT(a_domain_len[2] == 1 && a_domain.cellCentered() && a_domain_len[0]%2 == a_domain_len[1]%2); - IndexType const index_type = (a_domain_len[0]%2 == 0) ? - IndexType::TheCellType() : IndexType(IntVect(1,1,0)); - m_domain.push_back(amrex::makeSlab(Box{{0,0,0}, a_domain_len-1, index_type}, 2, 0)); + amrex::IndexType const index_type = (a_domain_len[0]%2 == 0) ? + amrex::IndexType::TheCellType() : amrex::IndexType(amrex::IntVect(1,1,0)); + m_domain.push_back(amrex::makeSlab(amrex::Box{{0,0,0}, a_domain_len-1, index_type}, 2, 0)); if (!index_type.cellCentered()) { m_domain[0].growHi(0,2).growHi(1,2); m_boundary_condition_offset = 1.; @@ -1064,18 +1074,19 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type) m_boundary_condition_offset = 0.5; m_boundary_condition_factor = 8./3.; } - IntVect const min_width = index_type.cellCentered() ? IntVect(2,2,1) : IntVect(4,4,1); + amrex::IntVect const min_width = index_type.cellCentered() ? + amrex::IntVect(2,2,1) : amrex::IntVect(4,4,1); for (int i = 0; i < 30; ++i) { - if (m_domain.back().coarsenable(IntVect(2,2,1), min_width)) { - m_domain.push_back(amrex::coarsen(m_domain.back(),IntVect(2,2,1))); + if (m_domain.back().coarsenable(amrex::IntVect(2,2,1), min_width)) { + m_domain.push_back(amrex::coarsen(m_domain.back(),amrex::IntVect(2,2,1))); } else { break; } } - m_max_level = m_domain.size()-1; + m_max_level = static_cast(m_domain.size()-1); #if defined(AMREX_USE_GPU) auto r = std::find_if(std::begin(m_domain), std::end(m_domain), - [=] (Box const& b) -> bool + [=] (amrex::Box const& b) -> bool { return b.volume() <= n_cell_single*n_cell_single; }); m_single_block_level_begin = std::distance(std::begin(m_domain), r); m_single_block_level_begin = std::max(1, m_single_block_level_begin); @@ -1087,7 +1098,8 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type) << " cannot be coarsened enough times to be solved efficiently.\n" << "hpmg: Size of the final MG level: " << m_domain[m_max_level].length(0) << " " << m_domain[m_max_level].length(1) << ".\n" - << "hpmg: Please consider using a domain size of the form '2^n', '3*2^n', '2^n+1' or '3*n^2+1'.\n"; + << "hpmg: Please consider using a domain size of the form " + << "'2^n', '3*2^n', '2^n+1' or '3*n^2+1'.\n"; } #else m_single_block_level_begin = m_max_level; @@ -1120,7 +1132,7 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type) } else { m_res.emplace_back(m_domain[ilev], m_num_comps); if (!index_type.cellCentered()) { - m_res[ilev].template setVal(0); + m_res[ilev].template setVal(0); } if (ilev >= m_single_block_level_begin) { m_array4.push_back(m_res[ilev].array()); @@ -1133,7 +1145,7 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type) for (int ilev = 0; ilev < m_num_mg_levels; ++ilev) { m_cor.emplace_back(m_domain[ilev], m_num_comps); if (!index_type.cellCentered()) { - m_cor[ilev].template setVal(0); + m_cor[ilev].template setVal(0); } if (ilev >= m_single_block_level_begin) { m_array4.push_back(m_cor[ilev].array()); @@ -1145,14 +1157,14 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type) for (int ilev = 0; ilev < m_num_mg_levels; ++ilev) { m_rescor.emplace_back(m_domain[ilev], m_num_comps); if (!index_type.cellCentered()) { - m_rescor[ilev].template setVal(0); + m_rescor[ilev].template setVal(0); } if (ilev >= m_single_block_level_begin) { m_array4.push_back(m_rescor[ilev].array()); } } - // Copy Array4s to GPU for bottomsolve_gpu() + // Copy amrex::Array4s to GPU for bottomsolve_gpu() if (!m_array4.empty()) { m_array4.copyToDeviceAsync(); m_acf_a = m_array4.data(); @@ -1167,14 +1179,14 @@ MultiGrid::MultiGrid (Real dx, Real dy, Box a_domain, int a_system_type) //////////////////////////////////////////////////////////////////////////////////////////////////// void -MultiGrid::solve1 (FArrayBox& a_sol, FArrayBox const& a_rhs, FArrayBox const& a_acf, - Real const tol_rel, Real const tol_abs, int const nummaxiter, - int const verbose) +MultiGrid::solve1 (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs, + amrex::FArrayBox const& a_acf, amrex::Real const tol_rel, + amrex::Real const tol_abs, int const nummaxiter, int const verbose) { HIPACE_PROFILE("hpmg::MultiGrid::solve1()"); AMREX_ALWAYS_ASSERT(m_system_type == 1); - FArrayBox afab(center_box(a_acf.box(), m_domain.front()), 1, a_acf.dataPtr()); + amrex::FArrayBox afab(center_box(a_acf.box(), m_domain.front()), 1, a_acf.dataPtr()); auto const& array_m_acf = m_acf[0].array(); auto const& array_a_acf = afab.const_array(); @@ -1290,8 +1302,8 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs, } void -MultiGrid::solve3 (FArrayBox& a_sol, FArrayBox const& a_rhs, - Real const tol_rel, Real const tol_abs, int const nummaxiter, +MultiGrid::solve3 (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs, + amrex::Real const tol_rel, amrex::Real const tol_abs, int const nummaxiter, int const verbose) { HIPACE_PROFILE("hpmg::MultiGrid::solve3()"); @@ -1305,14 +1317,16 @@ MultiGrid::solve3 (FArrayBox& a_sol, FArrayBox const& a_rhs, //////////////////////////////////////////////////////////////////////////////////////////////////// void -MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs, - Real const tol_rel, Real const tol_abs, int const nummaxiter, +MultiGrid::solve_doit (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs, + amrex::Real const tol_rel, amrex::Real const tol_abs, int const nummaxiter, int const verbose) { AMREX_ALWAYS_ASSERT(a_sol.nComp() >= m_num_comps && a_rhs.nComp() >= m_num_comps); - m_sol = FArrayBox(center_box(a_sol.box(), m_domain.front()), m_num_comps, a_sol.dataPtr()); - m_rhs = FArrayBox(center_box(a_rhs.box(), m_domain.front()), m_num_comps, a_rhs.dataPtr()); + m_sol = amrex::FArrayBox(center_box(a_sol.box(), m_domain.front()), m_num_comps, + a_sol.dataPtr()); + m_rhs = amrex::FArrayBox(center_box(a_rhs.box(), m_domain.front()), m_num_comps, + a_rhs.dataPtr()); // sol: initial solution guess // rhs: right hand side of differential equation to solve @@ -1327,10 +1341,10 @@ MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs, m_acf[0].const_array(), m_rescor[0].array(), m_sol.array(), m_dx, m_dy); // Reduction on the right-hand side and residual to get convergence criteria - Real resnorm0, rhsnorm0; + amrex::Real resnorm0, rhsnorm0; { - ReduceOps reduce_op; - ReduceData reduce_data(reduce_op); + amrex::ReduceOps reduce_op; + amrex::ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; const auto& array_res = m_rescor[0].const_array(); const auto& array_rhs = m_rhs.const_array(); @@ -1349,7 +1363,7 @@ MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs, << "hpmg: Initial residual (resid0) = " << resnorm0 << "\n"; } - Real max_norm; + amrex::Real max_norm; std::string norm_name; if (rhsnorm0 >= resnorm0) { norm_name = "bnorm"; @@ -1358,28 +1372,26 @@ MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs, norm_name = "resid0"; max_norm = resnorm0; } - const Real res_target = std::max(tol_abs, std::max(tol_rel,Real(1.e-16))*max_norm); + const amrex::Real res_target = std::max(tol_abs,std::max(tol_rel,amrex::Real(1.e-16))*max_norm); if (resnorm0 <= res_target) { if (verbose >= 1) { amrex::Print() << "hpmg: No iterations needed\n"; } } else { - Real norminf = 0.; + amrex::Real norminf = 0.; bool converged = true; for (int iter = 0; iter < nummaxiter; ++iter) { - converged = false; - // do one vcycle iteration with the fist 4 Gauss-Seidel iterations omitted // from the beginning and instead done after the vcycle to also get the residual vcycle(); // check the residual for convergence - Real const* pres0 = m_rescor[0].dataPtr(); - norminf = Reduce::Max(m_domain[0].numPts()*m_num_comps, - [=] AMREX_GPU_DEVICE (Long i) -> Real + amrex::Real const* pres0 = m_rescor[0].dataPtr(); + norminf = amrex::Reduce::Max(m_domain[0].numPts()*m_num_comps, + [=] AMREX_GPU_DEVICE (amrex::Long i) -> amrex::Real { return std::abs(pres0[i]); }); @@ -1399,7 +1411,7 @@ MultiGrid::solve_doit (FArrayBox& a_sol, FArrayBox const& a_rhs, } AMREX_ALWAYS_ASSERT_WITH_MESSAGE( - converged || (norminf <= Real(1.e20)*max_norm), + converged || (norminf <= amrex::Real(1.e20)*max_norm), "hpmg: Failing to converge after " + amrex::ToString(iter+1) + " iterations." " resid, resid/" + norm_name + " = " + amrex::ToString(norminf) + ", " + amrex::ToString(norminf/max_norm) + "\n" @@ -1434,14 +1446,14 @@ MultiGrid::vcycle () }; if (m_cuda_graph_vcycle.count(key) == 0) { - cudaStreamBeginCapture(Gpu::gpuStream(), cudaStreamCaptureModeGlobal); + cudaStreamBeginCapture(amrex::Gpu::gpuStream(), cudaStreamCaptureModeGlobal); #endif for (int ilev = 0; ilev < m_single_block_level_begin; ++ilev) { - Real fac = static_cast(1 << ilev); - Real dx = m_dx * fac; - Real dy = m_dy * fac; + amrex::Real fac = static_cast(1 << ilev); + amrex::Real dx = m_dx * fac; + amrex::Real dy = m_dy * fac; if (ilev > 0) { // cor and residual on ilev 0 are already calculated before the vcycle is started @@ -1460,16 +1472,17 @@ MultiGrid::vcycle () // interpolate residual to next level // res[ilev+1] = R(rescor[ilev]) - restriction(m_domain[ilev+1], m_res[ilev+1].array(), m_rescor[ilev].const_array(), m_num_comps); + restriction(m_domain[ilev+1], m_res[ilev+1].array(), m_rescor[ilev].const_array(), + m_num_comps); } bottomsolve(); for (int ilev = m_single_block_level_begin-1; ilev >= 0; --ilev) { - Real fac = static_cast(1 << ilev); - Real dx = m_dx * fac; - Real dy = m_dy * fac; + amrex::Real fac = static_cast(1 << ilev); + amrex::Real dx = m_dx * fac; + amrex::Real dy = m_dy * fac; // interpolate solution from previous level to phi @@ -1501,11 +1514,11 @@ MultiGrid::vcycle () m_acf[0].const_array(), m_rescor[0].array(), m_sol.array(), m_dx, m_dy); #if defined(AMREX_USE_CUDA) - cudaStreamEndCapture(Gpu::gpuStream(), &m_cuda_graph_vcycle[key].first); + cudaStreamEndCapture(amrex::Gpu::gpuStream(), &m_cuda_graph_vcycle[key].first); cudaGraphInstantiate(&m_cuda_graph_vcycle[key].second, m_cuda_graph_vcycle[key].first, NULL, NULL, 0); } - cudaGraphLaunch(m_cuda_graph_vcycle[key].second, Gpu::gpuStream()); + cudaGraphLaunch(m_cuda_graph_vcycle[key].second, amrex::Gpu::gpuStream()); #endif } @@ -1513,74 +1526,95 @@ void MultiGrid::bottomsolve () { constexpr int nsweeps = 16; - Real fac = static_cast(1 << m_single_block_level_begin); - Real dx0 = m_dx * fac; - Real dy0 = m_dy * fac; + amrex::Real fac = static_cast(1 << m_single_block_level_begin); + amrex::Real dx0 = m_dx * fac; + amrex::Real dy0 = m_dy * fac; #if defined(AMREX_USE_GPU) if (m_use_single_block_kernel) { int nlevs = m_num_single_block_levels; int const corner_offset = m_domain[0].cellCentered() ? 0 : 1; if (m_system_type == 1) { - bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, corner_offset, + bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, + corner_offset, [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Array4 const& rhs, - Array4 const& acf, Real facx, Real facy) + amrex::Array4 const& phi, + amrex::Array4 const& rhs, + amrex::Array4 const& acf, + amrex::Real facx, amrex::Real facy) { - Real a = acf(i,j,0); + amrex::Real a = acf(i,j,0); gs1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a, facx, facy); gs1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a, facx, facy); }, - [] AMREX_GPU_DEVICE (int i, int j, Array4 const& res, - int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Array4 const& rhs, - Array4 const& acf, Real facx, Real facy) + [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4 const& res, + int ilo, int jlo, int ihi, int jhi, + amrex::Array4 const& phi, + amrex::Array4 const& rhs, + amrex::Array4 const& acf, + amrex::Real facx, amrex::Real facy) { - Real a = acf(i,j,0); - res(i,j,0,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a, facx, facy); - res(i,j,0,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a, facx, facy); + amrex::Real a = acf(i,j,0); + res(i,j,0,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a, + facx, facy); + res(i,j,0,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a, + facx, facy); }); } else if (m_system_type == 2) { - bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, corner_offset, + bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, + corner_offset, [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Array4 const& rhs, - Array4 const& acf, Real facx, Real facy) + amrex::Array4 const& phi, + amrex::Array4 const& rhs, + amrex::Array4 const& acf, + amrex::Real facx, amrex::Real facy) { - Real ar = acf(i,j,0,0); - Real ai = acf(i,j,0,1); - gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), rhs(i,j,0,1), ar, ai, facx, facy); + amrex::Real ar = acf(i,j,0,0); + amrex::Real ai = acf(i,j,0,1); + gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), rhs(i,j,0,1), ar, ai, + facx, facy); }, - [] AMREX_GPU_DEVICE (int i, int j, Array4 const& res, - int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Array4 const& rhs, - Array4 const& acf, Real facx, Real facy) + [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4 const& res, + int ilo, int jlo, int ihi, int jhi, + amrex::Array4 const& phi, + amrex::Array4 const& rhs, + amrex::Array4 const& acf, + amrex::Real facx, amrex::Real facy) { - Real ar = acf(i,j,0,0); - Real ai = acf(i,j,0,1); - res(i,j,0,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), ar, ai, facx, facy); - res(i,j,0,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), ar, ai, facx, facy); + amrex::Real ar = acf(i,j,0,0); + amrex::Real ai = acf(i,j,0,1); + res(i,j,0,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), ar, ai, + facx, facy); + res(i,j,0,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), ar, ai, + facx, facy); }); } else { - bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, corner_offset, + bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, + corner_offset, [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Array4 const& rhs, - Array4 const&, Real facx, Real facy) + amrex::Array4 const& phi, + amrex::Array4 const& rhs, + amrex::Array4 const&, amrex::Real facx, + amrex::Real facy) { gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy); }, - [] AMREX_GPU_DEVICE (int i, int j, Array4 const& res, - int ilo, int jlo, int ihi, int jhi, - Array4 const& phi, Array4 const& rhs, - Array4 const&, Real facx, Real facy) + [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4 const& res, + int ilo, int jlo, int ihi, int jhi, + amrex::Array4 const& phi, + amrex::Array4 const& rhs, + amrex::Array4 const&, amrex::Real facx, + amrex::Real facy) { - res(i,j,0,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy); + res(i,j,0,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), + facx, facy); }); } } else #endif { const int ilev = m_single_block_level_begin; - m_cor[ilev].setVal(Real(0.)); + m_cor[ilev].setVal(amrex::Real(0.)); // Use numsweeps equal to the box length rounded up to an even number for large boxes int numsweeps = std::max(nsweeps, (m_cor[ilev].box().length().max() + 1) / 2 * 2); for (int is = 0; is < numsweeps; ++is) { @@ -1598,13 +1632,13 @@ MultiGrid::bottomsolve () #if defined(AMREX_USE_GPU) namespace { template - void avgdown_acf (Array4 const* acf, int ncomp, int nlevels, F&& f) + void avgdown_acf (amrex::Array4 const* acf, int ncomp, int nlevels, F&& f) { #if defined(AMREX_USE_DPCPP) - amrex::launch(1, 1024, Gpu::gpuStream(), + amrex::launch(1, 1024, amrex::Gpu::gpuStream(), [=] (sycl::nd_item<1> const& item) noexcept #else - amrex::launch_global<1024><<<1, 1024, 0, Gpu::gpuStream()>>>( + amrex::launch_global<1024><<<1, 1024, 0, amrex::Gpu::gpuStream()>>>( [=] AMREX_GPU_DEVICE () noexcept #endif { @@ -1640,7 +1674,7 @@ MultiGrid::average_down_acoef () { #if defined(AMREX_USE_CUDA) if (!m_cuda_graph_acf_created) { - cudaStreamBeginCapture(Gpu::gpuStream(), cudaStreamCaptureModeGlobal); + cudaStreamBeginCapture(amrex::Gpu::gpuStream(), cudaStreamCaptureModeGlobal); #endif for (int ilev = 1; ilev <= m_single_block_level_begin; ++ilev) { @@ -1665,21 +1699,23 @@ MultiGrid::average_down_acoef () if (m_num_single_block_levels > 1) { if (m_domain[0].cellCentered()) { avgdown_acf(m_acf_a, m_num_comps_acf, m_num_single_block_levels, - [] AMREX_GPU_DEVICE (int i, int j, int n, Array4 const& crse, - Array4 const& fine) noexcept + [] AMREX_GPU_DEVICE (int i, int j, int n, + amrex::Array4 const& crse, + amrex::Array4 const& fine) noexcept { restrict_cc(i,j,n,crse,fine); }); } else { avgdown_acf(m_acf_a, m_num_comps_acf, m_num_single_block_levels, - [] AMREX_GPU_DEVICE (int i, int j, int n, Array4 const& crse, - Array4 const& fine) noexcept + [] AMREX_GPU_DEVICE (int i, int j, int n, + amrex::Array4 const& crse, + amrex::Array4 const& fine) noexcept { if (i == crse.begin.x || j == crse.begin.y || i == crse.end.x-1 || j == crse.end.y-1) { - crse(i,j,0,n) = Real(0.); + crse(i,j,0,n) = amrex::Real(0.); } else { restrict_nd(i,j,n,crse,fine); } @@ -1689,11 +1725,11 @@ MultiGrid::average_down_acoef () #endif #if defined(AMREX_USE_CUDA) - cudaStreamEndCapture(Gpu::gpuStream(), &m_cuda_graph_acf); + cudaStreamEndCapture(amrex::Gpu::gpuStream(), &m_cuda_graph_acf); cudaGraphInstantiate(&m_cuda_graph_exe_acf, m_cuda_graph_acf, NULL, NULL, 0); m_cuda_graph_acf_created = true; } - cudaGraphLaunch(m_cuda_graph_exe_acf, Gpu::gpuStream()); + cudaGraphLaunch(m_cuda_graph_exe_acf, amrex::Gpu::gpuStream()); #endif } From ed521dfc3c0d7c967e9dea0f3fd613638e272812 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Mon, 12 Jan 2026 17:03:38 +0100 Subject: [PATCH 2/5] Use Array3 in HPMG solver --- src/mg_solver/HpMultiGrid.H | 12 +- src/mg_solver/HpMultiGrid.cpp | 700 ++++++++++++++++++---------------- 2 files changed, 368 insertions(+), 344 deletions(-) diff --git a/src/mg_solver/HpMultiGrid.H b/src/mg_solver/HpMultiGrid.H index f8c0ca9d39..a387c0cf8e 100644 --- a/src/mg_solver/HpMultiGrid.H +++ b/src/mg_solver/HpMultiGrid.H @@ -257,13 +257,15 @@ private: amrex::Vector m_rescor; /** Device pointer to Array4s used by the single-block kernel at the bottom */ - amrex::Array4 const* m_acf_a = nullptr; - amrex::Array4 const* m_res_a = nullptr; - amrex::Array4 const* m_cor_a = nullptr; - amrex::Array4 const* m_rescor_a = nullptr; + Array3 const* m_acf_a = nullptr; + Array3 const* m_res_a = nullptr; + Array3 const* m_cor_a = nullptr; + Array3 const* m_rescor_a = nullptr; /** Device vector of Array4s used by the single-block kernel at the bottom */ - amrex::Gpu::Buffer > m_array4; + amrex::Gpu::Buffer> m_array3; + /** Device vector of Boxes used by the single-block kernel at the bottom */ + amrex::Gpu::Buffer> m_lev_domain; #if defined(AMREX_USE_CUDA) /** CUDA graphs for average-down */ diff --git a/src/mg_solver/HpMultiGrid.cpp b/src/mg_solver/HpMultiGrid.cpp index a8bca6e1d1..aeea4c59ed 100644 --- a/src/mg_solver/HpMultiGrid.cpp +++ b/src/mg_solver/HpMultiGrid.cpp @@ -28,100 +28,100 @@ amrex::Box valid_domain_box (amrex::Box const& domain) template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void restrict_cc (int i, int j, int n, amrex::Array4 const& crse, amrex::Array4 const& fine) +void restrict_cc (int i, int j, int n, Array3 const& crse, Array3 const& fine) { - crse(i,j,0,n) = amrex::Real(0.25)*(fine(2*i ,2*j ,0,n) + - fine(2*i+1,2*j ,0,n) + - fine(2*i ,2*j+1,0,n) + - fine(2*i+1,2*j+1,0,n)); + crse(i,j,n) = amrex::Real(0.25)*(fine(2*i ,2*j ,n) + + fine(2*i+1,2*j ,n) + + fine(2*i ,2*j+1,n) + + fine(2*i+1,2*j+1,n)); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void restrict_nd (int i, int j, int n, amrex::Array4 const& crse, amrex::Array4 const& fine) +void restrict_nd (int i, int j, int n, Array3 const& crse, Array3 const& fine) { - crse(i,j,0,n) = amrex::Real(1./16.) * (fine(2*i-1,2*j-1,0,n) + - amrex::Real(2.)*fine(2*i ,2*j-1,0,n) + - fine(2*i+1,2*j-1,0,n) + - amrex::Real(2.)*fine(2*i-1,2*j ,0,n) + - amrex::Real(4.)*fine(2*i ,2*j ,0,n) + - amrex::Real(2.)*fine(2*i+1,2*j ,0,n) + - fine(2*i-1,2*j+1,0,n) + - amrex::Real(2.)*fine(2*i ,2*j+1,0,n) + - fine(2*i+1,2*j+1,0,n)); + crse(i,j,n) = amrex::Real(1./16.) * (fine(2*i-1,2*j-1,n) + + amrex::Real(2.)*fine(2*i ,2*j-1,n) + + fine(2*i+1,2*j-1,n) + + amrex::Real(2.)*fine(2*i-1,2*j ,n) + + amrex::Real(4.)*fine(2*i ,2*j ,n) + + amrex::Real(2.)*fine(2*i+1,2*j ,n) + + fine(2*i-1,2*j+1,n) + + amrex::Real(2.)*fine(2*i ,2*j+1,n) + + fine(2*i+1,2*j+1,n)); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void interpadd_cc (int i, int j, int n, amrex::Array4 const& fine, amrex::Array4 const& crse) +void interpadd_cc (int i, int j, int n, Array3 const& fine, Array3 const& crse) { int ic = amrex::coarsen(i,2); int jc = amrex::coarsen(j,2); - fine(i,j,0,n) += crse(ic,jc,0,n); + fine(i,j,n) += crse(ic,jc,n); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void interpadd_nd (int i, int j, int n, amrex::Array4 const& fine, amrex::Array4 const& crse) +void interpadd_nd (int i, int j, int n, Array3 const& fine, Array3 const& crse) { int ic = amrex::coarsen(i,2); int jc = amrex::coarsen(j,2); bool i_is_odd = (ic*2 != i); bool j_is_odd = (jc*2 != j); if (i_is_odd && j_is_odd) { - fine(i,j,0,n) += (crse(ic ,jc ,0,n) + - crse(ic+1,jc ,0,n) + - crse(ic ,jc+1,0,n) + - crse(ic+1,jc+1,0,n))*amrex::Real(0.25); + fine(i,j,n) += (crse(ic ,jc ,n) + + crse(ic+1,jc ,n) + + crse(ic ,jc+1,n) + + crse(ic+1,jc+1,n))*amrex::Real(0.25); } else if (i_is_odd) { - fine(i,j,0,n) += (crse(ic ,jc,0,n) + - crse(ic+1,jc,0,n))*amrex::Real(0.5); + fine(i,j,n) += (crse(ic ,jc,n) + + crse(ic+1,jc,n))*amrex::Real(0.5); } else if (j_is_odd) { - fine(i,j,0,n) += (crse(ic,jc ,0,n) + - crse(ic,jc+1,0,n))*amrex::Real(0.5); + fine(i,j,n) += (crse(ic,jc ,n) + + crse(ic,jc+1,n))*amrex::Real(0.5); } else { - fine(i,j,0,n) += crse(ic,jc,0,n); + fine(i,j,n) += crse(ic,jc,n); } } // out of place version used before shared memory gsrb template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void interpcpy_cc (int i, int j, int n, amrex::Array4 const& fine_in, - amrex::Array4 const& crse, amrex::Array4 const& fine_out) +void interpcpy_cc (int i, int j, int n, Array3 const& fine_in, + Array3 const& crse, Array3 const& fine_out) { int ic = amrex::coarsen(i,2); int jc = amrex::coarsen(j,2); - fine_out(i,j,0,n) = fine_in(i,j,0,n) + crse(ic,jc,0,n); + fine_out(i,j,n) = fine_in(i,j,n) + crse(ic,jc,n); } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE -void interpcpy_nd (int i, int j, int n, amrex::Array4 const& fine_in, - amrex::Array4 const& crse, amrex::Array4 const& fine_out) +void interpcpy_nd (int i, int j, int n, Array3 const& fine_in, + Array3 const& crse, Array3 const& fine_out) { int ic = amrex::coarsen(i,2); int jc = amrex::coarsen(j,2); bool i_is_odd = (ic*2 != i); bool j_is_odd = (jc*2 != j); if (i_is_odd && j_is_odd) { - fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic ,jc ,0,n) + - crse(ic+1,jc ,0,n) + - crse(ic ,jc+1,0,n) + - crse(ic+1,jc+1,0,n))*amrex::Real(0.25); + fine_out(i,j,n) = fine_in(i,j,n) + (crse(ic ,jc ,n) + + crse(ic+1,jc ,n) + + crse(ic ,jc+1,n) + + crse(ic+1,jc+1,n))*amrex::Real(0.25); } else if (i_is_odd) { - fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic ,jc,0,n) + - crse(ic+1,jc,0,n))*amrex::Real(0.5); + fine_out(i,j,n) = fine_in(i,j,n) + (crse(ic ,jc,n) + + crse(ic+1,jc,n))*amrex::Real(0.5); } else if (j_is_odd) { - fine_out(i,j,0,n) = fine_in(i,j,0,n) + (crse(ic,jc ,0,n) + - crse(ic,jc+1,0,n))*amrex::Real(0.5); + fine_out(i,j,n) = fine_in(i,j,n) + (crse(ic,jc ,n) + + crse(ic,jc+1,n))*amrex::Real(0.5); } else { - fine_out(i,j,0,n) = fine_in(i,j,0,n) + crse(ic,jc,0,n); + fine_out(i,j,n) = fine_in(i,j,n) + crse(ic,jc,n); } } -void restriction (amrex::Box const& box, amrex::Array4 const& crse, - amrex::Array4 const& fine, int num_comps) +void restriction (amrex::Box const& box, Array3 const& crse, + Array3 const& fine, int num_comps) { if (box.cellCentered()) { hpmg::ParallelFor(to2D(box), num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept @@ -138,9 +138,9 @@ void restriction (amrex::Box const& box, amrex::Array4 const& crse, } void interpolation_outofplace (amrex::Box const& box, - amrex::Array4 const& fine_in, - amrex::Array4 const& crse, - amrex::Array4 const& fine_out, int num_comps) + Array3 const& fine_in, + Array3 const& crse, + Array3 const& fine_out, int num_comps) { if (box.cellCentered()) { hpmg::ParallelFor(to2D(box), num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept @@ -162,58 +162,58 @@ void interpolation_outofplace (amrex::Box const& box, AMREX_GPU_DEVICE AMREX_FORCE_INLINE amrex::Real laplacian (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, amrex::Real facx, amrex::Real facy) + Array3 const& phi, amrex::Real facx, amrex::Real facy) { - amrex::Real lap = amrex::Real(-2.)*(facx+facy)*phi(i,j,0,n); + amrex::Real lap = amrex::Real(-2.)*(facx+facy)*phi(i,j,n); if (i == ilo) { - lap += facx * (amrex::Real(4./3.)*phi(i+1,j,0,n) - amrex::Real(2.)*phi(i,j,0,n)); + lap += facx * (amrex::Real(4./3.)*phi(i+1,j,n) - amrex::Real(2.)*phi(i,j,n)); } else if (i == ihi) { - lap += facx * (amrex::Real(4./3.)*phi(i-1,j,0,n) - amrex::Real(2.)*phi(i,j,0,n)); + lap += facx * (amrex::Real(4./3.)*phi(i-1,j,n) - amrex::Real(2.)*phi(i,j,n)); } else { - lap += facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n)); + lap += facx * (phi(i-1,j,n) + phi(i+1,j,n)); } if (j == jlo) { - lap += facy * (amrex::Real(4./3.)*phi(i,j+1,0,n) - amrex::Real(2.)*phi(i,j,0,n)); + lap += facy * (amrex::Real(4./3.)*phi(i,j+1,n) - amrex::Real(2.)*phi(i,j,n)); } else if (j == jhi) { - lap += facy * (amrex::Real(4./3.)*phi(i,j-1,0,n) - amrex::Real(2.)*phi(i,j,0,n)); + lap += facy * (amrex::Real(4./3.)*phi(i,j-1,n) - amrex::Real(2.)*phi(i,j,n)); } else { - lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n)); + lap += facy * (phi(i,j-1,n) + phi(i,j+1,n)); } return lap; } AMREX_GPU_DEVICE AMREX_FORCE_INLINE amrex::Real residual1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, amrex::Real rhs, amrex::Real acf, + Array3 const& phi, amrex::Real rhs, amrex::Real acf, amrex::Real facx, amrex::Real facy) { amrex::Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy); - return rhs + acf*phi(i,j,0,n) - lap; + return rhs + acf*phi(i,j,n) - lap; } AMREX_GPU_DEVICE AMREX_FORCE_INLINE amrex::Real residual2r (int i, int j, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, amrex::Real rhs, amrex::Real acf_r, + Array3 const& phi, amrex::Real rhs, amrex::Real acf_r, amrex::Real acf_i, amrex::Real facx, amrex::Real facy) { amrex::Real lap = laplacian(i,j,0,ilo,jlo,ihi,jhi,phi,facx,facy); - return rhs + acf_r*phi(i,j,0,0) - acf_i*phi(i,j,0,1) - lap; + return rhs + acf_r*phi(i,j,0) - acf_i*phi(i,j,1) - lap; } AMREX_GPU_DEVICE AMREX_FORCE_INLINE amrex::Real residual2i (int i, int j, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, amrex::Real rhs, amrex::Real acf_r, + Array3 const& phi, amrex::Real rhs, amrex::Real acf_r, amrex::Real acf_i, amrex::Real facx, amrex::Real facy) { amrex::Real lap = laplacian(i,j,1,ilo,jlo,ihi,jhi,phi,facx,facy); - return rhs + acf_i*phi(i,j,0,0) + acf_r*phi(i,j,0,1) - lap; + return rhs + acf_i*phi(i,j,0) + acf_r*phi(i,j,1) - lap; } AMREX_GPU_DEVICE AMREX_FORCE_INLINE amrex::Real residual3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, amrex::Real rhs, amrex::Real facx, + Array3 const& phi, amrex::Real rhs, amrex::Real facx, amrex::Real facy) { amrex::Real lap = laplacian(i,j,n,ilo,jlo,ihi,jhi,phi,facx,facy); @@ -222,10 +222,10 @@ amrex::Real residual3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, #if !defined(AMREX_USE_CUDA) && !defined(AMREX_USE_HIP) && defined(AMREX_USE_GPU) // res = rhs - L(phi) -void compute_residual (amrex::Box const& box, amrex::Array4 const& res, - amrex::Array4 const& phi, - amrex::Array4 const& rhs, - amrex::Array4 const& acf, amrex::Real dx, amrex::Real dy, +void compute_residual (amrex::Box const& box, Array3 const& res, + Array3 const& phi, + Array3 const& rhs, + Array3 const& acf, amrex::Real dx, amrex::Real dy, int system_type) { int const ilo = box.smallEnd(0); @@ -238,25 +238,25 @@ void compute_residual (amrex::Box const& box, amrex::Array4 const& hpmg::ParallelFor(to2D(valid_domain_box(box)), [=] AMREX_GPU_DEVICE (int i, int j) noexcept { - res(i,j,0,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), - acf(i,j,0), facx, facy); - res(i,j,0,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), - acf(i,j,0), facx, facy); + res(i,j,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), + acf(i,j,0), facx, facy); + res(i,j,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,1), + acf(i,j,0), facx, facy); }); } else if (system_type == 2) { hpmg::ParallelFor(to2D(valid_domain_box(box)), [=] AMREX_GPU_DEVICE (int i, int j) noexcept { - res(i,j,0,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), - acf(i,j,0,0), acf(i,j,0,1), facx, facy); - res(i,j,0,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), - acf(i,j,0,0), acf(i,j,0,1), facx, facy); + res(i,j,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), + acf(i,j,0), acf(i,j,1), facx, facy); + res(i,j,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,1), + acf(i,j,0), acf(i,j,1), facx, facy); }); } else { hpmg::ParallelFor(to2D(valid_domain_box(box)), [=] AMREX_GPU_DEVICE (int i, int j) noexcept { - res(i,j,0,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy); + res(i,j,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), facx, facy); }); } } @@ -271,31 +271,31 @@ void compute_residual (amrex::Box const& box, amrex::Array4 const& template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void gs1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, amrex::Real rhs, amrex::Real acf, amrex::Real facx, + Array3 const& phi, amrex::Real rhs, amrex::Real acf, amrex::Real facx, amrex::Real facy) { amrex::Real lap; amrex::Real c0 = -(acf+amrex::Real(2.)*(facx+facy)); if (is_cell_centered && i == ilo) { - lap = facx * amrex::Real(4./3.)*phi(i+1,j,0,n); + lap = facx * amrex::Real(4./3.)*phi(i+1,j,n); c0 -= amrex::Real(2.)*facx; } else if (is_cell_centered && i == ihi) { - lap = facx * amrex::Real(4./3.)*phi(i-1,j,0,n); + lap = facx * amrex::Real(4./3.)*phi(i-1,j,n); c0 -= amrex::Real(2.)*facx; } else { - lap = facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n)); + lap = facx * (phi(i-1,j,n) + phi(i+1,j,n)); } if (is_cell_centered && j == jlo) { - lap += facy * amrex::Real(4./3.)*phi(i,j+1,0,n); + lap += facy * amrex::Real(4./3.)*phi(i,j+1,n); c0 -= amrex::Real(2.)*facy; } else if (is_cell_centered && j == jhi) { - lap += facy * amrex::Real(4./3.)*phi(i,j-1,0,n); + lap += facy * amrex::Real(4./3.)*phi(i,j-1,n); c0 -= amrex::Real(2.)*facy; } else { - lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n)); + lap += facy * (phi(i,j-1,n) + phi(i,j+1,n)); } const amrex::Real c0_inv = amrex::Real(1.) / c0; - phi(i,j,0,n) = (rhs - lap) * c0_inv; + phi(i,j,n) = (rhs - lap) * c0_inv; } // is_cell_centered = true: supports both cell centered and node centered solves @@ -303,41 +303,41 @@ void gs1 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void gs2 (int i, int j, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, amrex::Real rhs_r, amrex::Real rhs_i, + Array3 const& phi, amrex::Real rhs_r, amrex::Real rhs_i, amrex::Real ar, amrex::Real ai, amrex::Real facx, amrex::Real facy) { amrex::Real lap[2]; amrex::Real c0 = amrex::Real(-2.)*(facx+facy); if (is_cell_centered && i == ilo) { - lap[0] = facx * amrex::Real(4./3.)*phi(i+1,j,0,0); - lap[1] = facx * amrex::Real(4./3.)*phi(i+1,j,0,1); + lap[0] = facx * amrex::Real(4./3.)*phi(i+1,j,0); + lap[1] = facx * amrex::Real(4./3.)*phi(i+1,j,1); c0 -= amrex::Real(2.)*facx; } else if (is_cell_centered && i == ihi) { - lap[0] = facx * amrex::Real(4./3.)*phi(i-1,j,0,0); - lap[1] = facx * amrex::Real(4./3.)*phi(i-1,j,0,1); + lap[0] = facx * amrex::Real(4./3.)*phi(i-1,j,0); + lap[1] = facx * amrex::Real(4./3.)*phi(i-1,j,1); c0 -= amrex::Real(2.)*facx; } else { - lap[0] = facx * (phi(i-1,j,0,0) + phi(i+1,j,0,0)); - lap[1] = facx * (phi(i-1,j,0,1) + phi(i+1,j,0,1)); + lap[0] = facx * (phi(i-1,j,0) + phi(i+1,j,0)); + lap[1] = facx * (phi(i-1,j,1) + phi(i+1,j,1)); } if (is_cell_centered && j == jlo) { - lap[0] += facy * amrex::Real(4./3.)*phi(i,j+1,0,0); - lap[1] += facy * amrex::Real(4./3.)*phi(i,j+1,0,1); + lap[0] += facy * amrex::Real(4./3.)*phi(i,j+1,0); + lap[1] += facy * amrex::Real(4./3.)*phi(i,j+1,1); c0 -= amrex::Real(2.)*facy; } else if (is_cell_centered && j == jhi) { - lap[0] += facy * amrex::Real(4./3.)*phi(i,j-1,0,0); - lap[1] += facy * amrex::Real(4./3.)*phi(i,j-1,0,1); + lap[0] += facy * amrex::Real(4./3.)*phi(i,j-1,0); + lap[1] += facy * amrex::Real(4./3.)*phi(i,j-1,1); c0 -= amrex::Real(2.)*facy; } else { - lap[0] += facy * (phi(i,j-1,0,0) + phi(i,j+1,0,0)); - lap[1] += facy * (phi(i,j-1,0,1) + phi(i,j+1,0,1)); + lap[0] += facy * (phi(i,j-1,0) + phi(i,j+1,0)); + lap[1] += facy * (phi(i,j-1,1) + phi(i,j+1,1)); } amrex::Real c[2] = {c0-ar, -ai}; amrex::Real cmag = amrex::Real(1.)/(c[0]*c[0] + c[1]*c[1]); c[0] *= cmag; c[1] *= cmag; - phi(i,j,0,0) = (rhs_r-lap[0])*c[0] + (rhs_i-lap[1])*c[1]; - phi(i,j,0,1) = (rhs_i-lap[1])*c[0] - (rhs_r-lap[0])*c[1]; + phi(i,j,0) = (rhs_r-lap[0])*c[0] + (rhs_i-lap[1])*c[1]; + phi(i,j,1) = (rhs_i-lap[1])*c[0] - (rhs_r-lap[0])*c[1]; } // is_cell_centered = true: supports both cell centered and node centered solves @@ -345,35 +345,35 @@ void gs2 (int i, int j, int ilo, int jlo, int ihi, int jhi, template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void gs3 (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, amrex::Real rhs, amrex::Real facx, + Array3 const& phi, amrex::Real rhs, amrex::Real facx, amrex::Real facy) { amrex::Real lap; amrex::Real c0 = -amrex::Real(2.)*(facx+facy); if (is_cell_centered && i == ilo) { - lap = facx * amrex::Real(4./3.)*phi(i+1,j,0,n); + lap = facx * amrex::Real(4./3.)*phi(i+1,j,n); c0 -= amrex::Real(2.)*facx; } else if (is_cell_centered && i == ihi) { - lap = facx * amrex::Real(4./3.)*phi(i-1,j,0,n); + lap = facx * amrex::Real(4./3.)*phi(i-1,j,n); c0 -= amrex::Real(2.)*facx; } else { - lap = facx * (phi(i-1,j,0,n) + phi(i+1,j,0,n)); + lap = facx * (phi(i-1,j,n) + phi(i+1,j,n)); } if (is_cell_centered && j == jlo) { - lap += facy * amrex::Real(4./3.)*phi(i,j+1,0,n); + lap += facy * amrex::Real(4./3.)*phi(i,j+1,n); c0 -= amrex::Real(2.)*facy; } else if (is_cell_centered && j == jhi) { - lap += facy * amrex::Real(4./3.)*phi(i,j-1,0,n); + lap += facy * amrex::Real(4./3.)*phi(i,j-1,n); c0 -= amrex::Real(2.)*facy; } else { - lap += facy * (phi(i,j-1,0,n) + phi(i,j+1,0,n)); + lap += facy * (phi(i,j-1,n) + phi(i,j+1,n)); } const amrex::Real c0_inv = amrex::Real(1.) / c0; - phi(i,j,0,n) = (rhs - lap) * c0_inv; + phi(i,j,n) = (rhs - lap) * c0_inv; } -void gsrb (int icolor, amrex::Box const& box, amrex::Array4 const& phi, - amrex::Array4 const& rhs, amrex::Array4 const& acf, +void gsrb (int icolor, amrex::Box const& box, Array3 const& phi, + Array3 const& rhs, Array3 const& acf, amrex::Real dx, amrex::Real dy, int system_type) { int const ilo = box.smallEnd(0); @@ -387,8 +387,8 @@ void gsrb (int icolor, amrex::Box const& box, amrex::Array4 const& [=] AMREX_GPU_DEVICE (int i, int j) noexcept { if ((i+j+icolor)%2 == 0) { - gs1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), acf(i,j,0), facx, facy); - gs1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), acf(i,j,0), facx, facy); + gs1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), acf(i,j,0), facx, facy); + gs1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,1), acf(i,j,0), facx, facy); } }); } else if (system_type == 2) { @@ -396,8 +396,8 @@ void gsrb (int icolor, amrex::Box const& box, amrex::Array4 const& [=] AMREX_GPU_DEVICE (int i, int j) noexcept { if ((i+j+icolor)%2 == 0) { - gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), rhs(i,j,0,1), - acf(i,j,0,0), acf(i,j,0,1), facx, facy); + gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), rhs(i,j,1), + acf(i,j,0), acf(i,j,1), facx, facy); } }); } else { @@ -405,7 +405,7 @@ void gsrb (int icolor, amrex::Box const& box, amrex::Array4 const& [=] AMREX_GPU_DEVICE (int i, int j) noexcept { if ((i+j+icolor)%2 == 0) { - gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy); + gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), facx, facy); } }); } @@ -419,10 +419,10 @@ void gsrb (int icolor, amrex::Box const& box, amrex::Array4 const& // do multiple gsrb iterations in GPU shared memory with many ghost cells template -void gsrb_shared (amrex::Box const& box, amrex::Array4 const& phi_out, - amrex::Array4 const& rhs, - amrex::Array4 const& acf, amrex::Array4 const& res, - amrex::Array4 const& phi_in, amrex::Real dx, amrex::Real dy) +void gsrb_shared (amrex::Box const& box, Array3 const& phi_out, + Array3 const& rhs, + Array3 const& acf, Array3 const& res, + Array3 const& phi_in, amrex::Real dx, amrex::Real dy) { constexpr int num_comps = MultiGrid::get_num_comps(system_type); constexpr int num_comps_acf = MultiGrid::get_num_comps_acf(system_type); @@ -473,9 +473,9 @@ void gsrb_shared (amrex::Box const& box, amrex::Array4 const& phi_o const int tile_end_x = tile_begin_x + tilesize_array_x; const int tile_end_y = tile_begin_y + tilesize_array_y; - // make amrex::Array4 reference shared memory tile - amrex::Array4 phi_shared(phi_ptr, {tile_begin_x, tile_begin_y, 0}, - {tile_end_x, tile_end_y, 1}, num_comps); + // make Array3 reference shared memory tile + Array3 phi_shared({phi_ptr, {tile_begin_x, tile_begin_y, 0}, + {tile_end_x, tile_end_y, 1}, num_comps}); if (zero_init) { // initialize shared memory to zero @@ -492,11 +492,11 @@ void gsrb_shared (amrex::Box const& box, amrex::Array4 const& phi_o if (ilo_loop <= sx && sx <= ihi_loop && jlo_loop <= sy && sy <= jhi_loop) { for (int n=0; n const& phi_o jlo_loop <= j+nj && j+nj <= jhi_loop) { // load rhs and acf into registers to avoid memory accesses in the gs iterations for (int n=0; n const& phi_o // fuse with compute_residual kernel and reuse phi_shared // at the cost of one extra ghost cell in each direction if (system_type == 1) { - res(i, j+nj, 0, 0) = residual1( - i, j+nj, 0, ilo, jlo, ihi, jhi, phi_shared, - rhs_num[nj][0], acf_num[nj][0], facx, facy); - res(i, j+nj, 0, 1) = residual1( - i, j+nj, 1, ilo, jlo, ihi, jhi, phi_shared, - rhs_num[nj][1], acf_num[nj][0], facx, facy); + res(i, j+nj, 0) = residual1( + i, j+nj, 0, ilo, jlo, ihi, jhi, phi_shared, + rhs_num[nj][0], acf_num[nj][0], facx, facy); + res(i, j+nj, 1) = residual1( + i, j+nj, 1, ilo, jlo, ihi, jhi, phi_shared, + rhs_num[nj][1], acf_num[nj][0], facx, facy); } else if (system_type == 2) { - res(i, j+nj, 0, 0) = residual2r( - i, j+nj, ilo, jlo, ihi, jhi, phi_shared, - rhs_num[nj][0], acf_num[nj][0], acf_num[nj][1], - facx, facy); - res(i, j+nj, 0, 1) = residual2i( - i, j+nj, ilo, jlo, ihi, jhi, phi_shared, - rhs_num[nj][1], acf_num[nj][0], acf_num[nj][1], - facx, facy); + res(i, j+nj, 0) = residual2r( + i, j+nj, ilo, jlo, ihi, jhi, phi_shared, + rhs_num[nj][0], acf_num[nj][0], acf_num[nj][1], + facx, facy); + res(i, j+nj, 1) = residual2i( + i, j+nj, ilo, jlo, ihi, jhi, phi_shared, + rhs_num[nj][1], acf_num[nj][0], acf_num[nj][1], + facx, facy); } else if (system_type == 3) { - res(i, j+nj, 0, 0) = residual3( - i, j+nj, 0, ilo, jlo, ihi, jhi, phi_shared, - rhs_num[nj][0], facx, facy); + res(i, j+nj, 0) = residual3( + i, j+nj, 0, ilo, jlo, ihi, jhi, phi_shared, + rhs_num[nj][0], facx, facy); } } for (int n=0; n const& phi_o // do multiple gsrb iterations in CPU cached memory with many ghost cells template -void gsrb_cached (amrex::Box const& box, amrex::Array4 const& phi_out, - amrex::Array4 const& rhs, - amrex::Array4 const& acf, - amrex::Array4 const& res, - amrex::Array4 const& phi_in, amrex::Real dx, amrex::Real dy) +void gsrb_cached (amrex::Box const& box, Array3 const& phi_out, + Array3 const& rhs, + Array3 const& acf, + Array3 const& res, + Array3 const& phi_in, amrex::Real dx, amrex::Real dy) { constexpr int num_comps = MultiGrid::get_num_comps(system_type); constexpr int tilesize_x = 64; @@ -652,9 +652,9 @@ void gsrb_cached (amrex::Box const& box, amrex::Array4 const& phi_o const int tile_end_x = tile_begin_x + tilesize_array_x; const int tile_end_y = tile_begin_y + tilesize_array_y; - // make amrex::Array4 reference cached memory tile - amrex::Array4 phi_cached(phi_ptr, {tile_begin_x, tile_begin_y, 0}, - {tile_end_x, tile_end_y, 1}, num_comps); + // make Array3 reference cached memory tile + Array3 phi_cached({phi_ptr, {tile_begin_x, tile_begin_y, 0}, + {tile_end_x, tile_end_y, 1}, num_comps}); if (zero_init) { // initialize cached memory to zero @@ -668,11 +668,11 @@ void gsrb_cached (amrex::Box const& box, amrex::Array4 const& phi_o if (ilo_loop <= i && i <= ihi_loop && jlo_loop <= j && j <= jhi_loop) { for (int n=0; n const& phi_o for (int i = i_start + shift; i < i_end; i+=2) { if (system_type == 1) { gs1(i, j, 0, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 0), acf(i, j, 0, 0), facx, facy); + rhs(i, j, 0), acf(i, j, 0), facx, facy); gs1(i, j, 1, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 1), acf(i, j, 0, 0), facx, facy); + rhs(i, j, 1), acf(i, j, 0), facx, facy); } else if (system_type == 2) { gs2(i, j, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 0), rhs(i, j, 0, 1), - acf(i, j, 0, 0), acf(i, j, 0, 1), facx, facy); + rhs(i, j, 0), rhs(i, j, 1), + acf(i, j, 0), acf(i, j, 1), facx, facy); } else if (system_type == 3) { gs3(i, j, 0, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 0), facx, facy); + rhs(i, j, 0), facx, facy); } } } @@ -719,30 +719,30 @@ void gsrb_cached (amrex::Box const& box, amrex::Array4 const& phi_o // fuse with compute_residual loop and reuse phi_cached // at the cost of one extra ghost cell in each direction if (system_type == 1) { - res(i, j, 0, 0) = residual1( + res(i, j, 0) = residual1( i, j, 0, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 0), acf(i, j, 0, 0), facx, facy); - res(i, j, 0, 1) = residual1( + rhs(i, j, 0), acf(i, j, 0), facx, facy); + res(i, j, 1) = residual1( i, j, 1, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 1), acf(i, j, 0, 0), facx, facy); + rhs(i, j, 1), acf(i, j, 0), facx, facy); } else if (system_type == 2) { - res(i, j, 0, 0) = residual2r( + res(i, j, 0) = residual2r( i, j, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 0), acf(i, j, 0, 0), - acf(i, j, 0, 1), facx, facy); - res(i, j, 0, 1) = residual2i( + rhs(i, j, 0), acf(i, j, 0), + acf(i, j, 1), facx, facy); + res(i, j, 1) = residual2i( i, j, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 1), acf(i, j, 0, 0), - acf(i, j, 0, 1), facx, facy); + rhs(i, j, 1), acf(i, j, 0), + acf(i, j, 1), facx, facy); } else if (system_type == 3) { - res(i, j, 0, 0) = residual3( + res(i, j, 0) = residual3( i, j, 0, ilo, jlo, ihi, jhi, phi_cached, - rhs(i, j, 0, 0), facx, facy); + rhs(i, j, 0), facx, facy); } } for (int n=0; n const& phi_o template void gsrb_4_residual (int system_type, amrex::Box const& box, - amrex::Array4 const& phi_out, - amrex::Array4 const& rhs, - amrex::Array4 const& acf, - amrex::Array4 const& res, - amrex::Array4 const& phi_in, + Array3 const& phi_out, + Array3 const& rhs, + Array3 const& acf, + Array3 const& res, + Array3 const& phi_in, amrex::Real dx, amrex::Real dy) { // This function performs the main computation for the multigrid solver. @@ -833,9 +833,9 @@ void gsrb_4_residual (int system_type, amrex::Box const& box, hpmg::ParallelFor(to2D(box), MultiGrid::get_num_comps(system_type), [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept { if (valid_domain.contains(i,j,0)) { - phi_out(i,j,0,n) = phi_in(i,j,0,n); + phi_out(i,j,n) = phi_in(i,j,n); } else { - phi_out(i,j,0,n) = amrex::Real(0.); + phi_out(i,j,n) = amrex::Real(0.); } }); } @@ -863,9 +863,10 @@ void gsrb_4_residual (int system_type, amrex::Box const& box, #endif template -void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4 const* acf, - amrex::Array4 const* res, amrex::Array4 const* cor, - amrex::Array4 const* rescor, int nlevs, int corner_offset, +void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, Array3 const* acf, + Array3 const* res, Array3 const* cor, + Array3 const* rescor, amrex::BoxND<2> const* lev_domain, + int nlevs, int corner_offset, FGS&& fgs, FRES&& fres) { // This function performs all the operations of a vcycle in a single GPU kernel by @@ -876,47 +877,50 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4 const& item) noexcept #else - amrex::launch_global<1024><<<1, 1024, 0, amrex::Gpu::gpuStream()>>>( + amrex::launch_global<1024><<>>( [=] AMREX_GPU_DEVICE () noexcept #endif { amrex::Real facx = amrex::Real(1.)/(dx0*dx0); amrex::Real facy = amrex::Real(1.)/(dy0*dy0); - int lenx = cor[0].end.x - cor[0].begin.x - 2*corner_offset; - int leny = cor[0].end.y - cor[0].begin.y - 2*corner_offset; - int ncells = lenx*leny; + const int lenx = lev_domain[0].length(0) - 2*corner_offset; #if defined(AMREX_USE_DPCPP) const int icell = item.get_local_linear_id(); + const int n = system_type == 1 ? item.get_group_linear_id() : 0; #else const int icell = threadIdx.x; + const int n = system_type == 1 ? blockIdx.x : 0; #endif int j = icell / lenx; int i = icell - j*lenx; - j += cor[0].begin.y + corner_offset; - i += cor[0].begin.x + corner_offset; + j += lev_domain[0].smallEnd(1) + corner_offset; + i += lev_domain[0].smallEnd(0) + corner_offset; + bool is_active = true; for (int ilev = 0; ilev < nlevs-1; ++ilev) { // set phi to zero - if (icell < ncells) { - if (system_type == 1 || system_type == 2) { - cor[ilev](i,j,0,0) = amrex::Real(0.); - cor[ilev](i,j,0,1) = amrex::Real(0.); + if (is_active) { + if (system_type == 1) { + cor[ilev](i,j,n) = amrex::Real(0.); + } else if (system_type == 2) { + cor[ilev](i,j,0) = amrex::Real(0.); + cor[ilev](i,j,1) = amrex::Real(0.); } else { - cor[ilev](i,j,0,0) = amrex::Real(0.); + cor[ilev](i,j,0) = amrex::Real(0.); } } HPMG_SYNCTHREADS; // do 4 Gauss-Seidel red-black iterations for (int is = 0; is < 4; ++is) { - if (icell < ncells) { + if (is_active) { if ((i+j+is)%2 == 0) { - fgs(i, j, - cor[ilev].begin.x, cor[ilev].begin.y, - cor[ilev].end.x-1, cor[ilev].end.y-1, + fgs(i, j, n, + lev_domain[ilev].smallEnd(0), lev_domain[ilev].smallEnd(1), + lev_domain[ilev].bigEnd(0), lev_domain[ilev].bigEnd(1), cor[ilev], res[ilev], acf[ilev], facx, facy); } @@ -925,11 +929,11 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4= (lev_domain[ilev+1].smallEnd(0) + corner_offset) && + i <= (lev_domain[ilev+1].bigEnd(0) - corner_offset) && + j >= (lev_domain[ilev+1].smallEnd(1) + corner_offset) && + j <= (lev_domain[ilev+1].bigEnd(1) - corner_offset); + if (is_active) { if (corner_offset == 0) { - if (system_type == 1 || system_type == 2) { + if (system_type == 1) { + restrict_cc(i,j,n,res[ilev+1],rescor[ilev]); + } else if (system_type == 2) { restrict_cc(i,j,0,res[ilev+1],rescor[ilev]); restrict_cc(i,j,1,res[ilev+1],rescor[ilev]); } else { restrict_cc(i,j,0,res[ilev+1],rescor[ilev]); } } else { - if (system_type == 1 || system_type == 2) { - restrict_nd(i,j,0,res[ilev+1],rescor[ilev]); - restrict_nd(i,j,1,res[ilev+1],rescor[ilev]); + if (system_type == 1) { + restrict_nd(i,j,n,res[ilev+1],rescor[ilev]); + } else if (system_type == 2) { + restrict_cc(i,j,0,res[ilev+1],rescor[ilev]); + restrict_cc(i,j,1,res[ilev+1],rescor[ilev]); } else { restrict_nd(i,j,0,res[ilev+1],rescor[ilev]); } @@ -971,23 +977,25 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4=0; --ilev) { - lenx = cor[ilev].end.x - cor[ilev].begin.x - 2*corner_offset; - leny = cor[ilev].end.y - cor[ilev].begin.y - 2*corner_offset; - ncells = lenx*leny; + + is_active = + i >= (lev_domain[ilev].smallEnd(0) + corner_offset) && + i <= (lev_domain[ilev].bigEnd(0) - corner_offset) && + j >= (lev_domain[ilev].smallEnd(1) + corner_offset) && + j <= (lev_domain[ilev].bigEnd(1) - corner_offset); + facx *= amrex::Real(4.); facy *= amrex::Real(4.); - if (icell < ncells) { - j = icell / lenx; - i = icell - j*lenx; - j += cor[ilev].begin.y + corner_offset; - i += cor[ilev].begin.x + corner_offset; + if (is_active) { if (corner_offset == 0) { - if (system_type == 1 || system_type == 2) { + if (system_type == 1) { + interpadd_cc(i, j, n, cor[ilev], cor[ilev+1]); + } else if (system_type == 2) { interpadd_cc(i, j, 0, cor[ilev], cor[ilev+1]); interpadd_cc(i, j, 1, cor[ilev], cor[ilev+1]); } else { interpadd_cc(i, j, 0, cor[ilev], cor[ilev+1]); } } else { - if (system_type == 1 || system_type == 2) { + if (system_type == 1) { + interpadd_nd(i, j, n, cor[ilev], cor[ilev+1]); + } else if (system_type == 2) { interpadd_nd(i, j, 0, cor[ilev], cor[ilev+1]); interpadd_nd(i, j, 1, cor[ilev], cor[ilev+1]); } else { @@ -1029,11 +1041,11 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, amrex::Array4 0) { - m_array4.reserve(nfabvs*m_num_single_block_levels); + m_array3.reserve(nfabvs*m_num_single_block_levels); } // Allocate the proportional coefficient of the Helmholtz equation. @@ -1119,7 +1131,7 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a for (int ilev = 0; ilev < m_num_mg_levels; ++ilev) { m_acf.emplace_back(m_domain[ilev], m_num_comps_acf); if (ilev >= m_single_block_level_begin) { - m_array4.push_back(m_acf[ilev].array()); + m_array3.push_back(m_acf[ilev].array()); } } @@ -1135,7 +1147,7 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a m_res[ilev].template setVal(0); } if (ilev >= m_single_block_level_begin) { - m_array4.push_back(m_res[ilev].array()); + m_array3.push_back(m_res[ilev].array()); } } } @@ -1148,7 +1160,7 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a m_cor[ilev].template setVal(0); } if (ilev >= m_single_block_level_begin) { - m_array4.push_back(m_cor[ilev].array()); + m_array3.push_back(m_cor[ilev].array()); } } @@ -1160,17 +1172,24 @@ MultiGrid::MultiGrid (amrex::Real dx, amrex::Real dy, amrex::Box a_domain, int a m_rescor[ilev].template setVal(0); } if (ilev >= m_single_block_level_begin) { - m_array4.push_back(m_rescor[ilev].array()); + m_array3.push_back(m_rescor[ilev].array()); } } - // Copy amrex::Array4s to GPU for bottomsolve_gpu() - if (!m_array4.empty()) { - m_array4.copyToDeviceAsync(); - m_acf_a = m_array4.data(); + // Copy domain to GPU + m_lev_domain.reserve(m_num_single_block_levels); + for (int ilev = m_single_block_level_begin; ilev < m_num_mg_levels; ++ilev) { + m_lev_domain.push_back(to2D(m_domain[ilev])); + } + + // Copy Array3s to GPU for bottomsolve_gpu() + if (!m_array3.empty()) { + m_array3.copyToDeviceAsync(); + m_acf_a = m_array3.data(); m_res_a = m_acf_a + m_num_single_block_levels; m_cor_a = m_res_a + m_num_single_block_levels; m_rescor_a = m_cor_a + m_num_single_block_levels; + m_lev_domain.copyToDeviceAsync(); } } @@ -1188,8 +1207,8 @@ MultiGrid::solve1 (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs, amrex::FArrayBox afab(center_box(a_acf.box(), m_domain.front()), 1, a_acf.dataPtr()); - auto const& array_m_acf = m_acf[0].array(); - auto const& array_a_acf = afab.const_array(); + const Array3& array_m_acf = m_acf[0].array(); + const Array3& array_a_acf = afab.const_array(); hpmg::ParallelFor(to2D(m_acf[0].box()), [=] AMREX_GPU_DEVICE (int i, int j) noexcept { @@ -1210,13 +1229,13 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs, HIPACE_PROFILE("hpmg::MultiGrid::solve2()"); AMREX_ALWAYS_ASSERT(m_system_type == 2); - auto const& array_m_acf = m_acf[0].array(); + const Array3& array_m_acf = m_acf[0].array(); hpmg::ParallelFor(to2D(m_acf[0].box()), [=] AMREX_GPU_DEVICE (int i, int j) noexcept { - array_m_acf(i,j,0,0) = acoef_real; - array_m_acf(i,j,0,1) = acoef_imag; + array_m_acf(i,j,0) = acoef_real; + array_m_acf(i,j,1) = acoef_imag; }); average_down_acoef(); @@ -1233,15 +1252,15 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs, HIPACE_PROFILE("hpmg::MultiGrid::solve2()"); AMREX_ALWAYS_ASSERT(m_system_type == 2); - auto const& array_m_acf = m_acf[0].array(); + const Array3& array_m_acf = m_acf[0].array(); amrex::FArrayBox ifab(center_box(acoef_imag.box(), m_domain.front()), 1, acoef_imag.dataPtr()); - auto const& ai = ifab.const_array(); + const Array3& ai = ifab.const_array(); hpmg::ParallelFor(to2D(m_acf[0].box()), [=] AMREX_GPU_DEVICE (int i, int j) noexcept { - array_m_acf(i,j,0,0) = acoef_real; - array_m_acf(i,j,0,1) = ai(i,j,0); + array_m_acf(i,j,0) = acoef_real; + array_m_acf(i,j,1) = ai(i,j,0); }); average_down_acoef(); @@ -1258,15 +1277,15 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs, HIPACE_PROFILE("hpmg::MultiGrid::solve2()"); AMREX_ALWAYS_ASSERT(m_system_type == 2); - auto const& array_m_acf = m_acf[0].array(); + const Array3& array_m_acf = m_acf[0].array(); amrex::FArrayBox rfab(center_box(acoef_real.box(), m_domain.front()), 1, acoef_real.dataPtr()); - auto const& ar = rfab.const_array(); + const Array3& ar = rfab.const_array(); hpmg::ParallelFor(to2D(m_acf[0].box()), [=] AMREX_GPU_DEVICE (int i, int j) noexcept { - array_m_acf(i,j,0,0) = ar(i,j,0); - array_m_acf(i,j,0,1) = acoef_imag; + array_m_acf(i,j,0) = ar(i,j,0); + array_m_acf(i,j,1) = acoef_imag; }); average_down_acoef(); @@ -1283,17 +1302,17 @@ MultiGrid::solve2 (amrex::FArrayBox& sol, amrex::FArrayBox const& rhs, HIPACE_PROFILE("hpmg::MultiGrid::solve2()"); AMREX_ALWAYS_ASSERT(m_system_type == 2); - auto const& array_m_acf = m_acf[0].array(); + const Array3& array_m_acf = m_acf[0].array(); amrex::FArrayBox rfab(center_box(acoef_real.box(), m_domain.front()), 1, acoef_real.dataPtr()); amrex::FArrayBox ifab(center_box(acoef_imag.box(), m_domain.front()), 1, acoef_imag.dataPtr()); - auto const& ar = rfab.const_array(); - auto const& ai = ifab.const_array(); + const Array3& ar = rfab.const_array(); + const Array3& ai = ifab.const_array(); hpmg::ParallelFor(to2D(m_acf[0].box()), [=] AMREX_GPU_DEVICE (int i, int j) noexcept { - array_m_acf(i,j,0,0) = ar(i,j,0); - array_m_acf(i,j,0,1) = ai(i,j,0); + array_m_acf(i,j,0) = ar(i,j,0); + array_m_acf(i,j,1) = ai(i,j,0); }); average_down_acoef(); @@ -1338,7 +1357,7 @@ MultiGrid::solve_doit (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs, // cor = gsrb(gsrb(gsrb(gsrb(sol)))) // rescor = rhs - L(cor) gsrb_4_residual(m_system_type, m_domain[0], m_cor[0].array(), m_rhs.const_array(), - m_acf[0].const_array(), m_rescor[0].array(), m_sol.array(), m_dx, m_dy); + m_acf[0].const_array(), m_rescor[0].array(), m_sol.const_array(), m_dx, m_dy); // Reduction on the right-hand side and residual to get convergence criteria amrex::Real resnorm0, rhsnorm0; @@ -1346,12 +1365,12 @@ MultiGrid::solve_doit (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs, amrex::ReduceOps reduce_op; amrex::ReduceData reduce_data(reduce_op); using ReduceTuple = typename decltype(reduce_data)::Type; - const auto& array_res = m_rescor[0].const_array(); - const auto& array_rhs = m_rhs.const_array(); + const Array3& array_res = m_rescor[0].const_array(); + const Array3& array_rhs = m_rhs.const_array(); reduce_op.eval(valid_domain_box(m_domain[0]), m_num_comps, reduce_data, [=] AMREX_GPU_DEVICE (int i, int j, int, int n) noexcept -> ReduceTuple { - return {std::abs(array_res(i,j,0,n)), std::abs(array_rhs(i,j,0,n))}; + return {std::abs(array_res(i,j,n)), std::abs(array_rhs(i,j,n))}; }); auto hv = reduce_data.value(reduce_op); @@ -1427,12 +1446,12 @@ MultiGrid::solve_doit (amrex::FArrayBox& a_sol, amrex::FArrayBox const& a_rhs, } // copy the cor array into the sol array used for the final output - auto const& sol = m_sol.array(); - auto const& cor = m_cor[0].const_array(); + const Array3& sol = m_sol.array(); + const Array3& cor = m_cor[0].const_array(); hpmg::ParallelFor(to2D(valid_domain_box(m_domain[0])), m_num_comps, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept { - sol(i,j,0,n) = cor(i,j,0,n); + sol(i,j,n) = cor(i,j,n); }); } @@ -1466,7 +1485,8 @@ MultiGrid::vcycle () // rescor = res - L(cor) gsrb_4_residual( m_system_type, m_domain[ilev], m_cor[ilev].array(), m_res[ilev].const_array(), - m_acf[ilev].const_array(), m_rescor[ilev].array(), {}, dx, dy); + m_acf[ilev].const_array(), m_rescor[ilev].array(), + amrex::Array4{}, dx, dy); } // interpolate residual to next level @@ -1495,12 +1515,14 @@ MultiGrid::vcycle () // sol = gsrb(gsrb(gsrb(gsrb(rescor)))) gsrb_4_residual( m_system_type, m_domain[ilev], m_sol.array(), m_rhs.const_array(), - m_acf[ilev].const_array(), {}, m_rescor[ilev].array(), dx, dy); + m_acf[ilev].const_array(), amrex::Array4{}, + m_rescor[ilev].const_array(), dx, dy); } else { // cor = gsrb(gsrb(gsrb(gsrb(rescor)))) gsrb_4_residual( m_system_type, m_domain[ilev], m_cor[ilev].array(), m_res[ilev].const_array(), - m_acf[ilev].const_array(), {}, m_rescor[ilev].array(), dx, dy); + m_acf[ilev].const_array(), amrex::Array4{}, + m_rescor[ilev].const_array(), dx, dy); } } @@ -1511,7 +1533,7 @@ MultiGrid::vcycle () // rescor = rhs - L(cor) gsrb_4_residual( m_system_type, m_domain[0], m_cor[0].array(), m_rhs.const_array(), - m_acf[0].const_array(), m_rescor[0].array(), m_sol.array(), m_dx, m_dy); + m_acf[0].const_array(), m_rescor[0].array(), m_sol.const_array(), m_dx, m_dy); #if defined(AMREX_USE_CUDA) cudaStreamEndCapture(amrex::Gpu::gpuStream(), &m_cuda_graph_vcycle[key].first); @@ -1535,79 +1557,76 @@ MultiGrid::bottomsolve () int const corner_offset = m_domain[0].cellCentered() ? 0 : 1; if (m_system_type == 1) { - bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, - corner_offset, - [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, - amrex::Array4 const& rhs, - amrex::Array4 const& acf, + bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, + m_lev_domain.data(), nlevs, corner_offset, + [] AMREX_GPU_DEVICE (int i, int j, int n, int ilo, int jlo, int ihi, int jhi, + Array3 const& phi, + Array3 const& rhs, + Array3 const& acf, amrex::Real facx, amrex::Real facy) { amrex::Real a = acf(i,j,0); - gs1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a, facx, facy); - gs1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a, facx, facy); + gs1(i, j, n, ilo, jlo, ihi, jhi, phi, rhs(i,j,n), a, facx, facy); }, - [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4 const& res, + [] AMREX_GPU_DEVICE (int i, int j, int n, Array3 const& res, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, - amrex::Array4 const& rhs, - amrex::Array4 const& acf, + Array3 const& phi, + Array3 const& rhs, + Array3 const& acf, amrex::Real facx, amrex::Real facy) { amrex::Real a = acf(i,j,0); - res(i,j,0,0) = residual1(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), a, - facx, facy); - res(i,j,0,1) = residual1(i, j, 1, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), a, - facx, facy); + res(i,j,n) = residual1(i, j, n, ilo, jlo, ihi, jhi, phi, rhs(i,j,n), a, + facx, facy); }); } else if (m_system_type == 2) { - bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, - corner_offset, - [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, - amrex::Array4 const& rhs, - amrex::Array4 const& acf, + bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, + m_lev_domain.data(), nlevs, corner_offset, + [] AMREX_GPU_DEVICE (int i, int j, int, int ilo, int jlo, int ihi, int jhi, + Array3 const& phi, + Array3 const& rhs, + Array3 const& acf, amrex::Real facx, amrex::Real facy) { - amrex::Real ar = acf(i,j,0,0); - amrex::Real ai = acf(i,j,0,1); - gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), rhs(i,j,0,1), ar, ai, + amrex::Real ar = acf(i,j,0); + amrex::Real ai = acf(i,j,1); + gs2(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), rhs(i,j,1), ar, ai, facx, facy); }, - [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4 const& res, + [] AMREX_GPU_DEVICE (int i, int j, int, Array3 const& res, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, - amrex::Array4 const& rhs, - amrex::Array4 const& acf, + Array3 const& phi, + Array3 const& rhs, + Array3 const& acf, amrex::Real facx, amrex::Real facy) { - amrex::Real ar = acf(i,j,0,0); - amrex::Real ai = acf(i,j,0,1); - res(i,j,0,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), ar, ai, - facx, facy); - res(i,j,0,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,1), ar, ai, - facx, facy); + amrex::Real ar = acf(i,j,0); + amrex::Real ai = acf(i,j,1); + res(i,j,0) = residual2r(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), ar, ai, + facx, facy); + res(i,j,1) = residual2i(i, j, ilo, jlo, ihi, jhi, phi, rhs(i,j,1), ar, ai, + facx, facy); }); } else { - bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, nlevs, - corner_offset, - [] AMREX_GPU_DEVICE (int i, int j, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, - amrex::Array4 const& rhs, - amrex::Array4 const&, amrex::Real facx, + bottomsolve_gpu(dx0, dy0, m_acf_a, m_res_a, m_cor_a, m_rescor_a, + m_lev_domain.data(), nlevs, corner_offset, + [] AMREX_GPU_DEVICE (int i, int j, int, int ilo, int jlo, int ihi, int jhi, + Array3 const& phi, + Array3 const& rhs, + Array3 const&, amrex::Real facx, amrex::Real facy) { - gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), facx, facy); + gs3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), facx, facy); }, - [] AMREX_GPU_DEVICE (int i, int j, amrex::Array4 const& res, + [] AMREX_GPU_DEVICE (int i, int j, int, Array3 const& res, int ilo, int jlo, int ihi, int jhi, - amrex::Array4 const& phi, - amrex::Array4 const& rhs, - amrex::Array4 const&, amrex::Real facx, + Array3 const& phi, + Array3 const& rhs, + Array3 const&, amrex::Real facx, amrex::Real facy) { - res(i,j,0,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0,0), - facx, facy); + res(i,j,0) = residual3(i, j, 0, ilo, jlo, ihi, jhi, phi, rhs(i,j,0), + facx, facy); }); } } else @@ -1632,7 +1651,8 @@ MultiGrid::bottomsolve () #if defined(AMREX_USE_GPU) namespace { template - void avgdown_acf (amrex::Array4 const* acf, int ncomp, int nlevels, F&& f) + void avgdown_acf (Array3 const* acf, amrex::BoxND<2> const* lev_domain, + int ncomp, int nlevels, F&& f) { #if defined(AMREX_USE_DPCPP) amrex::launch(1, 1024, amrex::Gpu::gpuStream(), @@ -1643,8 +1663,8 @@ namespace { #endif { for (int ilev = 1; ilev < nlevels; ++ilev) { - const int lenx = acf[ilev].end.x - acf[ilev].begin.x; - const int leny = acf[ilev].end.y - acf[ilev].begin.y; + const int lenx = lev_domain[ilev].length(0); + const int leny = lev_domain[ilev].length(1); const int ncells = lenx*leny; #if defined(AMREX_USE_DPCPP) for (int icell = item.get_local_range(0)*item.get_group_linear_id() @@ -1656,10 +1676,10 @@ namespace { icell < ncells; icell += stride) { int j = icell / lenx; int i = icell - j*lenx; - j += acf[ilev].begin.y; - i += acf[ilev].begin.x; + j += lev_domain[ilev].smallEnd(1); + i += lev_domain[ilev].smallEnd(0); for (int n = 0; n < ncomp; ++n) { - f(i,j,n,acf[ilev],acf[ilev-1]); + f(i,j,n,acf[ilev],acf[ilev-1],lev_domain[ilev]); } } HPMG_SYNCTHREADS; @@ -1678,8 +1698,8 @@ MultiGrid::average_down_acoef () #endif for (int ilev = 1; ilev <= m_single_block_level_begin; ++ilev) { - auto const& crse = m_acf[ilev].array(); - auto const& fine = m_acf[ilev-1].const_array(); + const Array3& crse = m_acf[ilev].array(); + const Array3& fine = m_acf[ilev-1].const_array(); if (m_domain[ilev].cellCentered()) { hpmg::ParallelFor(to2D(m_domain[ilev]), m_num_comps_acf, [=] AMREX_GPU_DEVICE (int i, int j, int n) noexcept @@ -1698,24 +1718,26 @@ MultiGrid::average_down_acoef () #if defined(AMREX_USE_GPU) if (m_num_single_block_levels > 1) { if (m_domain[0].cellCentered()) { - avgdown_acf(m_acf_a, m_num_comps_acf, m_num_single_block_levels, + avgdown_acf(m_acf_a, m_lev_domain.data(), m_num_comps_acf, m_num_single_block_levels, [] AMREX_GPU_DEVICE (int i, int j, int n, - amrex::Array4 const& crse, - amrex::Array4 const& fine) noexcept + Array3 const& crse, + Array3 const& fine, + amrex::BoxND<2> const&) noexcept { restrict_cc(i,j,n,crse,fine); }); } else { - avgdown_acf(m_acf_a, m_num_comps_acf, m_num_single_block_levels, + avgdown_acf(m_acf_a, m_lev_domain.data(), m_num_comps_acf, m_num_single_block_levels, [] AMREX_GPU_DEVICE (int i, int j, int n, - amrex::Array4 const& crse, - amrex::Array4 const& fine) noexcept + Array3 const& crse, + Array3 const& fine, + amrex::BoxND<2> const& bx) noexcept { - if (i == crse.begin.x || - j == crse.begin.y || - i == crse.end.x-1 || - j == crse.end.y-1) { - crse(i,j,0,n) = amrex::Real(0.); + if (i == bx.smallEnd(0) || + j == bx.smallEnd(1) || + i == bx.bigEnd(0) || + j == bx.bigEnd(1)) { + crse(i,j,n) = amrex::Real(0.); } else { restrict_nd(i,j,n,crse,fine); } From 1649206c999926601416ab728f83f53b00a1fc5f Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Fri, 27 Feb 2026 08:45:24 +0100 Subject: [PATCH 3/5] siampp26 --- src/mg_solver/HpMultiGrid.cpp | 4 ++-- src/utils/MultiBuffer.cpp | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/mg_solver/HpMultiGrid.cpp b/src/mg_solver/HpMultiGrid.cpp index d81f3efd98..cd8d2cbcbf 100644 --- a/src/mg_solver/HpMultiGrid.cpp +++ b/src/mg_solver/HpMultiGrid.cpp @@ -960,8 +960,8 @@ void bottomsolve_gpu (amrex::Real dx0, amrex::Real dy0, Array3 cons if (system_type == 1) { restrict_nd(i,j,n,res[ilev+1],rescor[ilev]); } else if (system_type == 2) { - restrict_cc(i,j,0,res[ilev+1],rescor[ilev]); - restrict_cc(i,j,1,res[ilev+1],rescor[ilev]); + restrict_nd(i,j,0,res[ilev+1],rescor[ilev]); + restrict_nd(i,j,1,res[ilev+1],rescor[ilev]); } else { restrict_nd(i,j,0,res[ilev+1],rescor[ilev]); } diff --git a/src/utils/MultiBuffer.cpp b/src/utils/MultiBuffer.cpp index eaae5addcf..32cf9a1055 100644 --- a/src/utils/MultiBuffer.cpp +++ b/src/utils/MultiBuffer.cpp @@ -574,6 +574,18 @@ void MultiBuffer::get_data (int slice, MultiBeam& beams, MultiLaser& laser, int } m_datanodes[slice].m_progress = comm_progress::in_use; m_datanodes[slice].m_metadata_progress = comm_progress::in_use; + + std::stringstream ss; + + ss << "STATEDATA " << amrex::second() << " " << amrex::ParallelDescriptor::MyProc() << " "; + for (int i = m_datanodes.size()-1; i>=0; --i) { + if (i != m_datanodes.size()-1) { + ss << " "; + } + ss << m_datanodes[i].m_progress; + } + ss << "\n"; + amrex::AllPrint() << ss.str(); } void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int beam_slice, @@ -620,6 +632,18 @@ void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int } } + std::stringstream ss; + + ss << "STATEDATA " << amrex::second() << " " << amrex::ParallelDescriptor::MyProc() << " "; + for (int i = m_datanodes.size()-1; i>=0; --i) { + if (i != m_datanodes.size()-1) { + ss << " "; + } + ss << m_datanodes[i].m_progress; + } + ss << "\n"; + amrex::AllPrint() << ss.str(); + async_progress(slice); } From c6c6fe751328a599b4bbaa5ddd102f62f00b3033 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Fri, 27 Feb 2026 08:49:13 +0100 Subject: [PATCH 4/5] revert --- src/utils/MultiBuffer.cpp | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/src/utils/MultiBuffer.cpp b/src/utils/MultiBuffer.cpp index 32cf9a1055..585d9cd649 100644 --- a/src/utils/MultiBuffer.cpp +++ b/src/utils/MultiBuffer.cpp @@ -576,16 +576,6 @@ void MultiBuffer::get_data (int slice, MultiBeam& beams, MultiLaser& laser, int m_datanodes[slice].m_metadata_progress = comm_progress::in_use; std::stringstream ss; - - ss << "STATEDATA " << amrex::second() << " " << amrex::ParallelDescriptor::MyProc() << " "; - for (int i = m_datanodes.size()-1; i>=0; --i) { - if (i != m_datanodes.size()-1) { - ss << " "; - } - ss << m_datanodes[i].m_progress; - } - ss << "\n"; - amrex::AllPrint() << ss.str(); } void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int beam_slice, @@ -632,18 +622,6 @@ void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int } } - std::stringstream ss; - - ss << "STATEDATA " << amrex::second() << " " << amrex::ParallelDescriptor::MyProc() << " "; - for (int i = m_datanodes.size()-1; i>=0; --i) { - if (i != m_datanodes.size()-1) { - ss << " "; - } - ss << m_datanodes[i].m_progress; - } - ss << "\n"; - amrex::AllPrint() << ss.str(); - async_progress(slice); } From 6566a7af0c8ec95287d0a7e5ecd15f72c898230b Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Fri, 27 Feb 2026 08:50:36 +0100 Subject: [PATCH 5/5] revert 2 --- src/utils/MultiBuffer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/utils/MultiBuffer.cpp b/src/utils/MultiBuffer.cpp index 585d9cd649..eaae5addcf 100644 --- a/src/utils/MultiBuffer.cpp +++ b/src/utils/MultiBuffer.cpp @@ -574,8 +574,6 @@ void MultiBuffer::get_data (int slice, MultiBeam& beams, MultiLaser& laser, int } m_datanodes[slice].m_progress = comm_progress::in_use; m_datanodes[slice].m_metadata_progress = comm_progress::in_use; - - std::stringstream ss; } void MultiBuffer::put_data (int slice, MultiBeam& beams, MultiLaser& laser, int beam_slice,