diff --git a/CMakeLists_files.cmake b/CMakeLists_files.cmake index ad147010f22..60f7dac3269 100644 --- a/CMakeLists_files.cmake +++ b/CMakeLists_files.cmake @@ -1299,6 +1299,9 @@ if (HAVE_AVX2_EXTENSION) opm/simulators/linalg/mixed/prec.h opm/simulators/linalg/mixed/vec.h opm/simulators/linalg/mixed/wrapper.hpp + opm/simulators/linalg/mixed/MatrixWrapper.hpp + opm/simulators/linalg/mixed/PreconditionerWrapper.hpp + opm/simulators/linalg/mixed/SolverAdapter.hpp ) endif() diff --git a/opm/simulators/linalg/FlexibleSolver_impl.hpp b/opm/simulators/linalg/FlexibleSolver_impl.hpp index f37fd2a2111..51a6cdc3238 100644 --- a/opm/simulators/linalg/FlexibleSolver_impl.hpp +++ b/opm/simulators/linalg/FlexibleSolver_impl.hpp @@ -34,6 +34,7 @@ #if HAVE_AVX2_EXTENSION #include +#include #endif #include @@ -193,7 +194,7 @@ namespace Dune OPM_THROW(std::invalid_argument, "mixed-bicgstab solver not supported for single precision."); } else { const std::string prec_type = prm.get("preconditioner.type", "error"); - bool use_mixed_dilu= (prec_type=="mixed-dilu"); + bool use_mixed_dilu= (prec_type=="legacy-mixed-dilu"); using MatrixType = decltype(linearoperator_for_solver_->getmat()); linsolver_ = std::make_shared>( linearoperator_for_solver_->getmat(), @@ -202,6 +203,15 @@ namespace Dune use_mixed_dilu ); } + // mixed-solver adapter starts here + } else if (solver_type == "mixed-precision") { + linsolver_ = std::make_shared>(linearoperator_for_solver_, + scalarproduct_, + preconditioner_, + tol, // desired residual reduction factor + maxiter, // maximum number of iterations + verbosity, + comm); #endif } else if (solver_type == "loopsolver") { linsolver_ = std::make_shared>(*linearoperator_for_solver_, diff --git a/opm/simulators/linalg/StandardPreconditioners_mpi.hpp b/opm/simulators/linalg/StandardPreconditioners_mpi.hpp index b19afb13fd8..5c6e636db77 100644 --- a/opm/simulators/linalg/StandardPreconditioners_mpi.hpp +++ b/opm/simulators/linalg/StandardPreconditioners_mpi.hpp @@ -33,6 +33,8 @@ #include #include +#include + namespace Opm { @@ -163,6 +165,14 @@ struct StandardPreconditioners DUNE_UNUSED_PARAMETER(prm); return wrapBlockPreconditioner>(comm, op.getmat()); }); + F::addCreator("mixed-ilu0", [](const O& op, const P& prm, const std::function&, std::size_t, const C& comm) { + DUNE_UNUSED_PARAMETER(prm); + return wrapBlockPreconditioner>(comm, op.getmat()); + }); + F::addCreator("mixed-dilu", [](const O& op, const P& prm, const std::function&, std::size_t, const C& comm) { + DUNE_UNUSED_PARAMETER(prm); + return wrapBlockPreconditioner>(comm, op.getmat(), true); + }); F::addCreator("jac", [](const O& op, const P& prm, const std::function&, std::size_t, const C& comm) { const int n = prm.get("repeats", 1); const double w = prm.get("relaxation", 1.0); diff --git a/opm/simulators/linalg/StandardPreconditioners_serial.hpp b/opm/simulators/linalg/StandardPreconditioners_serial.hpp index 23971a75614..2e390a4e5ed 100644 --- a/opm/simulators/linalg/StandardPreconditioners_serial.hpp +++ b/opm/simulators/linalg/StandardPreconditioners_serial.hpp @@ -33,6 +33,10 @@ #include #include + +#include + + namespace Opm { template @@ -89,11 +93,19 @@ struct StandardPreconditioners>(op.getmat()); }); F::addCreator("mixed-ilu0", [](const O& op, const P& prm, const std::function&, std::size_t) { + DUNE_UNUSED_PARAMETER(prm); + return std::make_shared>(op.getmat()); + }); + F::addCreator("mixed-dilu", [](const O& op, const P& prm, const std::function&, std::size_t) { + DUNE_UNUSED_PARAMETER(prm); + return std::make_shared>(op.getmat(),true); + }); + F::addCreator("legacy-mixed-ilu0", [](const O& op, const P& prm, const std::function&, std::size_t) { DUNE_UNUSED_PARAMETER(prm); DUNE_UNUSED_PARAMETER(op); return std::make_shared>(); }); - F::addCreator("mixed-dilu", [](const O& op, const P& prm, const std::function&, std::size_t) { + F::addCreator("legacy-mixed-dilu", [](const O& op, const P& prm, const std::function&, std::size_t) { DUNE_UNUSED_PARAMETER(prm); DUNE_UNUSED_PARAMETER(op); return std::make_shared>(); diff --git a/opm/simulators/linalg/mixed/MatrixWrapper.hpp b/opm/simulators/linalg/mixed/MatrixWrapper.hpp new file mode 100644 index 00000000000..10e273a5bd5 --- /dev/null +++ b/opm/simulators/linalg/mixed/MatrixWrapper.hpp @@ -0,0 +1,85 @@ +#ifndef OPM_MIXED_MATRIX_HEADER_INCLUDED +#define OPM_MIXED_MATRIX_HEADER_INCLUDED + + +#include + + +//! @brief Wraps c-implementation of mixed-precision matrix +//! +//! @tparam Vector the block-vector used by linear operator +//! @tparam b block size +template +class MatrixWrapper +{ + public: + virtual void mv(const Vector& x, Vector& y) const; + virtual void umv(const Vector& x, Vector& y) const; + virtual void usmv(double alpha, const Vector& x, Vector& y) const; + + //! @brief constructor + //! + //! @param nrows number of block rows + //! @param nnz number of nonzero blocks + MatrixWrapper(int nrows, int nnz) + { + nnz_=nnz; + M_ = bsr_alloc(); + bsr_init(M_, nrows, nnz, b); + } + + //! @brief destructor + ~MatrixWrapper() {bsr_free(M_);} + + void update(double const *data); + + int *rowptr(){return M_->rowptr;} + int *colidx(){return M_->colidx;} + + private: + int nnz_; + bsr_matrix *M_; +}; + +template +void MatrixWrapper:: +mv(const Vector& x, Vector& y) const +{ + // mixed-precision block spmv (y = M.x) + bsr_vmspmv3(M_, &x[0][0], &y[0][0]); +} + +template +void MatrixWrapper:: +umv(const Vector& x, Vector& y) const +{ + // mixed-precision block spmv with update (y += M.x) + bsr_vmspumv3(M_, &x[0][0], &y[0][0], 1.0); +} + +template +void MatrixWrapper:: +usmv(double alpha, const Vector& x, Vector& y) const +{ + // scaled mixed-precision block spmv with update (y += alpha * M.x) + bsr_vmspumv3(M_, &x[0][0], &y[0][0], alpha); +} + +template +void MatrixWrapper:: +update(double const *data) +{ + // transpose each dense block to make them column-major + int bb=b*b; + double B[bb]; + for(int k=0;kdbl[bb*k + i] = B[i]; + } + + // downcast to single precision + bsr_downcast(M_); +} + +#endif // OPM_MIXED_MATRIX_HEADER_INCLUDED diff --git a/opm/simulators/linalg/mixed/PreconditionerWrapper.hpp b/opm/simulators/linalg/mixed/PreconditionerWrapper.hpp new file mode 100644 index 00000000000..2f91f2bc1b1 --- /dev/null +++ b/opm/simulators/linalg/mixed/PreconditionerWrapper.hpp @@ -0,0 +1,115 @@ +#ifndef OPM_MIXED_PREC_HEADER_INCLUDED +#define OPM_MIXED_PREC_HEADER_INCLUDED + +#include + +namespace Opm { + +//! @brief Wraps c-implementation of mixed-precision preconditioner +//! +//! @tparam Vector the block-vector used by linear operator +//! @tparam M block-sparse matrix type +//! @tparam X block-vector input type +//! @tparam Y block vector output type +template +class MixedPreconditioner : public Dune::PreconditionerWithUpdate +{ + public: + + using domain_type = X; + static constexpr auto block_size = domain_type::block_type::dimension; + + //! @brief constructor + //! + //! @param A block-sparse matrix + //! @param use_dilu toggle between dilu or ilu0 factorization + MixedPreconditioner(const M& A, bool use_dilu = false) + { + // Access double precision matrix data + int nrows = A.N(); + int nnz = A.nonzeroes(); + double_data_ = &A[0][0][0][0]; + prec_ = prec_alloc(); + + // allocate and initialize mixed-precision matrix + mixed_matrix_ = bsr_alloc(); + bsr_init(mixed_matrix_, nrows, nnz, block_size); + + // copy sparsity pattern from double preccision matrix + int *rows = mixed_matrix_->rowptr; + int *cols = mixed_matrix_->colidx; + + int irow = 0; + int icol = 0; + rows[0] = 0; + for(auto row=A.begin(); row!=A.end(); row++) + { + for(unsigned int i=0; igetsize(); i++) + { + cols[icol++] = row->getindexptr()[i]; + } + rows[irow+1] = rows[irow]+row->getsize(); + irow++; + } + + // allocate and initialize preconditioner + prec_init(prec_,mixed_matrix_); + + // attribute initialization + nnz_ = nnz; + use_dilu_ = use_dilu; + + // perform matrix factorization + update(); + }; + + //! @brief destructor + ~MixedPreconditioner() + { + bsr_free(mixed_matrix_); + prec_free(prec_); + } + + virtual void update() override; + virtual bool hasPerfectUpdate() const override {return true;} + virtual void pre ([[maybe_unused]] X& x, [[maybe_unused]] Y& y) override {}; + virtual void post ([[maybe_unused]] X& x) override {}; + virtual void apply ([[maybe_unused]] X& x, [[maybe_unused]] const Y& y) override; + virtual Dune::SolverCategory::Category category() const override { return Dune::SolverCategory::sequential; }; + + private: + bool use_dilu_; + double const *double_data_; + bsr_matrix *mixed_matrix_; + prec_t *prec_; + int nnz_; +}; + +template +void MixedPreconditioner:: +update () +{ + // transpose each dense block to make them column-major + int b = block_size; + int bb=b*b; + double B[bb]; + for(int k=0;kdbl[bb*k + i] = B[i]; + } + + use_dilu_ ? prec_dilu_factorize(prec_, mixed_matrix_) : prec_ilu0_factorize(prec_, mixed_matrix_); // choose dilu or ilu0 + prec_downcast(prec_); +} + +template +void MixedPreconditioner:: +apply ([[maybe_unused]] X& x, [[maybe_unused]] const Y& y) +{ + x=y; + prec_mapply3c(prec_,&x[0][0]); +} + +} +#endif // OPM_MIXED_PREC_HEADER_INCLUDED diff --git a/opm/simulators/linalg/mixed/README.md b/opm/simulators/linalg/mixed/README.md index 1e27c55ea8d..44ae8572bb1 100644 --- a/opm/simulators/linalg/mixed/README.md +++ b/opm/simulators/linalg/mixed/README.md @@ -4,16 +4,18 @@ and a highly optimized mixed-precision implementation of ILU0 and DILU precondit bicgstab. Hopefully, this will inspire the exploration of mixed-precision algorithms in OPM. -The initial implementation is specialized for 3x3 block-sparse matrices due to their -importance in reservoir simulation and restricted to serial runs. Extending the work -to parallel runs is a matter of extending OPM's parallel infrastructure and should be -relatively straight-forward. +The initial implementations are specialized for 3x3 block-sparse matrices due to their +importance in reservoir simulation. The original implementation only works in serial. +The parallel implementation wraps the optimized mixed-precision matrix-vector multiplication +and ILU0/DILU preconditioners into building blocks for the ISTL-based implementation of +bicgstab. This sacrifices some performance for improved modularity. Extending the work to +block-sparse matrices of arbitrary block size is work in progress. The mixed-precision solver is selected by the command-line options `--linear-solver=mixed-ilu0` or `--linear-solver=mixed-dilu`. The command-line option `--matrix-add-well-contributions=true` must also be set as the mixed-precision solver operates directly on block-sparse matrices, not -on linear operators as other OPM solvers do. For convenience, a wrapper similar to the one below -can be used. +on linear operators as other OPM solvers do. For convenience, a wrapper similar to the one +below can be used. ``` bash OMP_NUM_THREADS=1 mpirun -np 1 --map-by numa --bind-to core build/bin/flow \ @@ -23,3 +25,6 @@ OMP_NUM_THREADS=1 mpirun -np 1 --map-by numa --bind-to core build/bin/flow \ --linear-solver-max-iter=1024 \ $@ ``` + +To invoke the original serial implementation add the `legacy-` prefix to the mixed-precision +linear solver options. diff --git a/opm/simulators/linalg/mixed/SolverAdapter.hpp b/opm/simulators/linalg/mixed/SolverAdapter.hpp new file mode 100644 index 00000000000..0b4380f3d93 --- /dev/null +++ b/opm/simulators/linalg/mixed/SolverAdapter.hpp @@ -0,0 +1,209 @@ +#ifndef OPM_MIXED_ADAPTER_HEADER_INCLUDED +#define OPM_MIXED_ADAPTER_HEADER_INCLUDED + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + + + +namespace Dune +{ +//#include + + +//! @brief Optimized sequential scalar product. +//! +//! @tparam Vector block-vector class with data stored as contiguous double array +template +class SeqOptmizedProduct : public Dune::SeqScalarProduct +{ +public: + + // extract block size + static constexpr auto block_size = Vector::block_type::dimension; + + // Compute the dot product + virtual double dot(const Vector& vx, const Vector& vy) const override + { + // access underlying data + double const *x = &vx[0][0]; + double const *y = &vy[0][0]; + + // total array length + int NN = block_size*vx.N(); + + // unroll loop in multiples of 8 + int n=NN/8; + int N=8*n; + double agg[8]; + for(int i=0;i<8;i++) agg[i]=0.0; + for(int i=0;idot(x, x)); + } +}; + +//! @brief Generalized mixed precision operator interface +//! +//! @tparam Matrix the block-matrix used by linear operator +//! @tparam Vector the block-vector used by linear operator +//! @tparam Comm the communicator used by linear operator +template +struct MixedOperator +{ + using type = Dune::OverlappingSchwarzOperator; +}; + +//! @brief Generalized mixed precision operator interface +//! +//! @tparam Matrix the block-matrix used by linear operator +//! @tparam Vector the block-vector used by linear operator +template +struct MixedOperator +{ + using type = Dune::MatrixAdapter; +}; + + +//! @brief Wraps mixed precision +//! +//! @tparam Comm the communicator passed to FlexibleLinearSolver +//! @tparam Operator the linear operator passed to FlexibleLinearSolver +//! @tparam Vector the block vector passed to FlexibleLinearSolver +template +class MixedAdapter:public InverseOperator +{ + public: + + using AbstractPrecondType = Dune::PreconditionerWithUpdate; + using AbstractScalarProductType = Dune::ScalarProduct; + + static constexpr auto block_size = Vector::block_type::dimension; + using MixedMatrixType = MatrixWrapper; + using MixedOperatorType = MixedOperator::type; + using OptimizedProductType = SeqOptmizedProduct; + + //! @brief constructor + //! + //! @param op the linear operator (assumed double precision) + //! @param sp the scalar product + //! @param prec the preconditioner to use + //! @param reduction the reduction factor passed to the iterative solver + //! @param maxit maximum number of iterations for the linear solver + //! @param verbose verbosity level + //! @param comm the communication object. + MixedAdapter(Operator *op, + std::shared_ptr sp, + std::shared_ptr prec, + const double& tol, + const int& maxiter, + const int& verbosity, + const Comm &comm) + { + + // Access matrix data from double precision operator + auto &A = op->getmat(); + int nrows = A.N(); + int nnz = A.nonzeroes(); + double_data_ = &A[0][0][0][0]; + + //allocate mixed matrix + mixed_matrix_ = std::make_shared(nrows,nnz); + //auto &B = *mixed_matrix_; + + // copy sparsity pattern from double precision matrix + int *rows = mixed_matrix_->rowptr(); + int *cols = mixed_matrix_->colidx(); + + int irow = 0; + int icol = 0; + rows[0] = 0; + for(auto row=A.begin(); row!=A.end(); row++) + { + for(unsigned int i=0; igetsize(); i++) + { + cols[icol++] = row->getindexptr()[i]; + } + rows[irow+1] = rows[irow]+row->getsize(); + irow++; + } + + //initialize mixed operator and optimized scalar product + double_operator_ = op; + if constexpr (std::is_same_v) + { + mixed_operator_ = std::make_shared(*mixed_matrix_); + scalar_product_ = std::make_shared(); + } + else + { + mixed_operator_ = std::make_shared(*mixed_matrix_,comm); + scalar_product_ = sp; + } + + //initialize bicgstab solver from Dune + solver_ = std::make_shared>( + *mixed_operator_, + *scalar_product_, + *prec, + tol, // desired residual reduction factor + maxiter, // maximum number of iterations + verbosity); + } + + virtual void apply(Vector &x, Vector &b, InverseOperatorResult &res) override + { + //transpose dense blocks and demote to single precision + mixed_matrix_->update(double_data_); + + //apply bicgstab solver from Dune + solver_->apply(x,b,res); + } + + virtual void apply(Vector &x, Vector &b, double reduction, InverseOperatorResult &res) override + { + x=0; + b=0; + res.reduction = reduction; + OPM_THROW(std::invalid_argument, "MixedAdapter::apply(...) not implemented yet."); + } + + virtual Dune::SolverCategory::Category category() const override{return Dune::SolverCategory::overlapping;}; + + private: + using AbstractSolverType = Dune::InverseOperator; + + + Operator *double_operator_; + std::shared_ptr solver_; + std::shared_ptr mixed_operator_; + std::shared_ptr mixed_matrix_; + std::shared_ptr scalar_product_; + double const *double_data_; + +}; + +} + +#endif // OPM_MIXED_ADAPTER_HEADER_INCLUDED diff --git a/opm/simulators/linalg/mixed/bsr.c b/opm/simulators/linalg/mixed/bsr.c index ead89998575..c7de0874add 100644 --- a/opm/simulators/linalg/mixed/bsr.c +++ b/opm/simulators/linalg/mixed/bsr.c @@ -107,6 +107,46 @@ void bsr_vmspmv3(bsr_matrix *A, const double *x, double *y) } } +void bsr_vmspumv3(bsr_matrix *A, const double *x, double *y, double alpha) +{ + int nrows = A->nrows; + int *rowptr=A->rowptr; + int *colidx=A->colidx; + const float *data=A->flt; + + const int b=3; + + __m256d valpha = _mm256_set1_pd(alpha); + + __m256d mm_zeros =_mm256_setzero_pd(); + for(int i=0;inrows; diff --git a/opm/simulators/linalg/mixed/bsr.h b/opm/simulators/linalg/mixed/bsr.h index 11e72787eaf..f093ab5f557 100644 --- a/opm/simulators/linalg/mixed/bsr.h +++ b/opm/simulators/linalg/mixed/bsr.h @@ -64,6 +64,19 @@ void bsr_init(bsr_matrix *A, int nrows, int nnz, int b); * @param x Pointer to input vector. * @param y Pointer to output vector. */ +void bsr_vmspumv3(bsr_matrix *A, const double *x, double *y, double alpha); + +/** + * @brief Sparse matrix-vector multiplication in mixed precision. + * + * @note Function is specialized for 3x3 block-sparse matrices. + * @note Function uses AVX2 intrinsics. + * + * @param A Pointer to bsr matrix. + * @param x Pointer to input vector. + * @param y Pointer to output vector. + */ + void bsr_vmspmv3(bsr_matrix *A, const double *x, double *y); /** diff --git a/opm/simulators/linalg/setupPropertyTree.cpp b/opm/simulators/linalg/setupPropertyTree.cpp index 6833ca20aac..0af2e084501 100644 --- a/opm/simulators/linalg/setupPropertyTree.cpp +++ b/opm/simulators/linalg/setupPropertyTree.cpp @@ -248,6 +248,16 @@ setupPropertyTree(FlowLinearSolverParameters p, // Note: copying the parameters return setupMixedDILU(conf, p); } + // mixed-precision ILU0 + if (conf == "legacy-mixed-ilu0") { + return setupLegacyMixedILU(conf, p); + } + + // mixed-precision ILU0 + if (conf == "legacy-mixed-dilu") { + return setupLegacyMixedDILU(conf, p); + } + if (conf == "dilu") { return setupDILU(conf, p); } @@ -416,7 +426,7 @@ setupMixedILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverPa prm.put("tol", p.linear_solver_reduction_); prm.put("maxiter", p.linear_solver_maxiter_); prm.put("verbosity", p.linear_solver_verbosity_); - prm.put("solver", "mixed-bicgstab"s); + prm.put("solver", "mixed-precision"s); prm.put("preconditioner.type", "mixed-ilu0"s); return prm; } @@ -429,12 +439,39 @@ setupMixedDILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverP prm.put("tol", p.linear_solver_reduction_); prm.put("maxiter", p.linear_solver_maxiter_); prm.put("verbosity", p.linear_solver_verbosity_); - prm.put("solver", "mixed-bicgstab"s); + prm.put("solver", "mixed-precision"s); prm.put("preconditioner.type", "mixed-dilu"s); return prm; } +PropertyTree +setupLegacyMixedILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p) +{ + using namespace std::string_literals; + PropertyTree prm; + prm.put("tol", p.linear_solver_reduction_); + prm.put("maxiter", p.linear_solver_maxiter_); + prm.put("verbosity", p.linear_solver_verbosity_); + prm.put("solver", "mixed-bicgstab"s); + prm.put("preconditioner.type", "legacy-mixed-ilu0"s); + return prm; +} + +PropertyTree +setupLegacyMixedDILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p) +{ + using namespace std::string_literals; + PropertyTree prm; + prm.put("tol", p.linear_solver_reduction_); + prm.put("maxiter", p.linear_solver_maxiter_); + prm.put("verbosity", p.linear_solver_verbosity_); + prm.put("solver", "mixed-bicgstab"s); + prm.put("preconditioner.type", "legacy-mixed-dilu"s); + return prm; +} + + PropertyTree setupDILU([[maybe_unused]] const std::string& conf, const FlowLinearSolverParameters& p) { diff --git a/opm/simulators/linalg/setupPropertyTree.hpp b/opm/simulators/linalg/setupPropertyTree.hpp index a424a573259..0c5b206ce3e 100644 --- a/opm/simulators/linalg/setupPropertyTree.hpp +++ b/opm/simulators/linalg/setupPropertyTree.hpp @@ -38,8 +38,10 @@ PropertyTree setupCPRW(const std::string& conf, const FlowLinearSolverParameters PropertyTree setupCPR(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupAMG(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupILU(const std::string& conf, const FlowLinearSolverParameters& p); +PropertyTree setupLegacyMixedILU(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupMixedILU(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupDILU(const std::string& conf, const FlowLinearSolverParameters& p); +PropertyTree setupLegacyMixedDILU(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupMixedDILU(const std::string& conf, const FlowLinearSolverParameters& p); PropertyTree setupUMFPack(const std::string& conf, const FlowLinearSolverParameters& p);